Files
llvm-project/.github/workflows/prune-unused-branches.py
Aiden Grossman 796e7ebbe5 [Github] Turn on prune-unused-branches workflow (#182985)
This patch turns on the prune-unused-branches workflow for everyone
rather than just my user branches now that all feedback from the
discourse thread has been addressed.
2026-02-24 11:14:54 -08:00

218 lines
7.8 KiB
Python

import subprocess
import sys
import os
import logging
import zipfile
import io
import urllib.request
import github
def get_branches() -> list[str]:
git_process = subprocess.run(
["git", "branch", "--all"], stdout=subprocess.PIPE, check=True
)
branches = [
branch.strip() for branch in git_process.stdout.decode("utf-8").split("\n")
]
def branch_filter(branch_name):
user_or_revert = "users/" in branch_name or "revert-" in branch_name
origin_branch = branch_name.startswith("remotes/origin/")
return user_or_revert and origin_branch
filtered_branches = list(filter(branch_filter, branches))
return [branch.replace("remotes/origin/", "") for branch in filtered_branches]
def query_prs(github_token, extra_query_criteria) -> list[str]:
gh = github.Github(auth=github.Auth.Token(github_token))
query_template = """
query ($after: String) {{
search(query: "is:pr repo:llvm/llvm-project is:open {query_param}", type: ISSUE, first: 100, after: $after) {{
nodes {{
... on PullRequest {{
baseRefName
headRefName
isCrossRepository
number
}}
}}
pageInfo {{
hasNextPage
endCursor
}}
}}
}}"""
query = query_template.format(query_param=extra_query_criteria)
pr_data = []
has_next_page = True
variables = {"after": None}
while has_next_page:
_, res_data = gh._Github__requester.graphql_query(query, variables=variables)
page_info = res_data["data"]["search"]["pageInfo"]
has_next_page = page_info["hasNextPage"]
if has_next_page:
variables["after"] = page_info["endCursor"]
prs = res_data["data"]["search"]["nodes"]
pr_data.extend(prs)
print(f"Processed {len(prs)} PRs")
return pr_data
def get_branches_from_open_prs(github_token) -> list[str]:
pr_data = []
# We will only ever be looking to delete branches with a `users/` prefix or
# a revert- prefix as a contract with get_branches.
pr_data.extend(query_prs(github_token, "head:users/"))
pr_data.extend(query_prs(github_token, "head:revert-"))
# We need to explicitly check cases where the base is a user branch to
# ensure we capture branches that are used as a diff base for cross-repo
# PRs. Revert branches should never be the base branch of a PR.
pr_data.extend(query_prs(github_token, "base:users/"))
user_branches = []
for pr in pr_data:
if not pr["isCrossRepository"]:
if pr["baseRefName"] != "main":
user_branches.append(pr["baseRefName"])
user_branches.append(pr["headRefName"])
else:
# We want to skip cross-repo PRs where someone has simply used a
# users/ branch naming scheme for a branch in their fork.
if pr["baseRefName"] == "main":
continue
user_branches.append(pr["baseRefName"])
# Convert to a set to ensure we have no duplicates.
return list(set(user_branches))
def get_user_branches_to_remove(
user_branches: list[str],
user_branches_from_prs: list[str],
previous_run_branches: list[str],
) -> list[str]:
user_branches_to_remove = set(user_branches)
for pr_user_branch in set(user_branches_from_prs):
if pr_user_branch not in user_branches_to_remove:
logging.warning(
f"Found branch {pr_user_branch} attached to a PR, but it "
"was not found in the repository. This is likely because "
"the PR was created after this workflow cloned the repository."
)
continue
user_branches_to_remove.remove(pr_user_branch)
for branch in list(user_branches_to_remove):
if branch not in previous_run_branches:
user_branches_to_remove.remove(branch)
return list(user_branches_to_remove)
def generate_patch_for_branch(branch_name: str) -> bytes:
command_vector = [
"git",
"format-patch",
"--stdout",
"-k",
f"origin/main..origin/{branch_name}",
]
try:
result = subprocess.run(
command_vector, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
)
except subprocess.CalledProcessError as process_error:
print(process_error.stderr)
print(process_error.stdout)
raise process_error
return result.stdout
def generate_patches_for_all_branches(branches_to_remove: list[str], patches_path: str):
for index, branch in enumerate(branches_to_remove):
patch = generate_patch_for_branch(branch)
patch_filename = branch.replace("/", "-") + ".patch"
patch_path = os.path.join(patches_path, patch_filename)
with open(patch_path, "wb") as patches_file_handle:
patches_file_handle.write(patch)
if index % 50 == 0:
print(
f"Finished generating patches for {index}/{len(branches_to_remove)} branches."
)
def delete_branches(branches_to_remove: list[str]):
for branch in branches_to_remove:
command_vector = ["git", "push", "-d", "origin", branch]
try:
subprocess.run(
command_vector,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
)
except subprocess.CalledProcessError as process_error:
print(process_error.stderr)
print(process_error.stdout)
print(f"Deleted branch {branch}")
def get_branches_found_in_previous_run(github_token: str) -> list[str]:
gh = github.Github(auth=github.Auth.Token(github_token))
repo = gh.get_repo("llvm/llvm-project")
workflow_run = None
for workflow_run in iter(
repo.get_workflow("prune-branches.yml").get_runs(branch="main")
):
if workflow_run.status == "completed":
break
assert workflow_run
workflow_artifact = None
for workflow_artifact in iter(workflow_run.get_artifacts()):
if workflow_artifact.name == "BranchList":
break
assert workflow_artifact
status, headers, response = workflow_artifact._requester.requestJson(
"GET", workflow_artifact.archive_download_url
)
# Github will always send a redirect to where the file actuall lives.
assert status == 302
with urllib.request.urlopen(headers["location"]) as response:
raw_bytes = response.read()
with zipfile.ZipFile(io.BytesIO(raw_bytes)) as zip_file:
branch_names = zip_file.read("branches.txt").decode("utf-8").split("\n")[:-1]
return branch_names
def main(github_token):
if len(sys.argv) != 2:
print(
"Invalid invocation. Correct usage: python3 prune-unused-branches.py <output diectory>"
)
sys.exit(1)
previous_run_branches = get_branches_found_in_previous_run(github_token)
print(
f"{len(previous_run_branches)} branches existed the last time the workflow ran."
)
user_branches = get_branches()
output_dir = sys.argv[1]
with open(os.path.join(output_dir, "branches.txt"), "w") as branches_file:
branches_file.writelines([user_branch + "\n" for user_branch in user_branches])
user_branches_from_prs = get_branches_from_open_prs(github_token)
print(f"Found {len(user_branches)} user branches in the repository")
print(f"Found {len(user_branches_from_prs)} user branches associated with PRs")
user_branches_to_remove = get_user_branches_to_remove(
user_branches, user_branches_from_prs, previous_run_branches
)
print(f"Deleting {len(user_branches_to_remove)} user branches.")
generate_patches_for_all_branches(
user_branches_to_remove, os.path.join(output_dir, "patches")
)
delete_branches(user_branches_to_remove)
if __name__ == "__main__":
main(os.environ["GITHUB_TOKEN"])