astral/scripts/ensure_rolling_pr.py
Last commit: 2c41eaca44 "Sync from dev @ 497baf0" by Tomas Kracmar, 2026-04-21 22:21:43 +02:00
Source: main (497baf0)
Excluded: live tenant exports, generated artifacts, and dev-only tooling.


#!/usr/bin/env python3
"""Create/update the rolling drift PR and optionally queue remediation after rejection."""
from __future__ import annotations

import argparse
import hashlib
import os
import subprocess
import sys
import urllib.parse
from pathlib import Path
from typing import Any

# common.py lives in the same directory; ensure it can be imported when the
# script is executed directly (e.g. `python ensure_rolling_pr.py`) rather
# than as part of a package.
_sys_path_inserted = False
if __file__:
    _script_dir = str(Path(__file__).resolve().parent)
    if _script_dir not in sys.path:
        sys.path.insert(0, _script_dir)
        _sys_path_inserted = True

import common

if _sys_path_inserted:
    # Undo the temporary sys.path entry so it cannot shadow later imports.
    sys.path.pop(0)

_env_text = common.env_text
_env_bool = common.env_bool
_normalize_exclude_csv = common.normalize_exclude_csv
_normalize_merge_strategy = common.normalize_merge_strategy
_request_json = common.request_json
_run_git = common.run_git


def _query_prs(
    repo_api: str,
    headers: dict[str, str],
    source_ref: str,
    target_ref: str,
    status: str,
) -> list[dict[str, Any]]:
    query = urllib.parse.urlencode(
        {
            "searchCriteria.status": status,
            "searchCriteria.sourceRefName": source_ref,
            "searchCriteria.targetRefName": target_ref,
            "api-version": "7.1",
        },
        quote_via=urllib.parse.quote,
        safe="/",
    )
    url = f"{repo_api}/pullrequests?{query}"
    payload = _request_json(url, headers=headers)
    return payload.get("value", []) if isinstance(payload, dict) else []
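
# Illustrative request this builds (hypothetical refs, host elided; slashes
# survive encoding because of safe="/"):
#   GET {repo_api}/pullrequests?searchCriteria.status=active
#       &searchCriteria.sourceRefName=refs/heads/drift/intune
#       &searchCriteria.targetRefName=refs/heads/main&api-version=7.1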


def _normalize_branch(branch: str) -> str:
    b = branch.strip()
    if b.startswith("refs/heads/"):
        return b[len("refs/heads/") :]
    return b
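
# Behavior sketch (hypothetical inputs):
#   _normalize_branch("refs/heads/drift/intune") -> "drift/intune"
#   _normalize_branch("  main ")                 -> "main"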


def _ref_from_branch(branch: str) -> str:
    return f"refs/heads/{_normalize_branch(branch)}"


def _pr_web_url(pr_payload: dict[str, Any]) -> str:
    pr_id = pr_payload.get("pullRequestId")
    return (
        pr_payload.get("url", "")
        .replace("_apis/git/repositories", "_git")
        .replace(f"/pullRequests/{pr_id}", f"/pullrequest/{pr_id}")
    )
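
# Turns the REST resource URL into the browser URL, e.g. (hypothetical ids):
#   .../_apis/git/repositories/<repoId>/pullRequests/42
#   -> .../_git/<repoId>/pullrequest/42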


def _current_tree_id(repo_root: str) -> str:
    return _run_git(repo_root, ["rev-parse", "HEAD^{tree}"])


def _tree_id_for_commitish(repo_root: str, commitish: str) -> str:
    return _run_git(repo_root, ["rev-parse", f"{commitish}^{{tree}}"])


def _ref_has_commit(repo_root: str, ref: str) -> bool:
    proc = subprocess.run(
        ["git", "rev-parse", "--verify", "--quiet", f"{ref}^{{commit}}"],
        cwd=repo_root,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    return proc.returncode == 0


def _commit_tree_id(repo_api: str, headers: dict[str, str], commit_id: str) -> str:
    url = f"{repo_api}/commits/{commit_id}?api-version=7.1"
    payload = _request_json(url, headers=headers)
    tree_id = payload.get("treeId", "") if isinstance(payload, dict) else ""
    return tree_id.strip()
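
# Relies on the Azure DevOps "Commits - Get" response carrying a treeId (the
# commit's root tree), which the snapshot matching below compares against.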


def _latest_pr_by_creation(prs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    # Newest first, so the most recently abandoned PR is considered first.
    return sorted(prs, key=lambda x: x.get("creationDate", ""), reverse=True)


def _normalize_repo_path(path: str) -> str:
    # str.lstrip("./") strips *characters*, not a prefix, so it would also eat
    # the leading dot of dotfile paths such as ".github/x"; strip only
    # explicit "./" prefixes and leading slashes.
    p = str(path or "").replace("\\", "/")
    while p.startswith("./"):
        p = p[2:]
    return p.lstrip("/")
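
# Examples (hypothetical paths):
#   _normalize_repo_path("./tenant-state\\intune\\p.json") -> "tenant-state/intune/p.json"
#   _normalize_repo_path("./.github/x") -> ".github/x"  (dotfile prefix preserved)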


def _is_doc_like(path: str) -> bool:
    lp = _normalize_repo_path(path).lower()
    if lp.endswith((".md", ".html", ".htm", ".pdf", ".csv", ".txt")):
        return True
    return "/docs/" in f"/{lp}" or "/object inventory/" in f"/{lp}"


def _is_report_like(path: str) -> bool:
    lp = _normalize_repo_path(path).lower()
    return "/reports/" in f"/{lp}" or "/assignment report/" in f"/{lp}"


def _is_workload_config_path(path: str, workload_dir: str, backup_folder: str, reports_subdir: str) -> bool:
    lp = _normalize_repo_path(path).lower()
    backup_norm = _normalize_repo_path(backup_folder).lower().strip("/")
    workload_norm = _normalize_repo_path(workload_dir).lower().strip("/")
    reports_norm = _normalize_repo_path(reports_subdir).lower().strip("/")
    if not backup_norm or not workload_norm:
        return False
    workload_prefix = f"{backup_norm}/{workload_norm}/"
    if not lp.startswith(workload_prefix):
        return False
    if reports_norm and lp.startswith(f"{backup_norm}/{reports_norm}/"):
        return False
    if _is_doc_like(lp) or _is_report_like(lp):
        return False
    return True
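
# Examples with the defaults from main() (BACKUP_FOLDER="tenant-state",
# workload_dir="intune", REPORTS_SUBDIR="reports"); paths are hypothetical:
#   "tenant-state/intune/policies/p1.json"  -> True
#   "tenant-state/intune/reports/x.json"    -> False (report-like segment)
#   "tenant-state/intune/README.md"         -> False (doc-like)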


def _config_fingerprint_from_local_tree(
    repo_root: str, commitish: str, workload_dir: str, backup_folder: str, reports_subdir: str
) -> str:
    backup_norm = _normalize_repo_path(backup_folder).strip("/")
    workload_norm = _normalize_repo_path(workload_dir).strip("/")
    path_prefix = f"{backup_norm}/{workload_norm}" if backup_norm and workload_norm else ""
    if not path_prefix:
        return ""
    try:
        out = _run_git(repo_root, ["ls-tree", "-r", "--full-tree", commitish, "--", path_prefix])
    except Exception:
        return ""
    pairs: list[str] = []
    for line in out.splitlines():
        if "\t" not in line:
            continue
        left, rel_path = line.split("\t", 1)
        parts = left.split()
        if len(parts) < 3 or parts[1] != "blob":
            continue
        blob_id = parts[2].strip()
        if not blob_id:
            continue
        if not _is_workload_config_path(rel_path, workload_dir, backup_folder, reports_subdir):
            continue
        pairs.append(f"{_normalize_repo_path(rel_path)}\t{blob_id}")
    if not pairs:
        return ""
    pairs.sort(key=lambda item: item.lower())
    joined = "\n".join(pairs).encode("utf-8")
    return hashlib.sha256(joined).hexdigest()
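
# The fingerprint is a SHA-256 over case-insensitively sorted "path<TAB>blob-id"
# pairs, so it changes only when a config file's content or path changes;
# doc/report churn never enters the hash.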


def _config_fingerprint_from_tree_api(
    repo_api: str, headers: dict[str, str], tree_id: str, workload_dir: str, backup_folder: str, reports_subdir: str
) -> str:
    if not tree_id:
        return ""
    url = f"{repo_api}/trees/{tree_id}?recursive=true&api-version=7.1"
    payload = _request_json(url, headers=headers)
    entries = payload.get("treeEntries", []) if isinstance(payload, dict) else []
    pairs: list[str] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        if str(entry.get("gitObjectType", "")).lower() != "blob":
            continue
        rel_path = str(entry.get("relativePath", ""))
        if not _is_workload_config_path(rel_path, workload_dir, backup_folder, reports_subdir):
            continue
        blob_id = str(entry.get("objectId", "")).strip()
        if not blob_id:
            continue
        pairs.append(f"{_normalize_repo_path(rel_path)}\t{blob_id}")
    if not pairs:
        return ""
    pairs.sort(key=lambda item: item.lower())
    joined = "\n".join(pairs).encode("utf-8")
    return hashlib.sha256(joined).hexdigest()


def _workload_config_diff_exists(
    repo_root: str,
    baseline_commitish: str,
    drift_commitish: str,
    workload_dir: str,
    backup_folder: str,
    reports_subdir: str,
) -> bool:
    baseline_fingerprint = _config_fingerprint_from_local_tree(
        repo_root=repo_root,
        commitish=baseline_commitish,
        workload_dir=workload_dir,
        backup_folder=backup_folder,
        reports_subdir=reports_subdir,
    )
    drift_fingerprint = _config_fingerprint_from_local_tree(
        repo_root=repo_root,
        commitish=drift_commitish,
        workload_dir=workload_dir,
        backup_folder=backup_folder,
        reports_subdir=reports_subdir,
    )
    if baseline_fingerprint and drift_fingerprint:
        return baseline_fingerprint != drift_fingerprint
    # Fingerprinting failed on at least one side; fall back to comparing raw
    # tree ids, and err on the side of reporting a diff if even that fails.
    try:
        return _tree_id_for_commitish(repo_root, baseline_commitish) != _tree_id_for_commitish(repo_root, drift_commitish)
    except Exception:
        return True
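
# Decision sketch: if both fingerprints computed, compare them; otherwise fall
# back to whole-tree ids; if even that fails, report a diff so drift is
# surfaced in a PR rather than silently suppressed.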


def _find_matching_abandoned_pr(
    repo_api: str,
    headers: dict[str, str],
    abandoned_prs: list[dict[str, Any]],
    drift_tree: str,
    repo_root: str,
    workload_dir: str,
    backup_folder: str,
    reports_subdir: str,
    drift_commitish: str,
) -> tuple[dict[str, Any] | None, str]:
    current_config_fingerprint = _config_fingerprint_from_local_tree(
        repo_root=repo_root,
        commitish=drift_commitish,
        workload_dir=workload_dir,
        backup_folder=backup_folder,
        reports_subdir=reports_subdir,
    )
    tree_fingerprint_cache: dict[str, str] = {}
    for pr in _latest_pr_by_creation(abandoned_prs):
        commit_id = (
            ((pr.get("lastMergeSourceCommit") or {}).get("commitId"))
            or ((pr.get("lastMergeCommit") or {}).get("commitId"))
            or ""
        ).strip()
        if not commit_id:
            continue
        try:
            pr_tree = _commit_tree_id(repo_api, headers, commit_id)
        except Exception:
            continue
        if pr_tree and pr_tree == drift_tree:
            return pr, "exact-tree"
        if current_config_fingerprint and pr_tree:
            if pr_tree not in tree_fingerprint_cache:
                try:
                    tree_fingerprint_cache[pr_tree] = _config_fingerprint_from_tree_api(
                        repo_api=repo_api,
                        headers=headers,
                        tree_id=pr_tree,
                        workload_dir=workload_dir,
                        backup_folder=backup_folder,
                        reports_subdir=reports_subdir,
                    )
                except Exception:
                    tree_fingerprint_cache[pr_tree] = ""
            if tree_fingerprint_cache[pr_tree] and tree_fingerprint_cache[pr_tree] == current_config_fingerprint:
                return pr, "config-fingerprint"
    return None, ""


def _pr_has_reject_vote(pr: dict[str, Any]) -> bool:
    reviewers = pr.get("reviewers", [])
    if not isinstance(reviewers, list):
        return False
    for reviewer in reviewers:
        if not isinstance(reviewer, dict):
            continue
        try:
            vote = int(reviewer.get("vote", 0))
        except Exception:
            vote = 0
        if vote == -10:
            return True
    return False
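
# Azure DevOps vote scale: 10 approved, 5 approved with suggestions, 0 no
# vote, -5 waiting for author, -10 rejected. Only a hard Reject (-10) makes
# the rejected-PR remediation path eligible.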


def _current_pr_merge_strategy(pr: dict[str, Any]) -> str:
    completion_options = pr.get("completionOptions")
    if not isinstance(completion_options, dict):
        return ""
    raw = str(completion_options.get("mergeStrategy") or "").strip()
    if not raw:
        return ""
    return _normalize_merge_strategy(raw)


def _build_description(workload: str, drift_branch: str, baseline_branch: str, build_number: str, build_id: str) -> str:
    is_entra = workload.lower() == "entra"
    lead = "Rolling Entra drift PR — backup pipeline" if is_entra else "Rolling drift PR — backup pipeline"
    return (
        f"{lead} run `{build_number}` (build {build_id})\n\n"
        f"Source: `{drift_branch}` → Target: `{baseline_branch}`\n"
    )
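
# Example output (hypothetical run values):
#   Rolling drift PR — backup pipeline run `20260421.3` (build 1234)
#   Source: `drift/intune` → Target: `main`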


def _threads_with_marker(repo_api: str, headers: dict[str, str], pr_id: int, marker: str) -> bool:
    url = f"{repo_api}/pullrequests/{pr_id}/threads?api-version=7.1"
    payload = _request_json(url, headers=headers)
    threads = payload.get("value", []) if isinstance(payload, dict) else []
    for thread in threads:
        for comment in thread.get("comments", []):
            content = str(comment.get("content", ""))
            if marker in content:
                return True
    return False


def _queue_restore_pipeline(
    collection_uri: str,
    project: str,
    headers: dict[str, str],
    definition_id: int,
    baseline_branch: str,
    include_entra_update: bool,
    dry_run: bool,
    update_assignments: bool,
    remove_unmanaged: bool,
    max_workers: int,
    exclude_csv: str,
) -> dict[str, Any]:
    build_api = f"{collection_uri}/{project}/_apis/build/builds?api-version=7.1"
    template_parameters = {
        "dryRun": dry_run,
        "updateAssignments": update_assignments,
        "removeObjectsNotInBaseline": remove_unmanaged,
        "includeEntraUpdate": include_entra_update,
        "baselineBranch": baseline_branch,
        "maxWorkers": max_workers,
    }
    exclude_csv = _normalize_exclude_csv(exclude_csv)
    if exclude_csv:
        template_parameters["excludeCsv"] = exclude_csv
    body = {
        "definition": {"id": definition_id},
        "sourceBranch": _ref_from_branch(baseline_branch),
        "templateParameters": template_parameters,
    }
    return _request_json(build_api, headers=headers, method="POST", body=body)
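
# Request body sketch (hypothetical values) for the ADO "Builds - Queue" call:
#   {"definition": {"id": 57}, "sourceBranch": "refs/heads/main",
#    "templateParameters": {"dryRun": false, "maxWorkers": 10, ...}}
# The templateParameters keys must match the restore pipeline's declared
# runtime parameters.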


def _post_pr_thread(repo_api: str, headers: dict[str, str], pr_id: int, content: str) -> None:
    url = f"{repo_api}/pullrequests/{pr_id}/threads?api-version=7.1"
    body = {
        "comments": [{"parentCommentId": 0, "content": content, "commentType": 1}],
        "status": "active",
    }
    _request_json(url, headers=headers, method="POST", body=body)
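
# commentType 1 is the plain-text comment type in the ADO thread model; the
# thread is created "active" so the note surfaces as an unresolved comment.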


def main() -> int:
    parser = argparse.ArgumentParser(description="Ensure rolling PR exists with optional remediation-on-rejection")
    parser.add_argument("--repo-root", required=True)
    parser.add_argument("--workload", required=True, choices=["intune", "entra"])
    parser.add_argument("--drift-branch", required=True)
    parser.add_argument("--baseline-branch", required=True)
    parser.add_argument("--pr-title", required=True)
    args = parser.parse_args()

    token = os.environ.get("SYSTEM_ACCESSTOKEN", "").strip()
    if not token:
        raise SystemExit("SYSTEM_ACCESSTOKEN is empty. Enable OAuth token access for this pipeline.")
    collection_uri = os.environ["SYSTEM_COLLECTIONURI"].rstrip("/")
    project = os.environ["SYSTEM_TEAMPROJECT"]
    repository_id = os.environ["BUILD_REPOSITORY_ID"]
    build_number = os.environ.get("BUILD_BUILDNUMBER", "")
    build_id = os.environ.get("BUILD_BUILDID", "")

    auto_remediate = _env_bool("AUTO_REMEDIATE_ON_PR_REJECTION", False)
    include_entra_update = _env_bool("AUTO_REMEDIATE_INCLUDE_ENTRA_UPDATE", False)
    remediation_def_id_raw = _env_text("AUTO_REMEDIATE_RESTORE_PIPELINE_ID", "")
    remediation_dry_run = _env_bool("AUTO_REMEDIATE_DRY_RUN", False)
    remediation_update_assignments = _env_bool("AUTO_REMEDIATE_UPDATE_ASSIGNMENTS", True)
    remediation_remove_unmanaged = _env_bool("AUTO_REMEDIATE_REMOVE_OBJECTS", False)
    remediation_max_workers_raw = _env_text("AUTO_REMEDIATE_MAX_WORKERS", "10")
    remediation_exclude_csv = _normalize_exclude_csv(_env_text("AUTO_REMEDIATE_EXCLUDE_CSV", ""))
    pr_merge_strategy = _normalize_merge_strategy(_env_text("ROLLING_PR_MERGE_STRATEGY", "rebase"))
    create_as_draft = _env_bool("ROLLING_PR_DELAY_REVIEWER_NOTIFICATIONS", False)
    try:
        remediation_max_workers = int(remediation_max_workers_raw)
    except ValueError as exc:
        raise SystemExit(f"Invalid AUTO_REMEDIATE_MAX_WORKERS value: {remediation_max_workers_raw}") from exc
    if auto_remediate and not remediation_def_id_raw:
        print(
            "WARNING: AUTO_REMEDIATE_ON_PR_REJECTION=true but AUTO_REMEDIATE_RESTORE_PIPELINE_ID is empty; "
            "remediation queueing disabled for this run.",
            file=sys.stderr,
        )
        auto_remediate = False
    try:
        remediation_def_id = int(remediation_def_id_raw) if remediation_def_id_raw else 0
    except ValueError as exc:
        raise SystemExit(
            f"Invalid AUTO_REMEDIATE_RESTORE_PIPELINE_ID value: {remediation_def_id_raw}"
        ) from exc

    drift_branch = _normalize_branch(args.drift_branch)
    baseline_branch = _normalize_branch(args.baseline_branch)
    backup_folder = _env_text("BACKUP_FOLDER", "tenant-state")
    reports_subdir = _env_text("REPORTS_SUBDIR", "reports")
    workload_dir = _env_text(
        "INTUNE_BACKUP_SUBDIR" if args.workload == "intune" else "ENTRA_BACKUP_SUBDIR",
        args.workload,
    )
    source_ref = _ref_from_branch(drift_branch)
    target_ref = _ref_from_branch(baseline_branch)
    repo_api = f"{collection_uri}/{project}/_apis/git/repositories/{repository_id}"
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    description = _build_description(args.workload, drift_branch, baseline_branch, build_number, build_id)
    completion_options = {"mergeStrategy": pr_merge_strategy}
    print(f"Rolling PR completion merge strategy: {pr_merge_strategy}")

    # Reuse an existing active rolling PR; patch its metadata only when the
    # title, description, or merge strategy actually drifted.
    active_prs = _query_prs(repo_api, headers, source_ref, target_ref, "active")
    if active_prs:
        pr = active_prs[0]
        pr_id = pr.get("pullRequestId")
        current_title = str(pr.get("title") or "")
        current_description = str(pr.get("description") or "")
        current_merge_strategy = _current_pr_merge_strategy(pr)
        desired_description = current_description if current_description.strip() else description
        needs_patch = (
            current_title != args.pr_title
            or not current_description.strip()
            or current_merge_strategy != pr_merge_strategy
        )
        if needs_patch:
            update_url = f"{repo_api}/pullrequests/{pr_id}?api-version=7.1"
            _request_json(
                update_url,
                headers=headers,
                method="PATCH",
                body={
                    "title": args.pr_title,
                    "description": desired_description,
                    "completionOptions": completion_options,
                },
            )
        web_url = _pr_web_url(pr)
        if needs_patch:
            print(f"Updated rolling {args.workload} PR #{pr_id}: {web_url}")
        else:
            print(f"Rolling {args.workload} PR #{pr_id} already up to date: {web_url}")
        print(f"##vso[task.setvariable variable=DRIFT_PR_ID;isOutput=true]{pr_id}")
        if web_url:
            print(f"##vso[task.setvariable variable=DRIFT_PR_URL;isOutput=true]{web_url}")
        print("##vso[task.setvariable variable=DRIFT_PR_SUPPRESSED;isOutput=true]0")
        return 0

    # No active PR: inspect the drift branch before deciding to (re)create one.
    _run_git(args.repo_root, ["fetch", "--quiet", "origin", baseline_branch])
    try:
        _run_git(args.repo_root, ["fetch", "--quiet", "origin", drift_branch])
    except RuntimeError as exc:
        if "couldn't find remote ref" in str(exc).lower() or "could not find remote ref" in str(exc).lower():
            pass  # Drift branch may not exist yet; fall back to HEAD below.
        else:
            raise
    baseline_commitish = (
        f"origin/{baseline_branch}" if _ref_has_commit(args.repo_root, f"origin/{baseline_branch}") else baseline_branch
    )
    drift_commitish = f"origin/{drift_branch}" if _ref_has_commit(args.repo_root, f"origin/{drift_branch}") else "HEAD"
    if not _workload_config_diff_exists(
        repo_root=args.repo_root,
        baseline_commitish=baseline_commitish,
        drift_commitish=drift_commitish,
        workload_dir=workload_dir,
        backup_folder=backup_folder,
        reports_subdir=reports_subdir,
    ):
        print(
            "Suppressed PR recreation: drift branch has no effective workload configuration diff "
            f"against {baseline_branch}."
        )
        print("##vso[task.setvariable variable=DRIFT_PR_SUPPRESSED;isOutput=true]1")
        return 0

    drift_tree = _tree_id_for_commitish(args.repo_root, drift_commitish)
    abandoned_prs = _query_prs(repo_api, headers, source_ref, target_ref, "abandoned")
    matching_abandoned, match_reason = _find_matching_abandoned_pr(
        repo_api=repo_api,
        headers=headers,
        abandoned_prs=abandoned_prs,
        drift_tree=drift_tree,
        repo_root=args.repo_root,
        workload_dir=workload_dir,
        backup_folder=backup_folder,
        reports_subdir=reports_subdir,
        drift_commitish=drift_commitish,
    )
    if matching_abandoned:
        if match_reason == "config-fingerprint":
            print(
                "Matched abandoned PR using configuration fingerprint "
                "(ignoring docs/reports churn)."
            )
        pr_id = int(matching_abandoned["pullRequestId"])
        if not _pr_has_reject_vote(matching_abandoned):
            print(
                "Matched abandoned PR without reviewer Reject vote; "
                "skipping remediation and suppressing PR recreation for this unchanged drift snapshot."
            )
            print("##vso[task.setvariable variable=DRIFT_PR_SUPPRESSED;isOutput=true]1")
            return 0
        if not auto_remediate:
            print(
                "Suppressed PR recreation: latest drift matches a rejected PR, "
                "but AUTO_REMEDIATE_ON_PR_REJECTION is disabled."
            )
            print("##vso[task.setvariable variable=DRIFT_PR_SUPPRESSED;isOutput=true]1")
            return 0
        marker = f"Automation marker: AUTO-REMEDIATE-TREE:{drift_tree}"
        already_queued = _threads_with_marker(repo_api, headers, pr_id, marker)
        if already_queued:
            print(
                "Suppressed PR recreation: latest drift matches a previously rejected PR and remediation was already queued."
            )
        else:
            queued = _queue_restore_pipeline(
                collection_uri=collection_uri,
                project=project,
                headers=headers,
                definition_id=remediation_def_id,
                baseline_branch=baseline_branch,
                include_entra_update=include_entra_update,
                dry_run=remediation_dry_run,
                update_assignments=remediation_update_assignments,
                remove_unmanaged=remediation_remove_unmanaged,
                max_workers=remediation_max_workers,
                exclude_csv=remediation_exclude_csv,
            )
            build_queued_id = queued.get("id")
            build_url = ((queued.get("_links") or {}).get("web") or {}).get("href", "")
            if not build_url and build_queued_id:
                build_url = f"{collection_uri}/{project}/_build/results?buildId={build_queued_id}"
            comment = (
                "Auto-remediation queued because the latest drift matches a rejected PR.\n\n"
                f"Workload: {args.workload}\n"
                f"Rejected PR: #{pr_id}\n"
                f"Drift tree: {drift_tree}\n"
                f"Restore pipeline definition: {remediation_def_id}\n"
                f"Restore run: {build_url or '(queued)'}\n\n"
                f"{marker}"
            )
            try:
                _post_pr_thread(repo_api, headers, pr_id, comment)
            except Exception as exc:
                print(f"WARNING: Remediation queued, but failed to post PR thread on #{pr_id}: {exc}")
            print(
                f"Queued remediation pipeline run (definition={remediation_def_id}, buildId={build_queued_id}) and suppressed PR recreation."
            )
        print("##vso[task.setvariable variable=DRIFT_PR_SUPPRESSED;isOutput=true]1")
        return 0

    if abandoned_prs:
        print(
            f"No abandoned PR snapshot match for current drift tree (checked {len(abandoned_prs)} abandoned PR(s)); creating/updating rolling PR."
        )
    create_url = f"{repo_api}/pullrequests?api-version=7.1"
    created = _request_json(
        create_url,
        headers=headers,
        method="POST",
        body={
            "sourceRefName": source_ref,
            "targetRefName": target_ref,
            "title": args.pr_title,
            "description": description,
            "isDraft": create_as_draft,
            "completionOptions": completion_options,
        },
    )
    pr_id = created.get("pullRequestId")
    web_url = _pr_web_url(created)
    print(f"Created rolling {args.workload} PR #{pr_id}: {web_url}")
    print(f"##vso[task.setvariable variable=DRIFT_PR_ID;isOutput=true]{pr_id}")
    if web_url:
        print(f"##vso[task.setvariable variable=DRIFT_PR_URL;isOutput=true]{web_url}")
    print("##vso[task.setvariable variable=DRIFT_PR_SUPPRESSED;isOutput=true]0")
    return 0
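
# Typical invocation from the pipeline (hypothetical branch names/title):
#   python ensure_rolling_pr.py --repo-root "$(Build.SourcesDirectory)" \
#     --workload intune --drift-branch drift/intune \
#     --baseline-branch main --pr-title "Rolling Intune drift"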


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except Exception as exc:
        print(f"ERROR: Failed to ensure rolling PR: {exc}", file=sys.stderr)
        raise