claude-code/plugins/security-guidance/hooks/gitutil.py

"""
Leaf git/subprocess helpers and diff parsing for the security-guidance plugin.

Everything here is a thin wrapper over ``git``/``subprocess`` plus pure
diff-text parsing and source-file classification. None of these functions
reference any name that the test suite monkeypatches on
``security_reminder_hook`` and then calls *through* another function in this
module — that property is what makes them safe to live in their own module
while still being re-exported (so tests that patch ``hook._git_toplevel`` and
then call a handler in ``security_reminder_hook`` continue to see the patched
binding).

Functions that DO compose patched leaves (``compute_v2_review_set``,
``_list_untracked``, ``_append_reviewed_shas``) deliberately remain in
``security_reminder_hook.py`` for that reason.
"""
import contextlib
import os
import re
import subprocess

from _base import debug_log


GIT_CMD = [
    "git",
    "-c", "core.fsmonitor=false",
    "-c", "core.hooksPath=/dev/null",
]


def _git_rev_parse_head(cwd):
    """Return the current HEAD SHA, or None if not a git repo / no commits."""
    try:
        result = subprocess.run(
            [*GIT_CMD, "rev-parse", "HEAD"],
            cwd=cwd, capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0 and result.stdout.strip():
            return result.stdout.strip()
        return None
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None


def _find_git_index(cwd):
    """
    Find the real index file for a git repo. Handles worktrees where .git
    is a file pointing to the main repo's gitdir.
    Returns the absolute path to the index file, or None.
    """
    try:
        result = subprocess.run(
            [*GIT_CMD, "rev-parse", "--git-dir"],
            cwd=cwd, capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return None
        git_dir = result.stdout.strip()
        if not os.path.isabs(git_dir):
            git_dir = os.path.join(cwd, git_dir)
        index_path = os.path.join(git_dir, "index")
        return index_path if os.path.isfile(index_path) else None
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None


def _diff_pathspec(cwd, paths):
    """Convert absolute touched-paths to repo-relative pathspec args for
    git diff. Paths outside cwd (e.g. ~/.claude/…) are dropped. Returns the
    list to splice after `--`, or [] for an unrestricted diff. realpath both
    sides so the macOS /var ↔ /private/var symlink doesn't make in-repo
    paths look external."""
    if not paths:
        return []
    cwd_abs = os.path.realpath(cwd)
    rel = []
    for p in paths:
        try:
            r = os.path.relpath(os.path.realpath(p), cwd_abs)
        except ValueError:
            continue
        if r.startswith(".."):
            continue
        rel.append(r)
    return ["--"] + rel if rel else []


@contextlib.contextmanager
def _temp_index(cwd, untracked_paths=None):
    """Yield an env dict pointing GIT_INDEX_FILE at a throwaway copy of the
    repo's index with `git add --intent-to-add` applied, so untracked files
    show up in subsequent `git diff` calls without touching the user's real
    index. Yields None if no index can be found (bare repo / not a repo); the
    caller should fall back to a plain diff. Always cleans up the temp file.

    Perf: when `untracked_paths` is given, only those paths are added (O(n)
    in untracked count). The default `add -N .` stats every file in the
    worktree — slow in large repos vs fast targeted scan. v2 callers
    already know the untracked set from `git status --porcelain`, so they
    pass it; v1 keeps the whole-tree scan since it has no prior list."""
    import shutil
    import tempfile

    real_index = _find_git_index(cwd)
    if not real_index:
        yield None
        return

    tmp_fd, tmp_index = tempfile.mkstemp(prefix="security_hook_idx_")
    os.close(tmp_fd)
    try:
        shutil.copy2(real_index, tmp_index)
        env = {**os.environ, "GIT_INDEX_FILE": tmp_index}
        if untracked_paths is None:
            add_args = ["."]
        elif untracked_paths:
            # `git add -N -- a b nonexistent` is atomic — one missing path
            # makes it exit 128 and add NOTHING, so a file removed between
            # `git status` and here would silently drop ALL untracked files
            # from the diff. --ignore-missing only works with --dry-run, so
            # filter to surviving paths (lexists so dangling symlinks count).
            surviving = [p for p in untracked_paths
                         if os.path.lexists(os.path.join(cwd, p))]
            add_args = ["--"] + surviving if surviving else None
        else:
            add_args = None
        if add_args:
            subprocess.run(
                [*GIT_CMD, "add", "--intent-to-add"] + add_args,
                cwd=cwd, capture_output=True, text=True, timeout=10,
                env=env,
            )
        yield env
    finally:
        try:
            os.unlink(tmp_index)
        except OSError:
            pass


def _git_toplevel(cwd):
    """Absolute repo root for `cwd`, or None if not in a work tree."""
    try:
        r = subprocess.run(
            [*GIT_CMD, "rev-parse", "--show-toplevel"],
            cwd=cwd, capture_output=True, text=True, timeout=5,
        )
        return r.stdout.strip() if r.returncode == 0 and r.stdout.strip() else None
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None


def _git_dir(repo_root):
    """Absolute shared `.git` directory for repo_root.

    Uses `rev-parse --git-common-dir` so linked worktrees resolve to the
    SHARED gitdir, not the per-worktree `.git/worktrees/<name>/`. That way
    push-sweep's reviewed-shas record (and the bash-hook-once sentinel)
    is per-clone — a commit reviewed in one worktree counts as reviewed
    if a different worktree later pushes it. Returns None on failure so
    callers can degrade (push-sweep state is best-effort).
    """
    try:
        r = subprocess.run(
            [*GIT_CMD, "rev-parse", "--git-common-dir"],
            cwd=repo_root, capture_output=True, text=True, timeout=5,
        )
        if r.returncode != 0:
            return None
        d = r.stdout.strip()
        return d if os.path.isabs(d) else os.path.join(repo_root, d)
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None


def _git_rev_list_range(repo_root, base, head="HEAD"):
    """Shas in `base..head`, oldest→newest. Empty list on error."""
    try:
        r = subprocess.run(
            [*GIT_CMD, "rev-list", "--reverse", f"{base}..{head}"],
            cwd=repo_root, capture_output=True, text=True, timeout=10,
        )
        if r.returncode != 0:
            return []
        return [s for s in r.stdout.strip().split("\n") if s]
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return []


def _git_diff_range(repo_root, base, head="HEAD"):
    """`git diff -p base head` as text on success, None on error.

    Distinguishing failure from success-with-empty-diff matters: the push-sweep
    caller marks the tail reviewed when the diff is empty (nothing to review),
    but on failure (timeout, non-zero exit, missing git) it must NOT mark
    them reviewed — otherwise unreviewed commits get permanently silenced.
    """
    try:
        r = subprocess.run(
            [*GIT_CMD, "diff", "-p", "--no-color", "--no-ext-diff", base, head],
            cwd=repo_root, capture_output=True, timeout=30,
        )
        if r.returncode != 0:
            return None
        return r.stdout.decode("utf-8", errors="replace")
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None


def _detect_main_branch(repo_root):
    for ref in ("origin/HEAD", "origin/main", "origin/master", "main", "master"):
        try:
            r = subprocess.run(
                [*GIT_CMD, "rev-parse", "--verify", "-q", ref],
                cwd=repo_root, capture_output=True, text=True, timeout=5,
            )
            if r.returncode == 0 and r.stdout.strip():
                return ref
        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
            pass
    return None


def _git_reflog_recent_commits(repo_root, max_age_s=120, max_n=5):
    """Return (fresh_commit_shas, stale_count) from the HEAD reflog.

    Scans the last `max_n` reflog entries and returns the SHAs whose action is
    `commit*` AND whose commit timestamp is within `max_age_s` of now,
    newest-first. `stale_count` is the number of commit-action entries that
    were too old (so the caller can distinguish "no commit happened" from
    "commit happened earlier than the window").

    Used by commit-review when stdout-based `[branch sha]` detection fails
    (output piped/redirected/-q, or a chained command after `git commit`
    pushed the success line off — `git commit && git push` makes HEAD@{0}
    `update by push`, not `commit:`). The HEAD@{0}-only check
    keeps the not-yet-visible-HEAD skip rare; analysis showed the
    residual is dominated by these chained-command and noop-guard cases.

    Safety vs. blindly reading HEAD:
      - cross-repo (`cd ../other && git commit`): repo_root's own reflog has
        no fresh commit, so this returns ([], 0).
      - commit actually failed (pre-commit reject, nothing-staged): reflog's
        recent entries are the prior checkout/commit/reset → ([], 0) or only
        stale entries.
      - HEAD raced ahead (a second commit landed before this async hook ran):
        both commits appear in the scan and both get reviewed — correct.
      - prior Bash call's commit within the window: would be returned here,
        but the call site deduplicates against `.git/sg-reviewed-shas` so a
        SHA is reviewed at most once. This is also the non-overlap invariant
        with push-sweep.
    """
    if not repo_root:
        return [], 0
    try:
        # %gs (the reflog subject) is `commit: <commit-msg first line>` and can
        # contain `|`; put it LAST so split("|", 2) leaves it intact. %H is
        # hex and %ct is integer, so the first two fields are delimiter-safe.
        r = subprocess.run(
            [*GIT_CMD, "log", "-g", "-n", str(max_n),
             "--format=%H|%ct|%gs", "HEAD"],
            cwd=repo_root, capture_output=True, text=True, timeout=5,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return [], 0
    if r.returncode != 0:
        return [], 0
    import time as _time
    now = int(_time.time())
    fresh, stale = [], 0
    for idx, line in enumerate(r.stdout.splitlines()):
        parts = line.split("|", 2)
        if len(parts) != 3:
            continue
        sha, ct, subject = parts
        # `commit: msg`, `commit (amend): msg`, `commit (initial): msg`,
        # `commit (merge): msg` — all create a reviewable commit object.
        if not subject.startswith("commit"):
            continue
        try:
            age = now - int(ct)
        except ValueError:
            continue
        # HEAD@{0} (idx==0) is exempt from the age gate. The gate exists to
        # bound the WIDENED HEAD@{1..max_n-1} scan from picking up commits
        # made by *prior* Bash calls; HEAD@{0} is by definition the most
        # recent reflog entry and was previously accepted unconditionally
        # (_git_reflog_head_if_just_committed previously had no age check).
        # Applying max_age_s to idx==0 made the not-yet-visible-HEAD skip
        # noticeably more frequent on chained
        # `git commit && <slow command>` where %ct is >120s old by the
        # time the async PostToolUse hook fires.
        if idx == 0 or age <= max_age_s:
            fresh.append(sha)
        else:
            stale += 1
    return fresh, stale


def _git_name_only(cwd, base, include_untracked=False):
    """Return the set of repo-root-relative paths that differ from `base`,
    or None if git failed (unresolvable ref, not a repo, timeout). Callers
    must distinguish None (error → don't trust as a filter) from set()
    (genuinely nothing changed). `-c core.quotePath=false -z` keeps non-ASCII
    and space-containing paths intact."""
    def _run(env):
        result = subprocess.run(
            [*GIT_CMD, "-c", "core.quotePath=false", "diff", "--name-only", "-z", base],
            cwd=cwd, capture_output=True, text=True, timeout=30,
            env=env,
        )
        if result.returncode != 0:
            debug_log(f"_git_name_only({base!r}) rc={result.returncode}: {result.stderr[:200]}")
            return None
        return {p for p in result.stdout.split("\0") if p}

    try:
        if not include_untracked:
            return _run(None)
        with _temp_index(cwd) as env:
            return _run(env)
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
        debug_log(f"_git_name_only({base!r}) error: {e}")
        return None


def _git_status_porcelain(cwd):
    """One `git status --porcelain=v1 -z` → (tracked_dirty, untracked) sets of
    repo-root-relative paths, or (None, None) on error. Replaces the
    `_temp_index + git diff HEAD --name-only` pair for the v2 dirty_now
    computation: faster in large repos, and yields the
    untracked set separately so the later get_git_diff can do a targeted
    `add -N -- <files>` instead of a whole-tree `add -N .`.

    -uall: list individual files inside untracked directories (default
    collapses to `dir/`). Required so the untracked set subtracts cleanly
    against the UPS-time `_list_untracked` snapshot, which uses ls-files and
    therefore always lists individual files."""
    try:
        r = subprocess.run(
            [*GIT_CMD, "-c", "core.quotePath=false", "status",
             "--porcelain=v1", "-uall", "-z"],
            cwd=cwd, capture_output=True, text=True, timeout=30,
        )
        if r.returncode != 0:
            debug_log(f"_git_status_porcelain rc={r.returncode}: {r.stderr[:200]}")
            return None, None
        tracked, untracked = set(), set()
        entries = r.stdout.split("\0")
        i = 0
        while i < len(entries):
            e = entries[i]
            if not e:
                i += 1
                continue
            xy, path = e[:2], e[3:]
            if xy == "??":
                untracked.add(path)
            else:
                tracked.add(path)
                # Rename/copy entries are XY old\0new\0 — second NUL field is
                # the origin path; consume it so it isn't misparsed as a new
                # 2-char-status entry.
                if "R" in xy or "C" in xy:
                    i += 1
            i += 1
        return tracked, untracked
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
        debug_log(f"_git_status_porcelain error: {e}")
        return None, None


def _is_ancestor(cwd, maybe_ancestor, descendant):
    """True if `maybe_ancestor` is reachable from `descendant` (i.e. HEAD
    moved forward via commit/merge, not sideways via checkout)."""
    try:
        result = subprocess.run(
            [*GIT_CMD, "merge-base", "--is-ancestor", maybe_ancestor, descendant],
            cwd=cwd, capture_output=True, text=True, timeout=5,
        )
        return result.returncode == 0
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return False


def get_git_diff(cwd, baseline_sha, full_context=False, paths=None, untracked_paths=None):
    """
    Get the git diff between the baseline SHA and the current working tree,
    including untracked (new) files.

    Uses a temporary copy of the git index (GIT_INDEX_FILE) so the user's
    real index is never modified. The temp index gets intent-to-add entries
    for untracked files, making them visible in the diff output. Cleanup
    is just deleting the temp file in a finally block.

    If `paths` is given, the diff is restricted to those paths (relative to
    cwd; absolute paths are converted, paths outside cwd are dropped).
    `untracked_paths` (repo-root-relative) is forwarded to _temp_index so it
    can add only those files instead of scanning the whole worktree.
    """
    pathspec = _diff_pathspec(cwd, paths)
    if paths and not pathspec:
        # Caller restricted to specific paths but none are inside this repo
        # (e.g. only ~/.claude/... edits). Returning "" flows to skip(6); an
        # empty pathspec would mean an UNRESTRICTED diff — the bug this whole
        # change exists to fix.
        return ""

    cmd = [*GIT_CMD, "diff", "--no-color", "--no-ext-diff", baseline_sha] + (["--unified=99999"] if full_context else []) + pathspec
    try:
        with _temp_index(cwd, untracked_paths) as env:
            # env is None when no index could be found (bare repo / not a
            # repo) — diff still runs, just without untracked-file support.
            result = subprocess.run(cmd, cwd=cwd, capture_output=True, timeout=30, env=env)
        if result.returncode != 0:
            debug_log(f"git diff failed: {result.stderr[:200].decode('utf-8', errors='replace')}")
            return None
        # Decode with errors='replace' so binary diffs don't crash
        return result.stdout.decode("utf-8", errors="replace")
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
        debug_log(f"git diff error: {e}")
        return None


# Source file extensions worth reviewing for security
SOURCE_CODE_EXTENSIONS = {
    '.py', '.js', '.ts', '.jsx', '.tsx', '.go', '.java', '.rb', '.php',
    '.rs', '.c', '.cpp', '.h', '.hpp', '.cs', '.swift', '.kt', '.scala',
    '.html', '.htm', '.ejs', '.yaml', '.yml', '.properties',
    '.mjs', '.cjs', '.mts', '.cts', '.vue', '.svelte',
    '.sh', '.bash', '.zsh', '.fish', '.ksh', '.ps1', '.sql',
    '.gradle', '.groovy',
    '.tf', '.hcl', '.tfvars',
    '.json', '.toml', '.ipynb',
}

# Reviewable files identified by basename rather than extension (lowercased).
# These are by-convention extensionless but contain executable recipes/DSL
# with shell/exec surface (Make recipes, Jenkinsfile Groovy, Rakefile Ruby).
SOURCE_CODE_BASENAMES = {
    'dockerfile', 'makefile', 'gnumakefile', 'jenkinsfile', 'vagrantfile',
    'rakefile', 'gemfile', 'procfile', 'brewfile', 'justfile',
}

# Extensionless basenames that are NOT source — plain-text metadata. Anything
# extensionless not in this set is treated as source (likely a shebang script
# under bin/ or scripts/). Analysis of skipped reviews found
# extensionless executables (bin/deploy, scripts/run-canary) were the largest
# remaining false-negative class — they carry shell-injection surface but
# `splitext` gives '' so they were filtered out. _cap_files_for_prompt bounds
# the byte cost downstream, and the reviewer ignores prose, so opting
# extensionless IN with this small deny-list is the better default than
# opting OUT.
NON_SOURCE_EXTENSIONLESS_BASENAMES = {
    'license', 'licence', 'copying', 'notice', 'patents', 'authors',
    'contributors', 'maintainers', 'changelog', 'changes', 'news',
    'readme', 'todo', 'install', 'version', 'codeowners',
    'owners', 'copyright',
}

# Directory components and file suffixes that are never worth reviewing even
# when the extension is in SOURCE_CODE_EXTENSIONS — vendored deps, build
# output, generated code, minified bundles, lockfiles, protobuf stubs.
# Matched as path *components* (so `node_modules/` matches anywhere in the
# path, not just as a prefix) and as case-sensitive suffixes (the ecosystems
# that emit `.min.js` / `_pb2.py` / `.pb.go` are case-consistent).
SKIP_PATH_PATTERNS = (
    'node_modules/', 'dist/', 'build/', '.next/', 'vendor/',
    '__generated__/', '__pycache__/', '.venv/', 'target/',
)
SKIP_FILE_SUFFIXES = (
    '.min.js', '.min.css', '.d.ts', '.d.mts', '.d.cts',
    '.lock', '_pb2.py', '.pb.go',
)

# Path tokens that bump a file's review priority when a commit exceeds
# MAX_DIFF_FILES and we have to pick a subset. These are exactly the surfaces
# single-shot and agentic reviews disagree on most (auth, routing, IPC,
# subprocess, deserialization). Matched as lowercase substrings against the
# path; not regex — keep it cheap.
_SECURITY_RISK_PATH_TOKENS = (
    "auth", "login", "session", "token", "secret", "credential", "perm",
    "acl", "rbac", "iam", "policy",
    "route", "handler", "controller", "endpoint", "api/", "/api", "gateway",
    "middleware", "view",
    "exec", "subprocess", "shell", "spawn", "command",
    "client", "request", "fetch", "http", "url",
    "serialize", "pickle", "yaml", "parse", "deser",
    # Short tokens that would substring-match unrelated names (`format`,
    # `transform`, `sandbox`, `platform`) are intentionally omitted —
    # `sql`/`query` already cover the DB surface.
    "sql", "query",
)
# Suffixes that pass _is_reviewable_source but are almost always low-signal
# in large scaffolds — generated clients, migrations, test fixtures, config
# shims. These go to the BACK of the priority sort, not dropped outright.
_LOW_PRIORITY_SUFFIXES = (
    ".gen.ts", ".gen.tsx", ".generated.ts", "_gen.py",
    ".test.ts", ".test.tsx", ".test.py", ".spec.ts", ".spec.js",
    ".config.js", ".config.ts", ".config.mjs", ".config.cjs",
)
_LOW_PRIORITY_PATH_TOKENS = (
    "/migrations/", "/alembic/versions/", "/__tests__/", "/fixtures/",
)


def _prioritize_diff_files(diff_files, cap):
    """When `diff_files` exceeds `cap`, return the top-`cap` by security
    relevance plus the count dropped. Otherwise return (diff_files, 0).

    Score = (risk_tokens_in_path, not_low_priority, added_lines). The
    added-lines proxy is `content.count('\\n+')` which counts diff additions
    cheaply without re-parsing hunks. This is a heuristic, not a guarantee —
    the goal is to review the likely-dangerous subset of an over-cap diff
    instead of reviewing nothing. Diffs that exceed the cap are typically
    large multi-file scaffolds, and the cross-file source→sink vulnerabilities
    in them concentrate in a handful of api/client/route files.
    """
    if len(diff_files) <= cap:
        return diff_files, 0

    def _score(item):
        fp, content = item
        low = fp.lower()
        # Prepend "/" so leading-slash patterns in _LOW_PRIORITY_PATH_TOKENS
        # match top-level dirs (git diff paths are repo-root-relative, e.g.
        # `migrations/001.py` not `/migrations/001.py`). Same trick as
        # _is_reviewable_source.
        low_slashed = "/" + low
        risk = sum(1 for t in _SECURITY_RISK_PATH_TOKENS if t in low)
        low_prio = (
            fp.endswith(_LOW_PRIORITY_SUFFIXES)
            or any(t in low_slashed for t in _LOW_PRIORITY_PATH_TOKENS)
        )
        # added_lines: count('\n+') over-counts by including '+++' header and
        # any literal '+' at line start in context, but it's a consistent
        # ordinal across files in the same diff which is all we need.
        added = content.count("\n+")
        return (risk, not low_prio, added)

    ranked = sorted(diff_files, key=_score, reverse=True)
    return ranked[:cap], len(diff_files) - cap


def _is_reviewable_source(file_path):
    # Normalize for component matching: a path like `.next/x.js` or
    # `pkg/node_modules/y.ts` should both be excluded; matching against
    # `'/' + path` lets each pattern be checked as `'/' + p in '/' + path`
    # without false-positiving on `rebuild/` matching `build/`.
    norm = "/" + file_path.replace("\\", "/")
    if any(("/" + p) in norm for p in SKIP_PATH_PATTERNS):
        return False
    if file_path.endswith(SKIP_FILE_SUFFIXES):
        return False
    ext = os.path.splitext(file_path)[1].lower()
    if ext in SOURCE_CODE_EXTENSIONS:
        return True
    base = os.path.basename(file_path).lower()
    # Accept dot-suffixed variants too: `Dockerfile.dev`, `Makefile.am`,
    # `Jenkinsfile.release`. splitext gives ext='.dev'/'.am' for these so they
    # miss both the extension check and the exact-basename check otherwise.
    if base in SOURCE_CODE_BASENAMES \
            or base.split(".", 1)[0] in SOURCE_CODE_BASENAMES:
        return True
    # Extensionless files default to reviewable unless they're known
    # plain-text metadata or dotfiles. Covers shebang scripts under bin/ or
    # scripts/ (`deploy`, `run-canary`, `entrypoint`) which carry
    # shell-injection surface but were previously filtered out — the largest
    # remaining false-negative class for extensionless files. Dotfiles (`.gitignore`,
    # `.nvmrc`, `.env`) are config, not code; `.bashrc`-style runnables are
    # rare in repos and not worth the noise. The deny-list is prefix-aware on
    # `-`/`_` so dual-license / i18n variants (`LICENSE-MIT`, `README-CN`)
    # don't fall through as source.
    if ext == "" and not base.startswith("."):
        if any(base == x or base.startswith(x + "-") or base.startswith(x + "_")
               for x in NON_SOURCE_EXTENSIONLESS_BASENAMES):
            return False
        return True
    return False


def extract_file_paths_from_diff(diff_output):
    """
    Extract file paths from unified diff output (without content).
    Only includes files with source code extensions.
    Returns a list of file paths.
    """
    if not diff_output or not diff_output.strip():
        return []

    paths = []
    file_diffs = diff_output.split("diff --git ")

    for file_diff in file_diffs:
        if not file_diff.strip():
            continue
        lines = file_diff.split('\n')
        header_match = re.match(r'^a/(.+?) b/(.+)$', lines[0])
        if not header_match:
            continue
        file_path = header_match.group(2) or header_match.group(1) or ''
        if not _is_reviewable_source(file_path):
            continue
        paths.append(file_path)

    return paths


def parse_diff_into_files(diff_output):
    """
    Parse unified diff output into a list of (file_path, diff_content) tuples.
    Only includes files with source code extensions.
    """
    if not diff_output or not diff_output.strip():
        return []

    files = []
    file_diffs = diff_output.split("diff --git ")

    for file_diff in file_diffs:
        if not file_diff.strip():
            continue

        # Extract filename from first line: "a/path/to/file b/path/to/file"
        lines = file_diff.split('\n')
        header_match = re.match(r'^a/(.+?) b/(.+)$', lines[0])
        if not header_match:
            continue

        file_path = header_match.group(2) or header_match.group(1) or ''

        # Filter to source code files only
        if not _is_reviewable_source(file_path):
            continue

        # Extract the diff content (from first @@ onwards)
        diff_lines = []
        in_hunks = False
        for line in lines[1:]:
            if line.startswith('@@'):
                in_hunks = True
            if in_hunks:
                diff_lines.append(line)

        if diff_lines:
            files.append((file_path, '\n'.join(diff_lines)))

    return files


def filter_preexisting_from_diff(diff_files, cwd, baseline_sha):
    """
    Filter out pre-existing content from diff files.
    When a file is fully rewritten (Write tool replaces entire content),
    git shows all lines as removed (-) then re-added (+). This function
    detects such rewrites and strips lines from the + section that also
    appeared in the - section, so the LLM reviewer only sees truly new code.
    """
    if not baseline_sha:
        return diff_files

    filtered = []
    for file_path, diff_content in diff_files:
        lines = diff_content.split('\n')

        # Collect removed and added lines (stripping the +/- prefix)
        removed_lines = set()
        added_lines = []
        for line in lines:
            if line.startswith('-') and not line.startswith('---'):
                removed_lines.add(line[1:].strip())
            elif line.startswith('+') and not line.startswith('+++'):
                added_lines.append(line[1:].strip())

        if not removed_lines:
            # New file, no pre-existing content to filter
            filtered.append((file_path, diff_content))
            continue

        # Check what fraction of added lines were pre-existing
        preexisting_count = sum(1 for l in added_lines if l in removed_lines)
        if preexisting_count == 0:
            filtered.append((file_path, diff_content))
            continue

        added_lines_set = set(added_lines)

        # Rebuild diff with pre-existing lines converted to context (space prefix).
        # Known imprecision: .strip() matches across indentation (so reindented
        # code is treated as unchanged) and the set lets one removal mask N
        # additions of the same stripped text. Accepted trade-off — this filter
        # exists for the full-file Write rewrite case where exact-match would
        # miss everything; the diff-review prompt's previous-findings recheck
        # is the backstop.
        new_lines = []
        for line in lines:
            if line.startswith('+') and not line.startswith('+++'):
                content = line[1:].strip()
                if content in removed_lines:
                    # Convert to context line (pre-existing, not new)
                    new_lines.append(' ' + line[1:])
                else:
                    new_lines.append(line)
            elif line.startswith('-') and not line.startswith('---'):
                content = line[1:].strip()
                if content in added_lines_set:
                    # Skip removed lines that were re-added (they become context)
                    continue
                else:
                    new_lines.append(line)
            else:
                new_lines.append(line)

        filtered.append((file_path, '\n'.join(new_lines)))

    return filtered