#!/usr/bin/env python3
"""
scrub_secrets.py — deterministic secret scrubber for the agent-learning-compounder skill.

Reads text from stdin (or a file path argument), writes scrubbed text to stdout.
Conservative by design: when a pattern matches, the match is replaced with
[REDACTED:<kind>]. False positives are preferred over leaks.

Usage:
    cat session.jsonl | python scrub_secrets.py
    python scrub_secrets.py path/to/file.txt
    python scrub_secrets.py --list-patterns
    python scrub_secrets.py --help

Exit codes:
    0  scrubbing completed (also returned by --list-patterns and --help)
    1  I/O error (the input could not be read)
    2  invalid command-line arguments (argparse usage error)

Integration in the skill:
    Mode 2 (distill-sessions) MUST pipe every quoted fragment through this
    script before writing it to the report. If the scrubbed output contains
    any [REDACTED:*] marker, the quote is dropped and replaced with a count
    or paraphrase that contains no secret-shaped content.
"""

from __future__ import annotations

import argparse
import re
import sys
from typing import Iterable, Pattern


# Ordered table of (label, compiled regex) pairs applied by scrub().
#
# Order matters: more specific patterns come first so they claim a string
# before the generic catch-alls. When a regex has a capturing group, only
# group 1 is replaced and the surrounding keyword is kept as a hint; without
# a group the whole match is replaced.
PATTERNS: list[tuple[str, Pattern[str]]] = [
    # ── Provider-specific tokens (highly specific, anchor on known prefixes) ──
    ("anthropic_key",   re.compile(r"sk-ant-[A-Za-z0-9_\-]{20,}")),
    ("openai_key",      re.compile(r"sk-(?:proj-)?[A-Za-z0-9_\-]{20,}")),
    ("github_pat",      re.compile(r"gh[pousr]_[A-Za-z0-9]{20,}")),
    # NOTE: despite the label, this entry matches the `github_pat_` prefix.
    ("github_oauth",    re.compile(r"github_pat_[A-Za-z0-9_]{20,}")),
    ("slack_token",     re.compile(r"xox[abprs]-[A-Za-z0-9\-]{10,}")),
    ("stripe_secret",   re.compile(r"(?:sk|rk)_(?:live|test)_[A-Za-z0-9]{20,}")),
    ("stripe_public",   re.compile(r"pk_(?:live|test)_[A-Za-z0-9]{20,}")),
    ("npm_token",       re.compile(r"npm_[A-Za-z0-9]{30,}")),
    ("aws_access_key",  re.compile(r"\b(?:AKIA|ASIA|AROA|AIDA|AGPA|ANPA|ANVA|ABIA)[A-Z0-9]{16}\b")),
    # AKIA-prefixed ids are already claimed by aws_access_key above, so with
    # this ordering the entry below never fires. It is retained so the set of
    # labels derived from this table stays unchanged.
    ("aws_access_key_id", re.compile(r"\bAKIA[0-9A-Z]{16}\b")),
    ("google_api_key",  re.compile(r"\bAIza[0-9A-Za-z_\-]{30,50}\b")),
    ("gitlab_pat",      re.compile(r"\bglpat-[A-Za-z0-9_\-]{20,}")),
    ("huggingface_token", re.compile(r"\bhf_[A-Za-z0-9]{30,}")),
    ("twilio_sk",       re.compile(r"\bSK[0-9a-f]{32}\b")),
    ("twilio_ac",       re.compile(r"\bAC[0-9a-f]{32}\b")),
    ("telegram_bot_token", re.compile(r"\b\d{6,12}:[A-Za-z0-9_\-]{35}\b")),
    # Connection strings with embedded credentials (run before basic_auth_url
    # so the more specific scheme list claims it first). The user part may be
    # empty: password-only URLs such as "redis://:password@host" are common
    # and must be redacted too.
    ("connection_string_credentials", re.compile(
        r"\b(?:mongodb(?:\+srv)?|postgres(?:ql)?|mysql|redis|amqp)://[^\s/@:]*:[^\s/@]+@[^\s]+"
    )),
    # Basic-auth URL: [user]:password@host — require a real ":pw@" segment so
    # normal URLs without credentials (including host:port URLs) pass through
    # unchanged. The scheme is restricted to RFC 3986 shape (letter then
    # letter/digit/+/-/.). As above, an empty user part is accepted.
    ("basic_auth_url", re.compile(
        r"\b[a-z][a-z0-9+\-.]*://[^\s/@:]*:[^\s/@]+@[^\s]+",
        re.I,
    )),
    # ── Cryptographic blocks (multiline) ─────────────────────────────────────
    ("private_key_block", re.compile(
        r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP |ENCRYPTED |)PRIVATE KEY-----"
        r".*?-----END (?:RSA |DSA |EC |OPENSSH |PGP |ENCRYPTED |)PRIVATE KEY-----",
        re.DOTALL,
    )),
    # Escaped-newline PEM blocks embedded in JSON (e.g. service-account JSON).
    # The literal characters in the source are: backslash, 'n'. Matches both
    # single-escaped ("\n" inside JSON) and double-escaped ("\\n" inside
    # JSON-encoded-in-JSON) renderings.
    ("private_key_block_escaped", re.compile(
        r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP |ENCRYPTED |)PRIVATE KEY-----"
        r"(?:\\+n).*?(?:\\+n)"
        r"-----END (?:RSA |DSA |EC |OPENSSH |PGP |ENCRYPTED |)PRIVATE KEY-----",
        re.DOTALL,
    )),
    # ── Azure storage account keys ───────────────────────────────────────────
    # 88-char base64 ending with `==`, no fixed prefix. Anchored with negative
    # lookbehind/lookahead on the base64 alphabet and a length-exact body so
    # longer base64 runs and common base64 prose don't trigger.
    ("azure_storage_key", re.compile(
        r"(?<![A-Za-z0-9+/=])[A-Za-z0-9+/]{86}==(?![A-Za-z0-9+/=])"
    )),
    # ── JWT-shaped tokens (three base64url segments) ─────────────────────────
    ("jwt", re.compile(r"\beyJ[A-Za-z0-9_\-]{10,}\.eyJ[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\b")),
    # ── Bearer / Authorization headers ───────────────────────────────────────
    # Charset includes `+` and `/` so full base64 bearer tokens are redacted.
    ("bearer_token", re.compile(
        r"(?i)\b(?:bearer|authorization)\s*[:=]?\s*[\"']?([A-Za-z0-9_\-\.=+/]{20,})[\"']?"
    )),
    # ── AWS secret key shape (labeled, more specific than generic) ───────────
    # Runs before generic_secret_assignment so the AWS-specific shape claims it.
    ("aws_secret_key", re.compile(
        r"(?i)aws[_\-]?secret[_\-]?access[_\-]?key\s*[:=]\s*[\"']?([A-Za-z0-9/+=]{30,})[\"']?"
    )),
    # ── Generic assignment patterns (catch-all, run after specific ones) ─────
    # Matches: password=foo  password="foo"  PASSWORD: foo  api_key='foo'  etc.
    ("generic_secret_assignment", re.compile(
        r"(?ix)"
        r"\b(?:password|passwd|pwd|secret|token|api[_\-]?key|access[_\-]?key|"
        r"client[_\-]?secret|auth[_\-]?token|private[_\-]?key|credential)\b"
        r"\s*[:=]\s*"
        r"[\"']?([^\s\"'<>;,]{6,})[\"']?"
    )),
    # Multi-line JSON form: {"api_key":\n  "abc..."} — colon, then
    # arbitrary whitespace/newlines, then a quoted value on the next line.
    ("generic_secret_assignment_multiline", re.compile(
        r"(?ix)"
        r"\"(?:password|passwd|pwd|secret|token|api[_\-]?key|access[_\-]?key|"
        r"client[_\-]?secret|auth[_\-]?token|private[_\-]?key|credential)\""
        r"\s*:\s*\n\s*"
        r"\"([^\"\n]{6,})\""
    )),
    # ── .env-style standalone lines ──────────────────────────────────────────
    # An UPPER_CASE name (4+ chars) assigned a value of 12+ characters that
    # occupies the rest of the line.
    ("env_assignment", re.compile(
        r"(?m)^\s*[A-Z][A-Z0-9_]{3,}\s*=\s*[\"']?([^\s\"'#]{12,})[\"']?\s*$"
    )),
]

# Last-resort backstop: a long token that directly follows a sensitive keyword
# and an `=` sign. Kept deliberately narrow so ordinary prose is not mangled,
# and applied only after every labeled pattern has run.
# Python's re has no variable-width lookbehind, so the keyword is matched as
# part of the expression and the value is isolated in capturing group 1; the
# caller redacts only that group.
CONTEXTUAL_BACKSTOP = re.compile(
    r"(?i)(?:key|secret|token|password)"   # sensitive keyword, any case
    r"\s*=\s*"                             # assignment with optional spacing
    r"([A-Za-z0-9_\-\.=/+]{12,})"          # group 1: the value to redact
)

# Tool payload JSON in session transcripts can carry secrets as a
# {"name": "<SECRET-ish name>", "text": "<value>"} pair, sometimes with the
# quotes backslash-escaped. The patterns live here so every caller shares one
# scrubber. Each entry is (label, regex, replacement template); the template
# keeps groups 1 and 2 (the text around the value) and swaps out the value.
#   1st source: quotes optionally preceded by a backslash; value has no
#               quote or backslash.
#   2nd source: plain quotes only; value may contain backslashes.
SECRET_PAYLOAD_PATTERNS: list[tuple[str, Pattern[str], str]] = [
    ("secret_payload", re.compile(source, re.IGNORECASE), r"\1[REDACTED:secret_payload]\2")
    for source in (
        r'(\\?"name\\?"\s*:\s*\\?"[^"]*(?:SECRET|TOKEN|API[_-]?KEY|PASSWORD|PRIVATE[_-]?KEY)[^"]*\\?"\s*,\s*\\?"text\\?"\s*:\s*\\?")[^"\\]+(\\?")',
        r'("name"\s*:\s*"[^"]*(?:SECRET|TOKEN|API[_-]?KEY|PASSWORD|PRIVATE[_-]?KEY)[^"]*"\s*,\s*"text"\s*:\s*")[^"]+(")',
    )
]

# A whole line (case-insensitive, one match per line) that mentions any of the
# listed sensitive keywords or command fragments.
SENSITIVE_LINE_RE = re.compile(
    r"^.*(?:SECRET|TOKEN|API[_-]?KEY|PASSWORD|PRIVATE[_-]?KEY"
    r"|secret put|Authorization:\s*Token token=|tool_use_error|Bash\(printf).*$",
    flags=re.IGNORECASE | re.MULTILINE,
)

# Canonical 8-4-4-4-12 hexadecimal UUID.
UUID_RE = re.compile(
    r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b",
    flags=re.IGNORECASE,
)

# UUID whose last group was cut short and terminated with an ellipsis (…).
TRUNCATED_UUID_RE = re.compile(
    r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{1,12}…",
    flags=re.IGNORECASE,
)

# Labels reported in addition to the ones taken from the main pattern table.
SUPPLEMENTAL_PATTERN_LABELS = ("secret_payload", "secret_uuid", "uuid", "truncated_uuid")


# A capture consisting solely of one or more redaction markers, optionally
# followed by stray closing quotes — i.e. text that is already fully scrubbed.
_ALREADY_REDACTED_RE = re.compile(r"(?:\[REDACTED:[^\[\]]+\])+[\"']*")


def _replace_factory(label: str):
    """Return a re.sub callback that replaces a match with [REDACTED:label].

    For patterns that have a capturing group (e.g. generic_secret_assignment),
    only group 1 is redacted so the surrounding keyword stays as a hint. When
    the pattern has no group, or group 1 did not participate in the match, the
    entire match is replaced.

    Args:
        label: Kind name embedded in the marker.

    Returns:
        A callable taking a ``re.Match`` and returning the replacement text.
    """
    marker = f"[REDACTED:{label}]"

    def _replace(match: re.Match) -> str:
        if not match.groups() or match.group(1) is None:
            return marker
        captured = match.group(1)
        # Don't re-redact an already-redacted marker — preserve the more
        # specific label from whichever pattern fired first. The capture must
        # be markers and nothing else: checking only its first and last
        # characters would let "[REDACTED:a]<secret>[REDACTED:b]" through with
        # the middle part intact.
        if _ALREADY_REDACTED_RE.fullmatch(captured):
            return match.group(0)
        start, end = match.span(1)
        prefix = match.string[match.start():start]
        suffix = match.string[end:match.end()]
        return f"{prefix}{marker}{suffix}"

    return _replace


def scrub(text: str, patterns: Iterable[tuple[str, Pattern[str]]] = PATTERNS) -> str:
    """Return *text* with every recognised secret replaced by a marker.

    Passes run in a fixed order, each on the output of the previous one:

    1. tool-payload name/text pairs (SECRET_PAYLOAD_PATTERNS);
    2. each (label, regex) pair from *patterns*, in iteration order;
    3. the keyword-plus-long-token backstop (CONTEXTUAL_BACKSTOP);
    4. UUIDs, then truncated UUIDs, on lines matched by SENSITIVE_LINE_RE;
    5. a trailing ``=value`` / ``: value`` on sensitive lines that carry no
       marker yet, whatever its length;
    6. every remaining UUID and truncated UUID.

    Args:
        text: Raw text to scrub.
        patterns: Labeled regexes for step 2; defaults to PATTERNS.

    Returns:
        The scrubbed text.
    """

    def redact_contextual(hit: re.Match) -> str:
        # Redact only the value (group 1); leave the keyword and "=" in place.
        value = hit.group(1)
        if value.startswith("[REDACTED:") and value.endswith("]"):
            return hit.group(0)
        value_start, value_end = hit.span(1)
        return (
            hit.string[hit.start():value_start]
            + "[REDACTED:contextual]"
            + hit.string[value_end:hit.end()]
        )

    def redact_short_value(hit: re.Match) -> str:
        # Short-token gap: a sensitive line on which nothing else fired gets
        # its trailing value scrubbed regardless of length, so short tokens on
        # obviously-sensitive lines do not leak.
        line = hit.group(0)
        if "[REDACTED" in line:
            return line
        # Try the "=value" tail first; fall back to ": value".
        for tail in (r"(=)\s*([^\s\"'#]+)\s*$", r"(:\s)([^\s\"'#]+)\s*$"):
            candidate = re.sub(tail, r"\1[REDACTED:sensitive_line]", line)
            if candidate != line:
                return candidate
        return line

    result = text
    for _name, payload_re, template in SECRET_PAYLOAD_PATTERNS:
        result = payload_re.sub(template, result)
    for name, regex in patterns:
        result = regex.sub(_replace_factory(name), result)

    # The contextual backstop runs after all labeled patterns.
    result = CONTEXTUAL_BACKSTOP.sub(redact_contextual, result)

    # UUIDs on sensitive lines get their own label: full UUIDs first, then
    # truncated ones, each as a separate pass over the text.
    for uuid_re in (UUID_RE, TRUNCATED_UUID_RE):
        result = SENSITIVE_LINE_RE.sub(
            lambda hit, uuid_re=uuid_re: uuid_re.sub("[REDACTED:secret_uuid]", hit.group(0)),
            result,
        )

    result = SENSITIVE_LINE_RE.sub(redact_short_value, result)

    # Whatever UUIDs are left anywhere in the text.
    for uuid_re in (UUID_RE, TRUNCATED_UUID_RE):
        result = uuid_re.sub("[REDACTED:uuid]", result)
    return result


def pattern_labels() -> list[str]:
    """Return the pattern labels advertised by the CLI.

    The list holds the PATTERNS labels in table order, followed by
    "contextual_backstop" and the entries of SUPPLEMENTAL_PATTERN_LABELS.
    """
    return [
        *(name for name, _regex in PATTERNS),
        "contextual_backstop",
        *SUPPLEMENTAL_PATTERN_LABELS,
    ]


def _read_input(path: str | None) -> str:
    if path is None or path == "-":
        return sys.stdin.read()
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except OSError as exc:
        print(f"error: cannot read {path}: {exc}", file=sys.stderr)
        sys.exit(1)


def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface and return the process exit status.

    With ``--list-patterns`` the pattern labels are printed one per line.
    Otherwise the input (file argument, or stdin when the argument is omitted
    or ``-``) is scrubbed and written to stdout.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        0 in both modes.
    """
    covered = ", ".join(pattern_labels())
    cli = argparse.ArgumentParser(
        prog="scrub_secrets.py",
        description="Deterministic secret scrubber for the agent-learning-compounder skill.",
        epilog=f"Patterns covered: {covered}.",
    )
    cli.add_argument(
        "path",
        nargs="?",
        default=None,
        help="File to scrub. If omitted or '-', read from stdin.",
    )
    cli.add_argument(
        "--list-patterns",
        action="store_true",
        help="Print the list of pattern labels and exit.",
    )
    options = cli.parse_args(argv)

    if not options.list_patterns:
        sys.stdout.write(scrub(_read_input(options.path)))
        return 0

    # The label list is never empty, so this prints exactly one label per line.
    print(*pattern_labels(), sep="\n")
    return 0


# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
