#!/usr/bin/env python3
"""Store large sanitized command output outside Claude context and query slices later."""
from __future__ import annotations

import argparse
import hashlib
import importlib.machinery
import importlib.util
import json
import os
from pathlib import Path
import re
import stat
import sys
import time
from typing import Iterable

# Directory where `store` writes new artifacts (also the `--dir` default).
DEFAULT_ARTIFACT_DIR = ".context-guard/artifacts"
# Previous default directory; only consulted as a read/list fallback.
LEGACY_ARTIFACT_DIR = ".claude-token-optimizer/artifacts"
# Default and hard ceiling for the number of stdin bytes `store` reads.
DEFAULT_MAX_BYTES = 10_000_000
MAX_MAX_BYTES = 100_000_000
# Largest metadata (.json) file that the read paths accept as trusted.
MAX_METADATA_BYTES = 64_000
# Query defaults: returned-line cap, returned-char cap, and the hard line ceiling.
DEFAULT_MAX_LINES = 80
DEFAULT_MAX_CHARS = 20_000
MAX_QUERY_LINES = 5_000
# Default per-line character cap applied by cap_line/compact_items.
MAX_LINE_CHARS = 2_000
# Character and UTF-8 byte caps applied to every digest text entry.
MAX_DIGEST_TEXT_CHARS = 360
MAX_DIGEST_TEXT_BYTES = 512
# UTF-8 byte cap for the sanitized one-line command preview.
MAX_COMMAND_PREVIEW_BYTES = 2_048
# Upper bounds on the number of entries kept in digest/receipt lists.
MAX_TOP_ERROR_RECEIPTS = 12
MAX_DUPLICATE_GROUPS = 12
MAX_SUGGESTED_QUERIES = 12
# Artifact ids are 16-64 lowercase hexadecimal characters.
ARTIFACT_ID_RE = re.compile(r"^[a-f0-9]{16,64}$")
# First path components that are tolerated as symlinks, but only when they
# point at exactly the listed target (presumably the macOS /tmp and /var
# layout -- confirm before relying on this for other platforms).
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}
# Case-insensitive markers that flag a line as error-like for digests and
# pattern hints.
ERROR_RE = re.compile(
    r"(FAIL|FAILED|ERROR|Error:|Exception|Traceback|AssertionError|panic:|fatal:|"
    r"segmentation fault|not ok|\bE\s+assert|\[ERROR\]|✗|✖)",
    re.IGNORECASE,
)
# Secret-looking values redacted by FallbackLineSanitizer. The inner capture
# group holds the `key=` / `key:` prefix so it can be kept while the value is
# replaced.
SECRET_VALUE_RE = re.compile(
    r"(?i)(Bearer\s+\S+|Basic\s+\S+|gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"github_pat_[A-Za-z0-9_]{20,}|xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"([A-Za-z0-9_.-]*(?:api[_-]?key|token|secret|password|passwd|pwd)[A-Za-z0-9_.-]*\s*[:=]\s*)\S+)"
)


def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
    """Coerce *value* to an int clamped into ``[minimum, maximum]``.

    Returns *default* (unclamped) when ``int(value)`` raises ``TypeError``,
    ``ValueError`` or ``OverflowError``.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    # Raise to the floor first, then lower to the ceiling.
    floored = minimum if parsed < minimum else parsed
    return maximum if floored > maximum else floored


def cap_line(line: str, limit: int = MAX_LINE_CHARS) -> str:
    """Return *line* trimmed to at most *limit* characters.

    Lines within the limit are returned unchanged. Longer lines keep a prefix
    followed by a ``...[line trimmed: N chars]`` marker. If the marker itself
    does not fit, the marker is truncated so the result never exceeds *limit*
    (mirroring how ``cap_utf8_bytes`` handles a too-small byte budget).
    """
    if len(line) <= limit:
        return line
    marker = f"...[line trimmed: {len(line)} chars]"
    if len(marker) >= limit:
        # No room for any original text; honor the cap over marker integrity.
        return marker[: max(0, limit)]
    return line[: limit - len(marker)] + marker


def cap_utf8_bytes(text: str, limit: int) -> str:
    """Trim *text* so its UTF-8 encoding fits within *limit* bytes.

    Text already within the budget is returned unchanged. Otherwise whole
    characters are kept (never a partial code point) and a marker reporting
    the original character and byte counts is appended. When the marker alone
    does not fit, only a byte-truncated marker is returned.
    """
    total_bytes = len(text.encode("utf-8", errors="replace"))
    if total_bytes <= limit:
        return text
    marker = f"...[line trimmed: {len(text)} chars/{total_bytes} bytes]"
    marker_raw = marker.encode("utf-8")
    if len(marker_raw) >= limit:
        return marker_raw[:limit].decode("utf-8", errors="ignore")
    budget = limit - len(marker_raw)
    kept_chars = 0
    consumed = 0
    for ch in text:
        width = len(ch.encode("utf-8", errors="replace"))
        if consumed + width > budget:
            break
        consumed += width
        kept_chars += 1
    return text[:kept_chars] + marker


def cap_digest_text(text: str) -> str:
    """Apply the digest character cap, then the digest UTF-8 byte cap, to *text*."""
    char_capped = cap_line(text, limit=MAX_DIGEST_TEXT_CHARS)
    return cap_utf8_bytes(char_capped, MAX_DIGEST_TEXT_BYTES)


def normalized_link_target(parent: Path, raw_target: str) -> Path:
    """Return the lexically normalized path a symlink target refers to.

    A relative *raw_target* is interpreted against *parent*. Normalization is
    purely textual (``os.path.normpath``); the filesystem is not consulted.
    """
    candidate = Path(raw_target)
    anchored = candidate if candidate.is_absolute() else parent / candidate
    return Path(os.path.normpath(str(anchored)))


def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed leading symlink component to its real target.

    Only absolute paths whose first component appears in
    ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS`` are considered, and only when that
    component is currently a symlink pointing at exactly the expected target.
    In every other case, including any ``OSError`` while inspecting the link,
    *path* is returned unchanged.
    """
    parts = path.parts
    if len(parts) < 2 or not path.is_absolute():
        return path
    head = parts[1]
    canonical = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(head)
    if canonical is None:
        return path
    root = Path(path.anchor)
    link = root / head
    try:
        rewrite = stat.S_ISLNK(os.lstat(link).st_mode) and (
            normalized_link_target(root, os.readlink(link)) == canonical
        )
    except OSError:
        return path
    return canonical.joinpath(*parts[2:]) if rewrite else path


def compact_items(lines: Iterable[str], *, limit: int, max_chars: int = MAX_LINE_CHARS, max_bytes: int | None = None) -> list[str]:
    """Collect unique, non-empty, size-capped lines in first-seen order.

    Each line is stripped, capped to *max_chars* characters and, when
    *max_bytes* is given, to that many UTF-8 bytes. Iteration stops as soon as
    the collected list reaches *limit* entries.
    """
    kept: list[str] = []
    emitted: set[str] = set()
    for raw in lines:
        candidate = cap_line(raw.strip(), limit=max_chars)
        if max_bytes is not None:
            candidate = cap_utf8_bytes(candidate, max_bytes)
        if candidate and candidate not in emitted:
            emitted.add(candidate)
            kept.append(candidate)
            if len(kept) >= limit:
                break
    return kept


class FallbackLineSanitizer:
    """Regex-only line sanitizer driven by ``SECRET_VALUE_RE``.

    ``redactions`` counts the lines in which at least one match was replaced.
    ``show_paths`` is stored but not consulted by this class.
    """

    def __init__(self, *, show_paths: bool = False) -> None:
        self.show_paths = show_paths
        self.redactions = 0

    @staticmethod
    def _mask(match: re.Match[str]) -> str:
        """Return the replacement text for one secret match."""
        captured = match.groups()
        # The second group, when present and non-empty, is the `key=` prefix
        # that is preserved in front of the redaction marker.
        prefix = captured[1] if len(captured) >= 2 else None
        if prefix:
            return prefix + "[REDACTED]"
        return "[REDACTED]"

    def sanitize(self, raw_line: str) -> tuple[str, bool]:
        """Return ``(sanitized_line, did_redact)`` for *raw_line*."""
        cleaned, hits = SECRET_VALUE_RE.subn(self._mask, raw_line)
        changed = bool(hits)
        if changed:
            self.redactions += 1
        return cleaned, changed


def load_line_sanitizer(show_paths: bool) -> object:
    """Return a line sanitizer, preferring a sibling sanitizer script.

    The first existing candidate file next to this script is executed as a
    module and its ``LineSanitizer`` is instantiated. Any failure while
    loading that candidate raises ``RuntimeError``; later candidates are not
    tried. When no candidate exists, ``FallbackLineSanitizer`` is used.
    """
    here = Path(__file__).resolve().parent
    candidate_names = ("sanitize_output.py", "context-guard-sanitize-output", "claude-sanitize-output")
    script = next((here / name for name in candidate_names if (here / name).exists()), None)
    if script is None:
        return FallbackLineSanitizer(show_paths=show_paths)
    try:
        # Explicit source loader, so candidates without a .py suffix load too.
        loader = importlib.machinery.SourceFileLoader(f"_claude_token_sanitize_{os.getpid()}", str(script))
        spec = importlib.util.spec_from_loader(loader.name, loader)
        if spec is None:
            raise RuntimeError("import spec unavailable")
        module = importlib.util.module_from_spec(spec)
        loader.exec_module(module)
        return module.LineSanitizer(show_paths=show_paths)
    except Exception as exc:
        raise RuntimeError(f"could not load sanitizer {script}: {exc}") from exc


def sanitize_text(text: str, *, show_paths: bool = False) -> tuple[str, int]:
    """Sanitize *text* line by line.

    Returns the sanitized text (line endings preserved) and the number of
    lines for which the sanitizer reported a redaction.
    """
    sanitizer = load_line_sanitizer(show_paths)
    pieces: list[str] = []
    redacted_count = 0
    for chunk in text.splitlines(keepends=True):
        cleaned, flagged = sanitizer.sanitize(chunk)  # type: ignore[attr-defined]
        pieces.append(cleaned)
        redacted_count += 1 if flagged else 0
    return "".join(pieces), redacted_count


def sanitize_one_line(text: str, *, show_paths: bool = False) -> str:
    """Sanitize *text*, collapse whitespace runs to single spaces, and cap it.

    The result is capped by ``cap_line`` (default limit) and then to
    ``MAX_COMMAND_PREVIEW_BYTES`` UTF-8 bytes.
    """
    cleaned, _redacted = sanitize_text(text + "\n", show_paths=show_paths)
    collapsed = " ".join(cleaned.strip().split())
    return cap_utf8_bytes(cap_line(collapsed), MAX_COMMAND_PREVIEW_BYTES)


def ensure_private_dir(path: Path) -> None:
    """Create *path* (with parents) and try to restrict it to mode 0o700.

    Symlink components are rejected both before and after creation. A failed
    ``chmod`` is ignored, so the permission tightening is best-effort.
    """
    target = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(target)
    target.mkdir(parents=True, exist_ok=True)
    # Re-check after mkdir in case a component changed in between.
    reject_symlink_components(target)
    try:
        os.chmod(target, 0o700)
    except OSError:
        # Deliberately tolerated; see docstring.
        pass


def reject_symlink_components(path: Path) -> None:
    """Raise ``RuntimeError`` if any existing component of *path* is unsafe.

    Components are checked from the root downwards. A symlink anywhere, or a
    non-directory that is not the final component, is rejected. The walk stops
    silently at the first component that does not exist.
    """
    resolved = normalize_allowed_first_absolute_symlink(path)
    absolute = resolved.is_absolute()
    anchor = resolved.anchor
    cursor = Path(anchor) if absolute else Path()
    components = [part for part in resolved.parts if not (absolute and part == anchor)]
    for component in components:
        cursor = cursor / component
        try:
            mode = os.lstat(cursor).st_mode
        except FileNotFoundError:
            return
        if stat.S_ISLNK(mode):
            raise RuntimeError(f"refusing artifact path with symlink component: {cursor}")
        is_final = cursor == resolved
        if not is_final and not stat.S_ISDIR(mode):
            raise RuntimeError(f"refusing artifact path through non-directory component: {cursor}")


def regular_private_file_size(path: Path) -> int:
    """Return the size in bytes of *path*, requiring a regular non-symlink file.

    The parent directory chain is checked for symlink components first.
    Raises ``ValueError`` when *path* is a symlink or not a regular file;
    ``os.lstat`` errors (such as ``FileNotFoundError``) propagate.
    """
    target = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(target.parent)
    info = os.lstat(target)
    mode = info.st_mode
    if stat.S_ISLNK(mode):
        raise ValueError(f"artifact file must not be a symlink: {target.name}")
    if not stat.S_ISREG(mode):
        raise ValueError(f"artifact file must be a regular file: {target.name}")
    return int(info.st_size)


def read_bounded_private_text(path: Path, max_bytes: int) -> str:
    """Read a regular, non-symlink file of at most *max_bytes* bytes as text.

    The size is checked before opening (``lstat``), again on the opened
    descriptor (``fstat``), and finally against the bytes actually read, so a
    file that grows between checks is still rejected. Bytes are decoded as
    UTF-8 with undecodable sequences replaced.

    Raises:
        ValueError: the file is not a regular file or exceeds *max_bytes*.
        OSError: the file cannot be inspected or opened.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    size = regular_private_file_size(path)
    if size > max_bytes:
        raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: {size} > {max_bytes}")
    # O_NOFOLLOW (where available) refuses a symlink swapped in after lstat.
    flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
    fd = os.open(str(path), flags)
    try:
        st = os.fstat(fd)
        if not stat.S_ISREG(st.st_mode):
            raise ValueError(f"artifact file must be a regular file: {path.name}")
        if st.st_size > max_bytes:
            raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: {st.st_size} > {max_bytes}")
        # os.read may return fewer bytes than requested, so keep reading until
        # EOF or until one byte past the cap has been collected.
        chunks: list[bytes] = []
        remaining = max_bytes + 1
        while remaining > 0:
            chunk = os.read(fd, remaining)
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
        data = b"".join(chunks)
        if len(data) > max_bytes:
            raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: > {max_bytes}")
        return data.decode("utf-8", errors="replace")
    finally:
        os.close(fd)


def write_private_text(path: Path, text: str) -> None:
    """Write *text* to *path* via an exclusive temp file and ``os.replace``.

    The temp file is created with mode 0o600 next to the destination. On any
    failure while writing or replacing, the temp file is removed and the error
    re-raised. The final ``chmod`` to 0o600 is best-effort.
    """
    destination = normalize_allowed_first_absolute_symlink(path)
    ensure_private_dir(destination.parent)
    staging = destination.with_name(destination.name + f".tmp-{os.getpid()}-{time.time_ns()}")

    def discard_staging() -> None:
        try:
            staging.unlink()
        except FileNotFoundError:
            pass

    # O_EXCL refuses a pre-existing temp name; O_NOFOLLOW is used where available.
    open_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
    descriptor = os.open(str(staging), open_flags, 0o600)
    try:
        with os.fdopen(descriptor, "w", encoding="utf-8", newline="") as stream:
            stream.write(text)
        os.replace(staging, destination)
    except Exception:
        discard_staging()
        raise
    try:
        os.chmod(destination, 0o600)
    except OSError:
        pass


def read_bounded_stdin(max_bytes: int) -> tuple[str, bool, int]:
    """Read at most *max_bytes* bytes from binary stdin.

    Returns ``(text, truncated, byte_count)``. One extra byte is requested so
    that input exceeding the cap can be detected. ``byte_count`` is the number
    of bytes kept after truncation. Decoding is UTF-8 with replacement.
    """
    raw = sys.stdin.buffer.read(max_bytes + 1)
    overflowed = len(raw) > max_bytes
    payload = raw[:max_bytes] if overflowed else raw
    return payload.decode("utf-8", errors="replace"), overflowed, len(payload)


def artifact_paths(directory: Path, artifact_id: str) -> tuple[Path, Path]:
    """Return ``(content_path, metadata_path)`` for *artifact_id* in *directory*.

    Raises ``ValueError`` when the id does not match ``ARTIFACT_ID_RE``.
    """
    if ARTIFACT_ID_RE.fullmatch(artifact_id) is None:
        raise ValueError("artifact id must be 16-64 lowercase hex chars")
    base = normalize_allowed_first_absolute_symlink(directory)
    content_file = base / f"{artifact_id}.txt"
    metadata_file = base / f"{artifact_id}.json"
    return content_file, metadata_file


def artifact_read_directories(raw_dir: str) -> list[Path]:
    """Return the directories to search when reading or listing artifacts.

    The requested directory always comes first. Only when it is the default
    artifact directory is the legacy default appended as a fallback, so
    receipts created under the old `.claude-token-optimizer` location still
    resolve. Stores are unaffected and use only the requested directory.
    """
    requested = Path(raw_dir).expanduser()
    primary = normalize_allowed_first_absolute_symlink(requested)
    if requested != Path(DEFAULT_ARTIFACT_DIR):
        return [primary]
    legacy = normalize_allowed_first_absolute_symlink(Path(LEGACY_ARTIFACT_DIR).expanduser())
    if legacy == primary:
        return [primary]
    return [primary, legacy]


# Every label that classify_content_type can return.
CONTENT_TYPE_VALUES = ("json", "diff", "log", "search", "code", "prose", "text")
# Recommended retrieval strategy per content type. Pattern-oriented payloads
# (logs, search hits, diffs) are best sliced by `--pattern`; structured or
# narrative payloads (json, code, prose) read best by `--lines`. Unknown/empty
# content falls back to a bounded `head` read.
STRATEGY_BY_CONTENT_TYPE = {
    "json": "lines",
    "code": "lines",
    "prose": "lines",
    "diff": "pattern",
    "log": "pattern",
    "search": "pattern",
    "text": "head",
}
# Line starting with a `path:line:` prefix (no whitespace or colon in the path).
_SEARCH_HIT_RE = re.compile(r"^[^\s:]+:\d+:")
# Line starting with a `YYYY-MM-DD HH:MM` style timestamp or a log level,
# either bracketed or bare; matched case-insensitively.
_LOG_LINE_RE = re.compile(
    r"^(\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}|"
    r"\[(?:DEBUG|INFO|WARN|WARNING|ERROR|FATAL|TRACE)\]|"
    r"(?:DEBUG|INFO|WARN|WARNING|ERROR|FATAL|TRACE)\b)",
    re.IGNORECASE,
)
# Line that begins (after optional indentation) with a common declaration or
# control-flow keyword from several programming languages.
_CODE_LINE_RE = re.compile(
    r"^\s*(def |class |import |from \S+ import |function |const |let |var |"
    r"public |private |protected |#include|package |func |fn |impl |"
    r"return\b|if\s*\(|for\s*\(|while\s*\()"
)


def classify_content_type(text: str) -> str:
    """Guess which CONTENT_TYPE_VALUES label best describes *text*.

    The guess is advisory, dependency-free and deterministic. Checks run in a
    fixed order: empty ("text"), parseable JSON, diff markers, log lines,
    search hits, code heuristics, and finally "prose". A wrong guess only
    changes which retrieval hint is recommended.
    """
    body = text.strip()
    if not body:
        return "text"
    if body[0] in "{[":
        try:
            json.loads(body)
        except (ValueError, RecursionError):
            pass
        else:
            return "json"
    rows = body.splitlines()
    total = len(rows)
    threshold = max(1, total // 2)

    diff_markers = ("diff --git ", "@@ ", "+++ ", "--- ", "index ")
    diff_count = len([row for row in rows if row.startswith(diff_markers)])
    if diff_count:
        opens_like_diff = rows[0].startswith(("diff --git ", "--- ", "@@ "))
        if opens_like_diff or diff_count >= 2:
            return "diff"

    # Log is tested before search: timestamps (HH:MM:SS) and bracketed levels
    # can superficially resemble the `path:line:` search shape.
    log_count = len([row for row in rows if _LOG_LINE_RE.match(row)])
    if log_count >= threshold:
        return "log"
    search_count = len([row for row in rows if _SEARCH_HIT_RE.match(row)])
    if search_count >= threshold:
        return "search"

    keyword_rows = len([row for row in rows if _CODE_LINE_RE.match(row)])
    terminator_rows = len([row for row in rows if row.rstrip().endswith(("{", "}", ";", "):"))])
    if keyword_rows >= 2:
        return "code"
    if keyword_rows >= 1 and terminator_rows >= max(2, total // 3):
        return "code"
    return "prose"


def recommended_strategy(content_type: str) -> str:
    """Return the advisory retrieval strategy for *content_type* ("head" if unknown)."""
    try:
        return STRATEGY_BY_CONTENT_TYPE[content_type]
    except KeyError:
        return "head"


def first_error_anchor(text: str) -> str | None:
    """Return the first literal error token in text for a pattern hint, or None.

    The returned token is taken verbatim from ERROR_RE's match, so it is
    guaranteed to be an exact substring of the stored content. This makes the
    derived `--pattern` retrieval hint deterministic and exactly round-trippable.
    """
    for line in text.splitlines():
        match = ERROR_RE.search(line)
        if match:
            token = match.group(0).strip()
            if token:
                return token
    return None


def build_retrieval_hints(
    artifact_id: str,
    sanitized_text: str,
    *,
    content_type: str,
    strategy: str,
    total_lines: int,
) -> list[dict[str, object]]:
    """Assemble the machine-readable retrieval hints stored with an artifact.

    Hints are emitted in a fixed order: a line-range hint (only when there is
    at least one line), a pattern hint (only when an error token is found),
    and always a head hint. Each hint carries a ``selector`` plus the matching
    ``cli`` invocation. The range hint covers the whole artifact when it fits
    ``MAX_QUERY_LINES``; otherwise it covers only the first chunk and says so.
    ``content_type`` and ``strategy`` are accepted but not used here.
    """
    hints: list[dict[str, object]] = []

    if total_lines >= 1:
        fits_query_cap = total_lines <= MAX_QUERY_LINES
        last_line = min(total_lines, MAX_QUERY_LINES)
        range_hint: dict[str, object] = {
            "type": "lines",
            "selector": {"start": 1, "end": last_line},
            "cli": line_query_cli(artifact_id, 1, last_line),
            "exact": fits_query_cap,
        }
        if last_line > DEFAULT_MAX_LINES:
            # The range exceeds the default returned-line cap, so the
            # suggested CLI must raise it explicitly.
            range_hint["max_lines"] = last_line
            range_hint["max_lines_required"] = True
            range_hint["note"] = (
                "`--max-lines` in this suggested query is only the returned-line cap for the selected "
                "`--lines` range; the explicit line range remains the selector."
            )
        if not fits_query_cap:
            # Replaces the note above: only the first chunk is advertised.
            range_hint["note"] = (
                f"first {MAX_QUERY_LINES} lines only; request later ranges for the full artifact. "
                "`--max-lines` is only the returned-line cap for the selected range."
            )
            range_hint["total_lines"] = total_lines
        hints.append(range_hint)

    token = first_error_anchor(sanitized_text)
    if token is not None:
        pattern_hint: dict[str, object] = {
            "type": "pattern",
            "selector": {"pattern": token},
            "cli": f"context-guard-artifact get {artifact_id} --pattern '{token}'",
        }
        hints.append(pattern_hint)

    head_hint: dict[str, object] = {
        "type": "head",
        "selector": {"max_lines": DEFAULT_MAX_LINES},
        "cli": f"context-guard-artifact get {artifact_id} --max-lines {DEFAULT_MAX_LINES}",
    }
    hints.append(head_hint)
    return hints


def line_query_cli(artifact_id: str, start: int, end: int) -> str:
    """Return the ``get --lines START:END`` command for an inclusive range.

    ``--max-lines`` is appended only when the range is longer than the default
    returned-line cap, and is itself limited to ``MAX_QUERY_LINES``.
    """
    span = end - start + 1
    base = f"context-guard-artifact get {artifact_id} --lines {start}:{end}"
    if span <= DEFAULT_MAX_LINES:
        return base
    return base + f" --max-lines {min(span, MAX_QUERY_LINES)}"


def line_receipt(artifact_id: str, line_number: int, text: str) -> dict[str, object]:
    """Return a receipt for one line: capped text, selector and CLI query."""
    preview = cap_digest_text(text.strip())
    single_line_selector = {"type": "lines", "start": line_number, "end": line_number}
    command = line_query_cli(artifact_id, line_number, line_number)
    return {
        "line": line_number,
        "text": preview,
        "selector": single_line_selector,
        "cli": command,
    }


def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[str, object]]:
    """Return receipts for the first distinct error-like lines.

    Lines matching ``ERROR_RE`` are deduplicated by their capped text. At most
    ``MAX_TOP_ERROR_RECEIPTS`` receipts are returned, numbered from line 1.
    """
    receipts: list[dict[str, object]] = []
    emitted: set[str] = set()
    for number, raw in enumerate(lines, start=1):
        if ERROR_RE.search(raw) is None:
            continue
        preview = cap_digest_text(raw.strip())
        if preview and preview not in emitted:
            receipts.append(line_receipt(artifact_id, number, preview))
            emitted.add(preview)
            if len(receipts) >= MAX_TOP_ERROR_RECEIPTS:
                break
    return receipts


def build_duplicate_line_groups(artifact_id: str, lines: list[str], *, limit: int = MAX_DUPLICATE_GROUPS) -> list[dict[str, object]]:
    """Summarize lines whose capped text occurs more than once.

    Groups are ordered by descending count, then by first occurrence, and the
    list is cut to *limit* entries. Each group records the count, the first
    line number, the capped text, and a selector/CLI for that first line.
    """
    # capped text -> [first line number, occurrence count]
    tally: dict[str, list[int]] = {}
    for number, raw in enumerate(lines, start=1):
        key = cap_digest_text(raw.strip())
        if not key:
            continue
        entry = tally.setdefault(key, [number, 0])
        entry[1] += 1

    repeated = [(key, first, count) for key, (first, count) in tally.items() if count > 1]
    repeated.sort(key=lambda row: (-row[2], row[1], row[0]))

    return [
        {
            "count": count,
            "first_line": first,
            "text": key,
            "selector": {"type": "lines", "start": first, "end": first},
            "cli": line_query_cli(artifact_id, first, first),
        }
        for key, first, count in repeated[:limit]
    ]


def build_digest(sanitized_text: str, *, artifact_id: str, redacted_lines: int) -> dict[str, object]:
    """Build the advisory digest stored in an artifact's metadata.

    The digest holds a status ("has_errors" when any error-like line exists,
    otherwise "stored"), redaction counts, the top error lines and their
    receipts, duplicate-line groups, and representative head/tail samples.
    """
    rows = sanitized_text.splitlines()
    sample_caps = {"max_chars": MAX_DIGEST_TEXT_CHARS, "max_bytes": MAX_DIGEST_TEXT_BYTES}

    error_rows = (row for row in rows if ERROR_RE.search(row))
    top_errors = compact_items(error_rows, limit=12, **sample_caps)
    status = "has_errors" if top_errors else "stored"
    redaction_counts = {
        "lines": redacted_lines,
        "markers": sanitized_text.count("[REDACTED]"),
    }
    error_receipts = build_top_error_receipts(artifact_id, rows)
    duplicate_groups = build_duplicate_line_groups(artifact_id, rows)
    head_sample = compact_items(rows, limit=8, **sample_caps)
    tail_sample = compact_items(rows[-8:], limit=8, **sample_caps)

    return {
        "status": status,
        "redacted_lines": redacted_lines,
        "redaction_counts": redaction_counts,
        "top_error_lines": top_errors,
        "top_error_receipts": error_receipts,
        "duplicate_line_groups": duplicate_groups,
        "representative_head": head_sample,
        "representative_tail": tail_sample,
    }


def suggested_queries_for(metadata: dict[str, object]) -> list[str]:
    """Collect unique ``cli`` strings from a metadata dict, in priority order.

    Digest error receipts come first, then duplicate-line groups, then the
    retrieval hints. Malformed or missing sections are skipped. The result is
    cut to ``MAX_SUGGESTED_QUERIES`` entries.
    """
    ordered: list[str] = []

    def collect(entries: object) -> None:
        if not isinstance(entries, list):
            return
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            cli = entry.get("cli")
            if isinstance(cli, str) and cli and cli not in ordered:
                ordered.append(cli)

    digest = metadata.get("digest")
    if isinstance(digest, dict):
        collect(digest.get("top_error_receipts"))
        collect(digest.get("duplicate_line_groups"))

    retrieval = metadata.get("retrieval")
    if isinstance(retrieval, dict):
        collect(retrieval.get("hints"))

    return ordered[:MAX_SUGGESTED_QUERIES]


def receipt_for(metadata: dict[str, object]) -> dict[str, object]:
    """Build the compact receipt shown to callers from stored *metadata*.

    Selected metadata fields are copied through (missing ones become
    ``None``), followed by three fixed example queries and the suggested
    queries derived from the metadata. Raises ``KeyError`` when
    ``artifact_id`` is absent.
    """
    artifact_id = str(metadata["artifact_id"])
    receipt: dict[str, object] = {"artifact_id": artifact_id, "stored": True}
    passthrough = (
        "created_at",
        "command_preview",
        "content_type",
        "input",
        "stored_output",
        "digest",
        "retrieval",
    )
    for field in passthrough:
        receipt[field] = metadata.get(field)
    receipt["available_queries"] = [
        f"context-guard-artifact get {artifact_id} --lines 1:80",
        f"context-guard-artifact get {artifact_id} --pattern ERROR --max-lines 40",
        f"context-guard-artifact get {artifact_id} --json --lines 1:20",
    ]
    receipt["suggested_queries"] = suggested_queries_for(metadata)
    return receipt


def metadata_json_text(metadata: dict[str, object]) -> str:
    """Serialize *metadata* as indented, key-sorted JSON ending in a newline.

    Non-ASCII characters are written as-is rather than escaped.
    """
    serialized = json.dumps(metadata, indent=2, sort_keys=True, ensure_ascii=False)
    return serialized + "\n"


def metadata_size_bytes(metadata: dict[str, object]) -> int:
    """Return the UTF-8 byte length of *metadata* as it would be written."""
    rendered = metadata_json_text(metadata)
    return len(rendered.encode("utf-8", errors="replace"))


def metadata_cap_diagnostic(metadata: dict[str, object], *, stage: str) -> str:
    """Format the error message used when metadata cannot fit the size cap.

    The message reports the current metadata size, the cap, the given *stage*,
    and how many items remain in each list-valued digest field ("none" when
    there is no digest or no such lists).
    """
    tracked = (
        "representative_tail",
        "representative_head",
        "duplicate_line_groups",
        "top_error_lines",
        "top_error_receipts",
    )
    digest = metadata.get("digest")
    remaining: dict[str, int] = {}
    if isinstance(digest, dict):
        remaining = {key: len(digest[key]) for key in tracked if isinstance(digest.get(key), list)}
    counts_text = ",".join(f"{key}={value}" for key, value in remaining.items()) or "none"
    segments = [
        "artifact metadata exceeds trusted size cap before write: ",
        f"metadata_bytes={metadata_size_bytes(metadata)} ",
        f"metadata_cap_bytes={MAX_METADATA_BYTES} ",
        f"stage={stage} ",
        f"remaining_digest_items={counts_text}; ",
        "authoritative artifact content was not written because the receipt would be unreadable",
    ]
    return "".join(segments)


def shrink_digest_for_metadata_cap(metadata: dict[str, object]) -> None:
    """Trim digest examples in place until the metadata fits the read cap.

    The digest is only an advisory summary of the authoritative `.txt`
    content, so dropping examples is preferable to writing metadata that `get`
    and `list` would later refuse to read. Items are popped one at a time from
    the lowest-priority non-empty list. Raises ``ValueError`` when the
    metadata is over the cap and there is no digest, or nothing is left to
    drop.
    """
    digest = metadata.get("digest")
    if metadata_size_bytes(metadata) <= MAX_METADATA_BYTES:
        return
    if not isinstance(digest, dict):
        raise ValueError(metadata_cap_diagnostic(metadata, stage="no_digest"))

    # Record that the digest was reduced, so readers know it is partial.
    digest["capped_for_metadata"] = True
    digest["metadata_cap_bytes"] = MAX_METADATA_BYTES
    shrink_order = (
        "representative_tail",
        "representative_head",
        "duplicate_line_groups",
        "top_error_lines",
        "top_error_receipts",
    )
    while metadata_size_bytes(metadata) > MAX_METADATA_BYTES:
        droppable = None
        for key in shrink_order:
            bucket = digest.get(key)
            if isinstance(bucket, list) and bucket:
                droppable = bucket
                break
        if droppable is None:
            raise ValueError(metadata_cap_diagnostic(metadata, stage="digest_shrink_exhausted"))
        droppable.pop()


def store_command(args: argparse.Namespace) -> int:
    """Store sanitized stdin as an artifact and print a compact receipt.

    Reads ``args.dir``, ``args.max_bytes``, ``args.show_paths``,
    ``args.command`` and ``args.json``. Stdin is read up to the bounded byte
    cap and sanitized. The artifact id is derived from the sanitized content
    hash, the sanitized command preview and the truncation flag. The metadata
    is shrunk to the trusted size cap before the content and metadata files
    are written.

    Returns 0 on success; errors from helpers propagate to the caller.
    """
    directory = normalize_allowed_first_absolute_symlink(Path(args.dir).expanduser())
    max_bytes = bounded_int(args.max_bytes, DEFAULT_MAX_BYTES, 1, MAX_MAX_BYTES)
    raw_text, input_truncated, input_bytes = read_bounded_stdin(max_bytes)
    sanitized_text, redacted_lines = sanitize_text(raw_text, show_paths=args.show_paths)
    # Encode once; the same bytes feed both the size and the checksum.
    encoded_content = sanitized_text.encode("utf-8", errors="replace")
    content_bytes = len(encoded_content)
    content_sha = hashlib.sha256(encoded_content).hexdigest()
    command_preview = sanitize_one_line(args.command, show_paths=args.show_paths) if args.command else None
    id_basis = json.dumps(
        {
            "content_sha256": content_sha,
            "command_preview": command_preview,
            "input_truncated": input_truncated,
        },
        sort_keys=True,
    )
    artifact_id = hashlib.sha256(id_basis.encode("utf-8")).hexdigest()[:20]
    content_path, meta_path = artifact_paths(directory, artifact_id)
    # Count lines the same way the digest receipts and query_content number
    # them (str.splitlines), so advertised ranges match what `get` returns even
    # for output that uses bare "\r" or other non-"\n" line boundaries.
    total_lines = len(sanitized_text.splitlines())
    content_type = classify_content_type(sanitized_text)
    strategy = recommended_strategy(content_type)
    metadata: dict[str, object] = {
        "artifact_id": artifact_id,
        "created_at": int(time.time()),
        "command_preview": command_preview,
        "content_type": content_type,
        "input": {
            "bytes_read": input_bytes,
            "truncated": input_truncated,
            "max_bytes": max_bytes,
        },
        "stored_output": {
            "bytes": content_bytes,
            "lines": total_lines,
            "sha256": content_sha,
            "content_file": content_path.name,
            "metadata_file": meta_path.name,
        },
        "digest": build_digest(sanitized_text, artifact_id=artifact_id, redacted_lines=redacted_lines),
        "retrieval": {
            "strategy": strategy,
            "deterministic": True,
            "hints": build_retrieval_hints(
                artifact_id,
                sanitized_text,
                content_type=content_type,
                strategy=strategy,
                total_lines=total_lines,
            ),
        },
    }
    # Must run before any write: an over-cap receipt would be unreadable later.
    shrink_digest_for_metadata_cap(metadata)
    write_private_text(content_path, sanitized_text)
    write_private_text(meta_path, metadata_json_text(metadata))
    receipt = receipt_for(metadata)
    if args.json:
        print(json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True))
    else:
        print(f"artifact_id={artifact_id}")
        stored = receipt["stored_output"]
        if isinstance(stored, dict):
            print(f"stored_output={stored.get('lines')} lines/{stored.get('bytes')} bytes")
        digest = receipt.get("digest")
        if isinstance(digest, dict) and digest.get("top_error_lines"):
            print("top_error_lines:")
            for line in digest["top_error_lines"]:  # type: ignore[index]
                print(f"- {line}")
        print(f"query=context-guard-artifact get {artifact_id} --lines 1:80")
    return 0


def load_metadata(directory: Path, artifact_id: str) -> dict[str, object]:
    """Load and validate the metadata for *artifact_id* in *directory*.

    The content file must exist as a regular file, and the metadata file is
    read under the trusted metadata size cap.

    Raises:
        FileNotFoundError: either artifact file is missing. The original
            error is chained as the cause.
        ValueError: the id is malformed, a file is unsafe or too large, the
            JSON is invalid, or the metadata does not belong to this id.
    """
    content_path, meta_path = artifact_paths(directory, artifact_id)
    try:
        regular_private_file_size(content_path)
        meta_text = read_bounded_private_text(meta_path, MAX_METADATA_BYTES)
    except FileNotFoundError as exc:
        # Replace the path-bearing OS error with an id-based message, keeping
        # the underlying cause attached for debugging.
        raise FileNotFoundError(f"artifact not found: {artifact_id}") from exc
    data = json.loads(meta_text)
    if not isinstance(data, dict) or data.get("artifact_id") != artifact_id:
        raise ValueError(f"artifact metadata mismatch: {artifact_id}")
    return data


def parse_line_range(value: str | None) -> tuple[int, int] | None:
    if not value:
        return None
    match = re.fullmatch(r"(\d+)(?::(\d+))?", value.strip())
    if not match:
        raise ValueError("--lines must be START or START:END using 1-based inclusive line numbers")
    start = int(match.group(1))
    end = int(match.group(2) or match.group(1))
    if start < 1 or end < start:
        raise ValueError("--lines must satisfy 1 <= START <= END")
    return start, end


def cap_text(text: str, max_chars: int) -> tuple[str, bool]:
    """Cap query output at roughly *max_chars* characters.

    Returns ``(text, False)`` when no capping is needed. Otherwise returns a
    right-stripped prefix followed by a marker stating the original length,
    plus ``True``. The marker is always emitted whole, even when it is longer
    than *max_chars*.
    """
    original_length = len(text)
    if original_length <= max_chars:
        return text, False
    marker = f"\n[context-guard-kit] artifact query capped: {original_length} chars total\n"
    room = max_chars - len(marker)
    prefix = text[:room] if room > 0 else ""
    return prefix.rstrip() + marker, True


def query_content(content: str, *, line_range: tuple[int, int] | None, pattern: str | None, max_lines: int) -> tuple[str, dict[str, object]]:
    lines = content.splitlines(True)
    selected: list[tuple[int, str]] = []
    if line_range is not None:
        start, end = line_range
        selected = list(enumerate(lines[start - 1 : end], start=start))
        selector = {"type": "lines", "start": start, "end": end}
    elif pattern:
        selected = [(idx, line) for idx, line in enumerate(lines, start=1) if pattern in line]
        selector = {"type": "pattern", "pattern": pattern}
    else:
        selected = list(enumerate(lines[:max_lines], start=1))
        selector = {"type": "head", "max_lines": max_lines}
    total_matches = len(selected)
    selected = selected[:max_lines]
    text = "".join(line for _idx, line in selected)
    return text, {"selector": selector, "returned_lines": len(selected), "matched_lines": total_matches, "total_lines": len(lines)}


def get_command(args: argparse.Namespace) -> int:
    """Print a bounded slice of a stored artifact.

    Reads ``args.artifact_id``, ``args.dir``, ``args.lines``,
    ``args.pattern``, ``args.max_lines``, ``args.max_chars`` and
    ``args.json``. The artifact is looked up in each read directory in turn.
    Its content is checked against the size and SHA-256 recorded in the
    metadata before anything is returned.

    Returns 0 on success. Any lookup, validation or unsafe-path failure is
    reported on stderr and returns 1.
    """
    artifact_id = args.artifact_id
    max_chars = bounded_int(args.max_chars, DEFAULT_MAX_CHARS, 1, 1_000_000)
    try:
        last_missing: FileNotFoundError | None = None
        for directory in artifact_read_directories(args.dir):
            try:
                metadata = load_metadata(directory, artifact_id)
                content_path, _meta_path = artifact_paths(directory, artifact_id)
                break
            except FileNotFoundError as missing:
                last_missing = missing
        else:
            # No directory held the artifact; surface the most recent miss.
            if last_missing is not None:
                raise last_missing
            raise FileNotFoundError(f"artifact not found: {artifact_id}")
        stored_output = metadata.get("stored_output")
        expected_sha = stored_output.get("sha256") if isinstance(stored_output, dict) else None
        if not isinstance(expected_sha, str) or not re.fullmatch(r"[a-f0-9]{64}", expected_sha):
            raise ValueError(f"artifact metadata missing stored_output sha256: {artifact_id}")
        expected_bytes = stored_output.get("bytes") if isinstance(stored_output, dict) else None
        if not isinstance(expected_bytes, int) or expected_bytes < 0 or expected_bytes > MAX_MAX_BYTES:
            raise ValueError(f"artifact metadata has invalid stored_output bytes: {artifact_id}")
        # Cheap size comparison first, then the full hash over the bytes read.
        actual_size = regular_private_file_size(content_path)
        if actual_size != expected_bytes:
            raise ValueError(f"artifact content checksum mismatch: {artifact_id}")
        content = read_bounded_private_text(content_path, expected_bytes)
        actual_sha = hashlib.sha256(content.encode("utf-8", errors="replace")).hexdigest()
        if actual_sha != expected_sha:
            raise ValueError(f"artifact content checksum mismatch: {artifact_id}")
        line_range = parse_line_range(args.lines)
        if line_range is not None and args.max_lines is None:
            # An explicit range with no explicit cap returns the whole range,
            # up to the hard query ceiling.
            max_lines = min(line_range[1] - line_range[0] + 1, MAX_QUERY_LINES)
        else:
            max_lines = bounded_int(args.max_lines, DEFAULT_MAX_LINES, 1, MAX_QUERY_LINES)
        selected, query = query_content(content, line_range=line_range, pattern=args.pattern, max_lines=max_lines)
        selected, capped = cap_text(selected, max_chars)
    except (FileNotFoundError, ValueError, OSError, RuntimeError, json.JSONDecodeError) as exc:
        # RuntimeError comes from reject_symlink_components (unsafe artifact
        # path); report it like every other failure instead of a traceback.
        print(f"context-guard-artifact: {exc}", file=sys.stderr)
        return 1
    if args.json:
        payload = {
            "artifact_id": artifact_id,
            "content_type": metadata.get("content_type"),
            "query": query,
            "capped": capped,
            "content": selected,
            "stored_output": metadata.get("stored_output"),
            "retrieval": metadata.get("retrieval"),
        }
        print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))
    else:
        sys.stdout.write(selected)
    return 0


def list_command(args: argparse.Namespace) -> int:
    """Print one receipt per distinct artifact found in the read directories.

    Every directory yielded by ``artifact_read_directories(args.dir)`` is
    scanned for ``*.json`` metadata files. A directory is skipped when
    ``reject_symlink_components`` raises ``RuntimeError``, or when it is not a
    real (non-symlink) directory. Metadata files that cannot be read or parsed
    are skipped silently, as are entries without a valid ``artifact_id`` and
    ids that were already listed from an earlier file.

    Args:
        args: Parsed CLI namespace; ``args.dir`` and ``args.json`` are read.

    Returns:
        Always 0.
    """
    receipts: list[dict[str, object]] = []
    listed_ids: set[str] = set()

    for directory in artifact_read_directories(args.dir):
        try:
            reject_symlink_components(directory)
            usable = directory.is_dir() and not directory.is_symlink()
        except RuntimeError:
            continue
        if not usable:
            continue

        for meta_path in sorted(directory.glob("*.json")):
            try:
                raw_metadata = read_bounded_private_text(meta_path, MAX_METADATA_BYTES)
                data = json.loads(raw_metadata)
            except (OSError, ValueError, RuntimeError, json.JSONDecodeError):
                continue
            if not isinstance(data, dict):
                continue
            artifact_id = str(data.get("artifact_id", ""))
            if not ARTIFACT_ID_RE.fullmatch(artifact_id) or artifact_id in listed_ids:
                continue
            # First occurrence wins: later files with the same id are ignored.
            receipts.append(receipt_for(data))
            listed_ids.add(artifact_id)

    receipts.sort(key=lambda receipt: str(receipt.get("artifact_id", "")))

    if args.json:
        print(json.dumps({"artifacts": receipts}, ensure_ascii=False, indent=2, sort_keys=True))
        return 0

    for receipt in receipts:
        stored = receipt.get("stored_output")
        if not isinstance(stored, dict):
            print(receipt["artifact_id"])
            continue
        print(f"{receipt['artifact_id']}\t{stored.get('lines')} lines\t{stored.get('bytes')} bytes")
    return 0


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with ``store``, ``get`` and ``list`` subcommands.

    Each subparser registers its handler via ``set_defaults(func=...)`` so the
    caller can dispatch with ``args.func(args)``. A subcommand is required;
    ``--dir`` is a top-level option and must precede the subcommand name.

    Returns:
        The configured top-level ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description="Store sanitized large outputs as queryable local artifacts.")
    parser.add_argument("--dir", default=DEFAULT_ARTIFACT_DIR, help=f"artifact directory (default: {DEFAULT_ARTIFACT_DIR})")
    subparsers = parser.add_subparsers(dest="command_name", required=True)

    store = subparsers.add_parser("store", help="store stdin as a sanitized artifact and print a compact receipt")
    store.add_argument("--command", help="optional command label to sanitize into the receipt")
    store.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES, help="maximum stdin bytes to read before truncating")
    store.add_argument(
        "--show-paths",
        action="store_true",
        help="show raw absolute paths instead of path hashes; local debugging only because private paths may be exposed",
    )
    store.add_argument("--json", action="store_true", help="emit receipt JSON")
    store.set_defaults(func=store_command)

    get = subparsers.add_parser("get", help="query a stored artifact")
    get.add_argument("artifact_id", help="id of the stored artifact to query")
    get.add_argument("--lines", help="1-based inclusive line range, e.g. 10:40")
    get.add_argument("--pattern", help="literal substring filter")
    # The default stays None so the handler can tell "not given" apart from an
    # explicit value and fall back to the size of the --lines range.
    get.add_argument(
        "--max-lines",
        type=int,
        default=None,
        help=f"maximum lines to return (default: {DEFAULT_MAX_LINES}, or the --lines range size when --lines is given)",
    )
    get.add_argument(
        "--max-chars",
        type=int,
        default=DEFAULT_MAX_CHARS,
        help=f"cap on characters of returned content (default: {DEFAULT_MAX_CHARS})",
    )
    get.add_argument("--json", action="store_true", help="emit query JSON with content")
    get.set_defaults(func=get_command)

    list_parser = subparsers.add_parser("list", help="list stored artifacts")
    list_parser.add_argument("--json", action="store_true", help="emit list JSON")
    list_parser.set_defaults(func=list_command)
    return parser


def main() -> int:
    """Parse the command line and run the selected subcommand handler.

    Returns:
        The handler's return value coerced to ``int``, or 1 when the handler
        raises ``RuntimeError``, ``ValueError`` or ``OSError``. In that case
        the exception message is written to stderr with the
        ``context-guard-artifact:`` prefix.
    """
    args = build_parser().parse_args()
    try:
        return int(args.func(args))
    except (RuntimeError, ValueError, OSError) as exc:
        # OSError is reported like the other failures so a filesystem problem
        # that a handler did not catch itself yields the one-line error
        # message and exit status 1 rather than an uncaught traceback.
        print(f"context-guard-artifact: {exc}", file=sys.stderr)
        return 1


# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
