#!/usr/bin/env bash
set -euo pipefail

# Scratch directory holding every file this test produces; the EXIT trap
# removes it when the script exits.
tmp_dir="$(mktemp -d)"

cleanup() { rm -rf "$tmp_dir"; }

trap cleanup EXIT

# Resolve the repository root (the parent of this script's directory) to an
# absolute path, then point at the CLI under test inside it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
repo_dir="$(cd "${script_dir}/.." && pwd)"
agentrail="${repo_dir}/scripts/agentrail"

#######################################
# Fail the test unless a file contains a given literal string.
# Arguments:
#   $1 - text to look for; matched as a fixed string, not a regex
#   $2 - path of the file to search
#   $3 - message printed to stderr when the text is absent
# Outputs:
#   On failure, writes the message followed by the file's contents to stderr.
# Returns:
#   0 when the text is found; otherwise exits the script with status 1.
#######################################
assert_grep() {
  local pattern="$1"
  local file="$2"
  local message="$3"

  # -F: every caller passes literal text (paths, JSON keys). Without it the
  # dots in e.g. "missing-required.md" would match any character and the
  # assertion could pass on the wrong content.
  if ! grep -qF -- "$pattern" "$file"; then
    echo "$message" >&2
    echo "--- output ---" >&2
    cat "$file" >&2
    exit 1
  fi
}

# Create an empty git repository and install agentrail into it; every later
# step in this script runs against this installed fixture.
fixture="${tmp_dir}/installed"
mkdir -p "$fixture"
git -C "$fixture" init --quiet
# Replay the captured install output on failure. With `set -e` alone the
# script would exit and the EXIT trap would delete the only copy of it.
if ! "$agentrail" install --target "$fixture" >"${tmp_dir}/install.out"; then
  echo "agentrail install failed" >&2
  cat "${tmp_dir}/install.out" >&2
  exit 1
fi

# Populate the fixture repository with the docs, sources, tests, memory notes,
# prior-run findings, and .env file that the evaluation fixtures refer to.
for fixture_subdir in \
  docs/agents \
  docs/memory \
  docs/prd \
  .agentrail/runs/issue-84-retry \
  src \
  tests; do
  mkdir -p "${fixture}/${fixture_subdir}"
done

# Write stdin to the given path, interpreted relative to the fixture root.
write_fixture_file() {
  cat >"${fixture}/$1"
}

write_fixture_file "docs/agents/issue-84.md" <<'DOC'
# Issue 84 Retrieval Evaluation

Issue #84 adds retrieval quality evaluation with recall@5, recall@10, citation coverage, required source inclusion, and excluded source checks.
DOC

write_fixture_file "docs/prd/context-engine.md" <<'DOC'
# Context Engine PRD

Context engine evaluation for issue #84 must stay local-first and auditable.
DOC

write_fixture_file "src/retrieval_eval.py" <<'PY'
def issue_84_retrieval_quality_suite():
    return "issue #84 retrieval quality evaluation recall@5 recall@10 citation coverage"
PY

write_fixture_file "src/graph_relation.js" <<'JS'
function graphRelationSubject() {
  return "relationship-heavy-source";
}

module.exports = { graphRelationSubject };
JS

write_fixture_file "tests/graph_relation.test.js" <<'JS'
const subject = require("../src/graph_relation");

test("graph relationship fixture reaches related test", () => {
  expect(subject).toBeTruthy();
});
JS

# Memory note whose front matter carries an expires_at of 2026-12-31.
write_fixture_file "docs/memory/retrieval-evaluation.md" <<'DOC'
---
kind: lesson
source: issue-84
confidence: high
created_at: 2026-06-04T09:00:00Z
expires_at: 2026-12-31T00:00:00Z
---
# Retrieval Evaluation Lesson

For issue #84, retrieval evaluation evidence must map expected sources to command output.
DOC

# Memory note whose expires_at (2024-02-01) is earlier than the other note's
# created_at; its own text says it must not be required by the evaluation.
write_fixture_file "docs/memory/expired-retrieval.md" <<'DOC'
---
kind: lesson
source: issue-12
confidence: low
created_at: 2024-01-01T00:00:00Z
expires_at: 2024-02-01T00:00:00Z
---
# Expired Retrieval Lesson

This stale source mentions retrieval quality evaluation but must not be required by issue #84 evaluation.
DOC

write_fixture_file ".agentrail/runs/issue-84-retry/findings.json" <<'JSON'
{
  "issue": 84,
  "findings": [
    {
      "severity": "high",
      "message": "Prior mistake for issue #84: evaluation report omitted citation coverage."
    }
  ]
}
JSON

# The fixture suites list .env under expectedExcludedSources.
write_fixture_file ".env" <<'ENV'
TOKEN=must-not-appear
ENV

# Patch the installed config in place: append the two fixture-suite files to
# context.excludeGlobs and set context.externalSources to a single source whose
# authority and visibility are both "denied".
node - "${fixture}/.agentrail/config.json" <<'NODE'
const fs = require("fs");

const [, , configPath] = process.argv;
const config = JSON.parse(fs.readFileSync(configPath, "utf8"));

const existingExcludes = config.context.excludeGlobs || [];
const deniedSource = {
  id: "external:denied-eval",
  uri: "external://denied-eval",
  authority: "denied",
  visibility: "denied",
  linkedIssues: [84],
  note: "issue #84 denied retrieval evaluation source"
};

config.context.excludeGlobs = [...existingExcludes, "retrieval-fixtures.json", "broken-fixtures.json"];
config.context.externalSources = [deniedSource];

fs.writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n");
NODE

# Fixture suite passed to `agentrail context evaluate` later in this script.
# It defines four fixtures:
#   - issue-84-local-quality: required docs, source, memory, and prior-run
#     findings, with .env and the denied external URI expected to be excluded.
#   - issue-84-optional-embedding: declares optionalProviderEnv; the report
#     checks later in this script expect it to come back as "skipped".
#   - relationship-heavy-graph-expansion: expects tests/graph_relation.test.js
#     to be listed as a graph-expanded source.
#   - small-budget-required-sources: sets "limit": 2 and minPrecisionAtBudget 1.
cat >"${fixture}/retrieval-fixtures.json" <<'JSON'
{
  "schemaVersion": 1,
  "fixtures": [
    {
      "name": "issue-84-local-quality",
      "task": "issue #84 retrieval quality evaluation recall@5 recall@10 citation coverage src/retrieval_eval.py",
      "requiredSources": [
        "docs/agents/issue-84.md",
        "src/retrieval_eval.py",
        "docs/memory/retrieval-evaluation.md",
        ".agentrail/runs/issue-84-retry/findings.json"
      ],
      "expectedFiles": ["src/retrieval_eval.py"],
      "expectedDocs": ["docs/agents/issue-84.md", "docs/prd/context-engine.md"],
      "expectedMemory": ["docs/memory/retrieval-evaluation.md"],
      "expectedPriorMistakes": [".agentrail/runs/issue-84-retry/findings.json"],
      "expectedExcludedSources": [".env", "external://denied-eval"]
    },
    {
      "name": "issue-84-optional-embedding",
      "task": "semantic-only issue #84 retrieval evaluation",
      "optionalProviderEnv": ["AGENTRAIL_CONTEXT_EVAL_EMBEDDINGS"],
      "requiredSources": ["docs/agents/issue-84.md"],
      "expectedFiles": [],
      "expectedDocs": ["docs/agents/issue-84.md"],
      "expectedMemory": [],
      "expectedPriorMistakes": [],
      "expectedExcludedSources": ["external://denied-eval"]
    },
    {
      "name": "relationship-heavy-graph-expansion",
      "task": "graphRelationSubject()",
      "requiredSources": [
        "src/graph_relation.js",
        "tests/graph_relation.test.js"
      ],
      "expectedFiles": [
        "src/graph_relation.js",
        "tests/graph_relation.test.js"
      ],
      "expectedDocs": [],
      "expectedMemory": [],
      "expectedPriorMistakes": [],
      "expectedExcludedSources": [".env", "external://denied-eval"],
      "expectedGraphExpandedSources": ["tests/graph_relation.test.js"]
    },
    {
      "name": "small-budget-required-sources",
      "task": "graphRelationSubject() tests/graph_relation.test.js",
      "limit": 2,
      "requiredSources": [
        "src/graph_relation.js",
        "tests/graph_relation.test.js"
      ],
      "expectedFiles": [
        "src/graph_relation.js",
        "tests/graph_relation.test.js"
      ],
      "expectedDocs": [],
      "expectedMemory": [],
      "expectedPriorMistakes": [],
      "expectedExcludedSources": [],
      "expectedGraphExpandedSources": ["tests/graph_relation.test.js"],
      "minPrecisionAtBudget": 1
    }
  ]
}
JSON

# Run the suite twice: once with --json for the structural checks and once as
# plain text for the grep checks. Each run's stdout is captured to a file under
# tmp_dir, so replay it on failure; otherwise `set -e` would exit and the EXIT
# trap would delete the report before anyone could read it.
if ! "$agentrail" context evaluate "${fixture}/retrieval-fixtures.json" --target "$fixture" --json >"${tmp_dir}/eval.json"; then
  echo "context evaluation fixture suite failed" >&2
  cat "${tmp_dir}/eval.json" >&2
  exit 1
fi
if ! "$agentrail" context evaluate "${fixture}/retrieval-fixtures.json" --target "$fixture" >"${tmp_dir}/eval.txt"; then
  echo "context evaluation fixture suite failed (text report)" >&2
  cat "${tmp_dir}/eval.txt" >&2
  exit 1
fi

# Structural checks on the JSON report: overall pass flag, summary counts, and
# per-fixture metrics. The first failed expectation is printed to stderr and
# the node process exits with status 1.
node - "${tmp_dir}/eval.json" <<'NODE'
const fs = require("fs");

// Print a failure message to stderr and stop with a non-zero status.
function fail(message) {
  console.error(message);
  process.exit(1);
}

const report = JSON.parse(fs.readFileSync(process.argv[2], "utf8"));
const findFixture = (name) => report.fixtures.find((entry) => entry.name === name);

if (!report.passed) {
  fail(JSON.stringify(report, null, 2));
}

const summary = report.summary;
if (summary.fixtures !== 4 || summary.passed !== 3 || summary.skipped !== 1) {
  fail(`unexpected summary: ${JSON.stringify(summary)}`);
}

// --- issue-84-local-quality ---
const localFixture = findFixture("issue-84-local-quality");
if (!localFixture || localFixture.status !== "passed") {
  fail("local evaluation fixture did not pass");
}

const localMetrics = localFixture.metrics;
const requiredMetricFields = [
  "requiredSourceInclusion",
  "recallAt5",
  "recallAt10",
  "staleSourceExclusion",
  "citationCoverage",
  "reasonCoverage",
  "budgetMetadataPresence",
  "staleOrDeniedLeakage"
];
const missingField = requiredMetricFields.find((field) => !(field in localMetrics));
if (missingField !== undefined) {
  fail(`evaluation report missing ${missingField}`);
}

if (!localMetrics.requiredSourceInclusion.passed) {
  fail("required source inclusion did not pass");
}
if (!localMetrics.staleSourceExclusion.passed) {
  fail("stale/denied source exclusion did not pass");
}
if (!localMetrics.staleOrDeniedLeakage.passed) {
  fail("compiler stale/denied leakage check did not pass");
}
if (!localMetrics.budgetMetadataPresence.passed) {
  fail(`compiler budget metadata check did not pass: ${JSON.stringify(localMetrics.budgetMetadataPresence)}`);
}
if (!("graphExpansion" in localMetrics)) {
  fail("evaluation report did not include graph expansion metrics");
}
if (!("precisionAtBudget" in localMetrics)) {
  fail("evaluation report did not include precision-at-budget metrics");
}
if (localMetrics.reasonCoverage !== 1) {
  fail(`reason coverage was not complete: ${localMetrics.reasonCoverage}`);
}

const topResults = localFixture.topResults;
if (topResults.some((item) => !item.candidateId)) {
  fail(`top results are missing candidate IDs: ${JSON.stringify(topResults)}`);
}
const deniedPaths = [".env", "external://denied-eval"];
if (topResults.some((item) => deniedPaths.includes(item.path))) {
  fail("denied source appeared in top results");
}

// --- issue-84-optional-embedding ---
const optionalFixture = findFixture("issue-84-optional-embedding");
if (!optionalFixture || optionalFixture.status !== "skipped") {
  fail("optional embedding fixture was not skipped without provider env");
}

// --- relationship-heavy-graph-expansion ---
const relationFixture = findFixture("relationship-heavy-graph-expansion");
if (!relationFixture || relationFixture.status !== "passed") {
  fail(`relationship-heavy fixture did not pass: ${JSON.stringify(relationFixture)}`);
}

const relationMetrics = relationFixture.metrics;
if (!relationMetrics.requiredSourceInclusion.passed) {
  fail(`relationship-heavy fixture missed required sources: ${JSON.stringify(relationMetrics.requiredSourceInclusion)}`);
}
if (relationMetrics.citationCoverage !== 1) {
  fail(`relationship-heavy fixture citation coverage was not complete: ${relationMetrics.citationCoverage}`);
}
if (!relationMetrics.staleOrDeniedLeakage.passed) {
  fail(`relationship-heavy fixture leaked stale or denied sources: ${JSON.stringify(relationMetrics.staleOrDeniedLeakage)}`);
}

const expansion = relationMetrics.graphExpansion;
if (!expansion.passed || expansion.maxHops !== 2) {
  fail(`relationship-heavy fixture did not report bounded graph expansion evidence: ${JSON.stringify(expansion)}`);
}
if (!expansion.addedCandidateIds.includes("tests/graph_relation.test.js")) {
  fail(`relationship-heavy fixture did not record graph-expanded test source: ${JSON.stringify(expansion.addedCandidateIds)}`);
}

// --- small-budget-required-sources ---
const budgetFixture = findFixture("small-budget-required-sources");
if (!budgetFixture || budgetFixture.status !== "passed") {
  fail(`small-budget fixture did not pass: ${JSON.stringify(budgetFixture)}`);
}

const budgetMetrics = budgetFixture.metrics;
const budgetPresence = budgetMetrics.budgetMetadataPresence;
if (budgetPresence.budget.maxItems !== 2 || budgetPresence.retrievalBudget.maxItems !== 2) {
  fail(`small-budget fixture did not preserve maxItems metadata: ${JSON.stringify(budgetPresence)}`);
}

const precision = budgetMetrics.precisionAtBudget;
if (precision.precision !== 1) {
  fail(`small-budget fixture precision was not exact: ${JSON.stringify(precision)}`);
}
if (precision.droppedRequiredSources.length !== 0) {
  fail(`small-budget fixture dropped required sources: ${JSON.stringify(precision)}`);
}
NODE

# Negative-case suite: its single fixture requires
# docs/agents/missing-required.md and expects tests/missing-graph.test.js as a
# graph-expanded source. This script creates neither file, and the evaluate
# run that follows is expected to exit non-zero.
cat >"${fixture}/broken-fixtures.json" <<'JSON'
{
  "fixtures": [
    {
      "name": "missing-required-context",
      "task": "issue #84 retrieval quality evaluation",
      "requiredSources": ["docs/agents/missing-required.md"],
      "expectedFiles": [],
      "expectedDocs": ["docs/agents/missing-required.md"],
      "expectedMemory": [],
      "expectedPriorMistakes": [],
      "expectedExcludedSources": [],
      "expectedGraphExpandedSources": ["tests/missing-graph.test.js"],
      "minPrecisionAtBudget": 1
    }
  ]
}
JSON

# The broken suite must make `context evaluate` exit non-zero, and its JSON
# report must name each failure kind together with its supporting details.
broken_report="${tmp_dir}/broken.json"
text_report="${tmp_dir}/eval.txt"

broken_status=0
"$agentrail" context evaluate "${fixture}/broken-fixtures.json" --target "$fixture" --json >"$broken_report" \
  || broken_status=$?
if [[ "$broken_status" -eq 0 ]]; then
  echo "context evaluation did not fail when required context was missed" >&2
  cat "$broken_report" >&2
  exit 1
fi

# Failure report: missing required source.
assert_grep "docs/agents/missing-required.md" "$broken_report" "missing required source was not reported"
assert_grep "failureDetails" "$broken_report" "broken report did not include structured failure details"
assert_grep "missing_required_source" "$broken_report" "broken report did not identify missing required source failure kind"
assert_grep "nearestIncludedCandidates" "$broken_report" "broken report did not include nearest included candidates"

# Failure report: graph expansion.
assert_grep "graph_expansion" "$broken_report" "broken report did not identify graph expansion failure kind"
assert_grep "startedFromAnchors" "$broken_report" "broken report did not include graph expansion start anchors"
assert_grep "budgetImpact" "$broken_report" "broken report did not include graph expansion budget impact"

# Failure report: precision at budget.
assert_grep "precision_at_budget" "$broken_report" "broken report did not identify precision-at-budget failure kind"
assert_grep "noisyCandidates" "$broken_report" "broken report did not include noisy budget candidates"

# Text report produced by the passing suite.
assert_grep "reasonCoverage" "$text_report" "text report did not include reason coverage"
assert_grep "budgetMetadataPresence" "$text_report" "text report did not include budget metadata presence"
assert_grep "graphExpansion=True" "$text_report" "text report did not include graph expansion pass evidence"
assert_grep "precisionAtBudget=" "$text_report" "text report did not include precision-at-budget evidence"

printf '%s\n' \
  "context evaluation test passed" \
  "evaluation report output:"
cat "$text_report"
