#!/bin/bash
# ~/.panopticon/bin/work-agent-stop-hook  (PAN-800)
#
# Detects work agents that forgot to call `pan work done` after finishing their
# implementation, and nudges them via tmux. Writes the lifecycle resolution
# signal through POST /api/agents/:id/heartbeat (kind=resolution_set) —
# runtime.json is no longer touched.
#
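# Escalation tiers:
#   working     — default while tool activity is recent
#   done        — STATE.md marked complete AND branch pushed, a structured
#                 channel_reply of kind=done, or LLM verdict FORGOT_COMPLETION
#   api_error   — transient provider error detected (auto-retry nudge)
#   needs_input — structured channel_reply of kind=needs_input, or LLM
#                 verdict STOPPED_FOR_INPUT
#   unclear     — first UNCLEAR LLM verdict
#   stuck       — 2+ consecutive UNCLEAR LLM verdicts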
#
# The model used for the completion-check LLM is configurable via
# PANOPTICON_COMPLETION_CHECK_MODEL or the completion-check-hook override
# in ~/.panopticon/config.yaml.

# Failures are handled case by case below; a failing command must not abort
# the hook.
set +e

SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
# shellcheck source=pan-hook-lib.sh
if ! . "$SCRIPT_DIR/pan-hook-lib.sh" 2>/dev/null; then
  exit 0
fi

# pan_resolve_agent_id comes from pan-hook-lib.sh (not shown here); the rest
# of this script relies on AGENT_ID being set once it succeeds.
if ! pan_resolve_agent_id; then
  exit 0
fi

# Only run for work agents, i.e. ids with the agent- prefix. Anything else
# (specialists included) does not carry that prefix and is ignored.
if [[ "$AGENT_ID" != agent-* ]]; then
  exit 0
fi

# Issue id = agent id without the agent- prefix, upper-cased.
ISSUE_ID=$(echo "${AGENT_ID#agent-}" | tr '[:lower:]' '[:upper:]')

LOG_DIR="$HOME/.panopticon/logs"
AGENT_STATE_DIR="$HOME/.panopticon/agents/$AGENT_ID"
mkdir -p "$LOG_DIR"

# A "completed" marker in the agent state dir means there is nothing to do.
COMPLETED_FILE="$AGENT_STATE_DIR/completed"
if [ -f "$COMPLETED_FILE" ]; then
  exit 0
fi

# Skip if the issue is actively flowing through the specialist pipeline.
REVIEW_STATUS_FILE="$HOME/.panopticon/review-status.json"

# Report whether an issue's review-status entry shows it moving through the
# specialist pipeline.
# Arguments: $1 - path to review-status.json
#            $2 - issue id (looked up as given, then lower- and upper-cased)
# Outputs:   "yes" or "no" on stdout; a missing/invalid file, a missing
#            entry, or any Python exception yields "no".
# The path and id are passed to Python as argv instead of being spliced into
# the program text, so a quote or backslash in either cannot break or alter
# the program.
issue_in_active_pipeline() {
  python3 - "$1" "$2" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as f:
        statuses = json.load(f)
    issue = sys.argv[2]
    status = statuses.get(issue) or statuses.get(issue.lower()) or statuses.get(issue.upper())
    if not status:
        print('no'); sys.exit(0)
    review = status.get('reviewStatus', '')
    test = status.get('testStatus', '')
    merge = status.get('mergeStatus', '')
    ready = status.get('readyForMerge', False)
    verification = status.get('verificationStatus', '')
    # Active: review and test are underway or passed, and the issue is either
    # flagged ready for merge or already somewhere in the merge queue.
    is_active = (
        review in ('reviewing', 'passed') and
        test in ('passed', 'reviewing', 'pending') and
        (ready or merge in ('queued', 'merging', 'verifying'))
    )
    # A blocked/failed review or failed verification overrides "active".
    needs_action = (
        review in ('blocked', 'failed') or
        verification == 'failed'
    )
    print('yes' if (is_active and not needs_action) else 'no')
except Exception:
    print('no')
PY
}

if [ -f "$REVIEW_STATUS_FILE" ]; then
  # python3 missing or crashing counts as "not in the pipeline".
  IN_ACTIVE_PIPELINE=$(issue_in_active_pipeline "$REVIEW_STATUS_FILE" "$ISSUE_ID" 2>/dev/null || echo "no")
  if [ "$IN_ACTIVE_PIPELINE" = "yes" ]; then
    echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID skipped — actively in specialist pipeline" \
      >> "$LOG_DIR/hooks.log" 2>/dev/null || true
    exit 0
  fi
fi

# Cooldown: at most one completion nudge per 10 minutes. The marker file
# holds the epoch-seconds timestamp written when a nudge was last sent.
NUDGE_FILE="$AGENT_STATE_DIR/.last-completion-nudge"
if [ -f "$NUDGE_FILE" ]; then
  NOW=$(date +%s)
  LAST_NUDGE=$(cat "$NUDGE_FILE" 2>/dev/null || echo "0")
  # An empty marker evaluates as 0 here, i.e. the cooldown counts as expired.
  if (( NOW - LAST_NUDGE < 600 )); then
    exit 0
  fi
fi

# Local cache of the last resolution THIS HOOK emitted for this agent.
# It is the authority for "what did I say last time": a dashboard round-trip
# can silently fail under load, which would reset the ratchet to count=1.
RESOLUTION_CACHE="$AGENT_STATE_DIR/last-resolution.tsv"
RESOLUTION_LOCK="$AGENT_STATE_DIR/last-resolution.lock"
# Upper bound, in characters, on a channel-reply summary written to the log.
MAX_CHANNEL_REPLY_SUMMARY_LENGTH=4096

# Flatten arbitrary text into a single bounded, printable log field.
# Arguments: $1 - text to normalize
# Globals:   MAX_CHANNEL_REPLY_SUMMARY_LENGTH (read) - maximum output length
# Outputs:   the cleaned text on stdout, with no trailing newline
normalize_log_text() {
  local raw_text="$1"
  python3 - "$raw_text" "$MAX_CHANNEL_REPLY_SUMMARY_LENGTH" <<'PY'
import re
import sys

raw, max_len = sys.argv[1], int(sys.argv[2])
# CR, LF and TAB become spaces so words on either side stay separated.
for ws in ("\r", "\n", "\t"):
    raw = raw.replace(ws, " ")
# Strip ANSI CSI escape sequences (colours, cursor movement).
raw = re.sub(r'\x1B\[[0-?]*[ -/]*[@-~]', '', raw)
# Drop every remaining character below U+0020.
raw = ''.join(filter(lambda c: c >= ' ', raw))
# Collapse whitespace runs and trim both ends before truncating.
raw = re.sub(r'\s+', ' ', raw).strip()
sys.stdout.write(raw[:max_len])
PY
}

# Read the last resolution this hook emitted from the local cache file.
# Globals:   RESOLUTION_CACHE (read)  - tab-separated "resolution<TAB>count<TAB>..."
#            RESOLUTION_LOCK  (read)  - lock file handed to flock
# Outputs:   "resolution count" (space-separated) on stdout, or nothing when
#            the cache is missing, unreadable, or malformed.
# Returns:   always 0
read_cached_resolution() {
  [ -f "$RESOLUTION_CACHE" ] || return 0
  local line
  if command -v flock >/dev/null 2>&1; then
    # Exec form rather than `-c "cat '...'"`: the path travels as an argument,
    # so a quote in it cannot break (or inject into) a shell command string.
    line=$(flock -s -w 1 "$RESOLUTION_LOCK" cat "$RESOLUTION_CACHE" 2>/dev/null || echo "")
  else
    # flock is not installed everywhere (e.g. stock macOS). The cache is
    # replaced by renaming a finished temp file over it, so an unlocked read
    # still sees a complete file; without this branch the cache would be
    # silently ignored on such hosts.
    line=$(cat "$RESOLUTION_CACHE" 2>/dev/null || echo "")
  fi
  [ -z "$line" ] && return 0
  local res count
  res=$(printf '%s\n' "$line" | awk -F'\t' '{print $1}')
  count=$(printf '%s\n' "$line" | awk -F'\t' '{print $2}')
  # Sanity: count must be a plain non-negative integer.
  case "$count" in
    ''|*[!0-9]*) return 0 ;;
  esac
  # Callers split the output on whitespace; an empty resolution would shift
  # the count into the resolution slot, so treat it as a malformed cache.
  [ -n "$res" ] || return 0
  echo "$res $count"
}

# Persist the resolution just emitted to the local cache.
# Arguments: $1 - resolution name
#            $2 - consecutive count for that resolution
# Globals:   RESOLUTION_CACHE (written), RESOLUTION_LOCK (lock file for flock)
# The record "resolution<TAB>count<TAB>timestamp" is written to a per-process
# temp file first and then renamed over the cache, so the cache file is never
# left half-written. Best effort: if the temp file cannot be created the
# function returns quietly without touching the cache.
write_cached_resolution() {
  local res="$1" count="$2"
  local tmp="$RESOLUTION_CACHE.tmp.$$"
  printf '%s\t%s\t%s\n' "$res" "$count" "$(date -Iseconds)" 2>/dev/null > "$tmp" || return 0
  if command -v flock >/dev/null 2>&1; then
    # Exec form rather than `-c "mv '...' '...'"`: paths are passed as
    # arguments, so a quote in them cannot break the command string.
    flock -x -w 1 "$RESOLUTION_LOCK" mv "$tmp" "$RESOLUTION_CACHE" 2>/dev/null || rm -f "$tmp"
  else
    # flock is not installed everywhere (e.g. stock macOS); fall back to the
    # plain rename so the cache still works there instead of never being
    # written.
    mv "$tmp" "$RESOLUTION_CACHE" 2>/dev/null || rm -f "$tmp"
  fi
}

# Emit a resolution_set event and record it in the local cache and the log.
# Arguments: $1 - resolution to emit (callers pass fixed literals such as
#                 done / needs_input / api_error / unclear / stuck; the value
#                 is placed into the JSON body unescaped)
#            $2 - optional runtime snapshot JSON, used to bootstrap the
#                 previous resolution/count when no local cache exists
# Globals:   AGENT_ID, PAN_DASHBOARD_URL, LOG_DIR (read)
# The previous resolution/count come from the local cache when present (the
# dashboard snapshot is unreliable under load). Otherwise they come from $2,
# and only if that is empty from a dashboard request. The count increments
# when the resolution repeats and restarts at 1 when it changes.
# pan_emit_event is defined outside this file section (see pan-hook-lib.sh).
emit_resolution() {
  local resolution="$1"
  local bootstrap_snapshot="${2:-}"
  local prev_resolution="" prev_count=0 prev_source=""
  local cached
  cached=$(read_cached_resolution)
  if [ -n "$cached" ]; then
    prev_resolution=$(printf '%s\n' "$cached" | awk '{print $1}')
    prev_count=$(printf '%s\n' "$cached" | awk '{print $2}')
    prev_source="cache"
  else
    # No local cache — bootstrap from structured reply snapshot when available,
    # otherwise fall back to the dashboard snapshot.
    local snap="$bootstrap_snapshot"
    if [ -z "$snap" ]; then
      snap=$(curl -s -m 0.5 "$PAN_DASHBOARD_URL/api/agents/$AGENT_ID/runtime" 2>/dev/null || echo "")
      prev_source="dashboard"
    else
      prev_source="structured_snapshot"
    fi
    if [ -n "$snap" ] && command -v jq >/dev/null 2>&1; then
      prev_resolution=$(printf '%s\n' "$snap" | jq -r '.snapshot.resolution // ""' 2>/dev/null || echo "")
      prev_count=$(printf '%s\n' "$snap" | jq -r '.snapshot.resolutionCount // 0' 2>/dev/null || echo "0")
    else
      prev_source="none"
    fi
  fi

  # The count comes from a file or from remote JSON. Anything that is not a
  # plain non-negative integer (e.g. "1.5", a string) would make the
  # arithmetic expansion below fail, and bash then abandons the whole
  # top-level command that called this function. Treat such values as 0.
  case "$prev_count" in
    ''|*[!0-9]*) prev_count=0 ;;
  esac

  local new_count
  if [ "$prev_resolution" = "$resolution" ]; then
    # 10# forces base 10 so a value like "08" is not parsed as invalid octal.
    new_count=$(( 10#$prev_count + 1 ))
  else
    new_count=1
  fi

  local body="{\"kind\":\"resolution_set\",\"resolution\":\"$resolution\",\"resolutionCount\":$new_count}"
  pan_emit_event "$AGENT_ID" "$body"

  write_cached_resolution "$resolution" "$new_count"

  echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID resolution=$resolution count=$new_count (prev=$prev_resolution/$prev_count src=$prev_source)" \
    >> "$LOG_DIR/hooks.log" 2>/dev/null || true
}

# ── Phase 1: evidence-based completion detection ─────────────────────────

# Print the "workspace" value recorded in an agent state.json.
# Arguments: $1 - path to state.json
# Outputs:   the workspace path on stdout; an empty line when the file is
#            unreadable, not valid JSON, or has no (or a null) workspace.
# The path is handed to Python as argv rather than spliced into the program
# text, so a quote or backslash in it cannot break the one-liner.
workspace_from_state() {
  python3 -c 'import json, sys; d = json.load(open(sys.argv[1])); print(d.get("workspace") or "")' "$1" 2>/dev/null || echo ""
}

WORKSPACE=""
STATE_FILE="$AGENT_STATE_DIR/state.json"
if [ -f "$STATE_FILE" ]; then
  WORKSPACE=$(workspace_from_state "$STATE_FILE")
fi

STATE_MD_COMPLETE=false
BRANCH_PUSHED=false

# Evidence 1: the workspace's planning STATE.md declares the work complete,
# either through a Status heading/line or a completion phrase (matched
# case-insensitively).
if [ -n "$WORKSPACE" ] && [ -f "$WORKSPACE/.planning/STATE.md" ]; then
  if grep -qiE '(^##?\s*(Status|Current Status).*:.*\b(COMPLETE|DONE|FINISHED)\b|^Status:\s*(COMPLETE|DONE|FINISHED)|implementation[[:space:]]+complete|all[[:space:]]+tasks[[:space:]]+complete)' "$WORKSPACE/.planning/STATE.md" 2>/dev/null; then
    STATE_MD_COMPLETE=true
  fi
fi

# Evidence 2: the current branch has no commits that are missing from its
# origin/<branch> ref. When that ref does not exist, git log fails and the
# "no-remote" sentinel keeps BRANCH_PUSHED false.
if [ -n "$WORKSPACE" ] && git -C "$WORKSPACE" rev-parse --git-dir >/dev/null 2>&1; then
  BRANCH=$(git -C "$WORKSPACE" branch --show-current 2>/dev/null || echo "")
  if [ -n "$BRANCH" ]; then
    UNPUSHED=$(git -C "$WORKSPACE" log "origin/$BRANCH..HEAD" --oneline 2>/dev/null || echo "no-remote")
    [ "$UNPUSHED" = "" ] && BRANCH_PUSHED=true
  fi
fi

# Type a message into the agent's tmux target and press Enter.
# Arguments: $1 - message text (may span several lines)
# Globals:   AGENT_ID (read) - used as the tmux target
# Returns:   0 once the paste/Enter have been attempted; 1 if the message
#            could not be staged (no temp file, or tmux refused the buffer),
#            in which case nothing is sent to the agent.
nudge_agent() {
  local msg="$1"
  # A buffer named after this agent instead of tmux's shared default buffer:
  # hooks for different agents can run at the same time on one tmux server,
  # and with the default buffer one hook could paste another hook's message.
  local buffer="pan-nudge-$AGENT_ID"
  local tmpfile
  tmpfile=$(mktemp) || return 1
  printf '%s\n' "$msg" > "$tmpfile"
  if ! tmux load-buffer -b "$buffer" "$tmpfile" 2>/dev/null; then
    # Without a freshly loaded buffer, pasting would inject whatever stale
    # content happens to be in tmux; send nothing instead.
    rm -f "$tmpfile"
    return 1
  fi
  rm -f "$tmpfile"
  # -d removes the buffer once it has been pasted.
  tmux paste-buffer -d -b "$buffer" -t "$AGENT_ID" 2>/dev/null
  # Short pause between the paste and the Enter keypress.
  sleep 0.3
  tmux send-keys -t "$AGENT_ID" C-m 2>/dev/null
  return 0
}

# Act on the Phase 1 evidence. Only a STATE.md that declares completion is
# conclusive here; the branch state then picks the outcome.
if [ "$STATE_MD_COMPLETE" = "true" ]; then
  case "$BRANCH_PUSHED" in
    true)
      # Complete and pushed: record "done", start the nudge cooldown, and
      # tell the agent to run the completion command.
      emit_resolution "done"
      date +%s > "$NUDGE_FILE" 2>/dev/null || true
      nudge_agent "Your STATE.md indicates work is complete and your branch is pushed. You MUST run this command now:

pan work done $ISSUE_ID -c \"Implementation complete\"

If you still have remaining tasks, continue working on them. Do NOT stop until all work is done AND you have called pan work done."
      exit 0
      ;;
    false)
      # Complete on paper but not pushed yet: log it and leave the agent be.
      echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID still_working (STATE.md=complete, branch=not-pushed)" \
        >> "$LOG_DIR/hooks.log" 2>/dev/null || true
      exit 0
      ;;
  esac
fi

# Resolve the stop from a structured channel reply, when the runtime snapshot
# carries one.
# Globals:   PAN_DASHBOARD_URL, AGENT_ID, ISSUE_ID, LOG_DIR, NUDGE_FILE (read)
# Returns:   0 when the reply kind was "done" or "needs_input" and has been
#            handled; 1 when jq is unavailable, the snapshot could not be
#            fetched, or it holds no reply of a handled kind.
handle_structured_channel_reply() {
  if ! command -v jq >/dev/null 2>&1; then
    return 1
  fi

  local runtime_json
  runtime_json=$(curl -s -m 0.5 "$PAN_DASHBOARD_URL/api/agents/$AGENT_ID/runtime" 2>/dev/null || echo "")
  if [ -z "$runtime_json" ]; then
    return 1
  fi

  local kind summary log_summary
  kind=$(echo "$runtime_json" | jq -r '.snapshot.channelReply.kind // ""' 2>/dev/null || echo "")
  if [ -z "$kind" ]; then
    return 1
  fi
  summary=$(echo "$runtime_json" | jq -r '.snapshot.channelReply.summary // ""' 2>/dev/null || echo "")
  log_summary=$(normalize_log_text "$summary")

  # Any other reply kind is left for the later phases to judge.
  if [ "$kind" != "done" ] && [ "$kind" != "needs_input" ]; then
    return 1
  fi

  echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID using structured channel_reply kind=$kind summary=$log_summary" \
    >> "$LOG_DIR/hooks.log" 2>/dev/null || true
  # The fetched snapshot doubles as the bootstrap source for the previous
  # resolution/count.
  emit_resolution "$kind" "$runtime_json"

  if [ "$kind" = "done" ]; then
    # Reported completion still requires the explicit command: start the
    # nudge cooldown and remind the agent.
    date +%s > "$NUDGE_FILE" 2>/dev/null || true
    nudge_agent "You reported completion via channel_reply. If your implementation is complete, you MUST run this command now:

pan work done $ISSUE_ID -c \"Implementation complete\"

If you still have remaining tasks, continue working on them. Do NOT stop until all work is done AND you have called pan work done."
  fi
  return 0
}

# A handled structured reply settles this stop; skip the remaining phases.
handle_structured_channel_reply && exit 0

# ── Phase 2: pane heuristic before the LLM call ──────────────────────────
# 200 lines of pane history are captured so the LLM later has enough context
# (a brief summary near the end, next to earlier tool output, is often what
# disambiguates). The hook only proceeds when one of the last 20 lines looks
# like an idle prompt: the `❯` prompt itself, Claude Code's "Worked for", or
# the Sautéed/Cooking/Thinking markers of non-Claude runtimes routed through
# Claude Code, which share that prompt.
OUTPUT=$(tmux capture-pane -t "$AGENT_ID" -p -S -200 2>/dev/null || echo "")
if [ -z "$OUTPUT" ]; then
  exit 0
fi
if ! echo "$OUTPUT" | tail -20 | grep -qE '(^❯|Worked for|Sautéed|Cooking|Thinking for)'; then
  exit 0
fi

# ── Phase 2.5: API error detection (auto-retry) ─────────────────────────
# If the agent stopped because of a transient API error, nudge it to retry
# instead of burning an LLM call on completion analysis. The agent was doing
# work and got cut off by infrastructure — it should just continue.
API_ERROR_PATTERNS=(
  "API Error: The server had an error while processing your request"
  "API Error: Overloaded"
  "API Error: Rate limit"
  "API Error: Request was aborted"
  "API Error: Timed out"
  "529 Overloaded"
  "502 Bad Gateway"
  "503 Service Unavailable"
)
# How many trailing pane lines are searched for the patterns above. An error
# that caused *this* stop sits near the end of the pane; matching the whole
# capture would let an old error the agent already recovered from (or a
# "502 Bad Gateway" in the agent's own earlier output) keep classifying every
# later stop as api_error and keep Phase 3 from ever running.
API_ERROR_TAIL_LINES=40

# Report whether the tail of a pane capture contains a known API error.
# Arguments: $1 - captured pane text
# Globals:   API_ERROR_PATTERNS, API_ERROR_TAIL_LINES (read)
# Returns:   0 if any pattern occurs (fixed-string match) within the last
#            API_ERROR_TAIL_LINES lines, 1 otherwise
pane_tail_has_api_error() {
  local pane_tail pattern
  pane_tail=$(printf '%s\n' "$1" | tail -n "$API_ERROR_TAIL_LINES")
  for pattern in "${API_ERROR_PATTERNS[@]}"; do
    if printf '%s\n' "$pane_tail" | grep -qF -- "$pattern"; then
      return 0
    fi
  done
  return 1
}

API_ERROR_DETECTED=false
if pane_tail_has_api_error "$OUTPUT"; then
  API_ERROR_DETECTED=true
fi

if [ "$API_ERROR_DETECTED" = "true" ]; then
  # Cooldown: don't retry more than once per 5 minutes for API errors.
  API_ERROR_NUDGE_FILE="$AGENT_STATE_DIR/.last-api-error-nudge"
  if [ -f "$API_ERROR_NUDGE_FILE" ]; then
    LAST_API_NUDGE=$(cat "$API_ERROR_NUDGE_FILE" 2>/dev/null || echo "0")
    NOW=$(date +%s)
    if [ $(( NOW - LAST_API_NUDGE )) -lt 300 ]; then
      echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID api_error detected but cooldown active" \
        >> "$LOG_DIR/hooks.log" 2>/dev/null || true
      exit 0
    fi
  fi

  emit_resolution "api_error"
  date +%s > "$API_ERROR_NUDGE_FILE" 2>/dev/null || true

  echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID api_error detected — nudging retry" \
    >> "$LOG_DIR/hooks.log" 2>/dev/null || true

  nudge_agent "You stopped due to a transient API error. This is a temporary server issue, not a problem with your code. Continue your work from where you left off. Do NOT start over — pick up exactly where you stopped."
  exit 0
fi

# ── Phase 3: LLM fallback ────────────────────────────────────────────────

# Print the completion-check-hook model named in a Panopticon config file.
# Arguments: $1 - path to config.yaml
# Outputs:   the first non-empty value found for a "completion-check-hook:"
#            key, with surrounding quotes removed; nothing when the file is
#            missing or has no such key.
# This is a line scan, not a YAML parser. Commented-out lines are skipped,
# only the first match is used (so the result is always a single line), and
# anything after the value on the same line, such as a trailing comment, is
# ignored.
config_completion_check_model() {
  [ -f "$1" ] || return 0
  awk '
    /^[ \t]*#/ { next }
    /completion-check-hook:/ {
      sub(/^.*completion-check-hook:[ \t]*/, "")
      n = split($0, parts)
      if (n > 0 && parts[1] != "") { print parts[1]; exit }
    }
  ' "$1" 2>/dev/null | tr -d "\"'"
}

# Precedence: environment variable, then config file, then built-in default.
COMPLETION_MODEL="${PANOPTICON_COMPLETION_CHECK_MODEL:-}"
if [ -z "$COMPLETION_MODEL" ]; then
  CONFIG_FILE="$HOME/.panopticon/config.yaml"
  COMPLETION_MODEL=$(config_completion_check_model "$CONFIG_FILE")
fi
COMPLETION_MODEL="${COMPLETION_MODEL:-claude-haiku-4-5}"

# Where stderr from the claude CLI is appended, kept apart from hooks.log.
CLAUDE_STDERR="$LOG_DIR/work-agent-stop-hook.claude-stderr.log"

# Prompt for the completion-check model: the captured pane output plus the
# decision rules, answered with exactly one of four verdict words.
ANALYSIS_PROMPT="You are analyzing a work agent's terminal output to determine whether it finished its implementation but failed to call 'pan work done' as its final action.

The agent was working on issue $ISSUE_ID. The contract is: once every task is complete, the agent MUST run \`pan work done <ISSUE_ID>\` as a Bash command. Finishing work without running that command is a failure — the pipeline stalls.

Here is the last 200 lines of its terminal output:

<terminal_output>
$OUTPUT
</terminal_output>

Decide FORGOT_COMPLETION if ANY of these are true AND the transcript does NOT show a 'pan work done' Bash invocation in the tail:
- the agent wrote a summary/wrap-up message ('all tests pass', 'implementation complete', 'all beads closed', 'ready for review', etc.) and then stopped
- the agent closed beads or ran 'bd close' and finished with a prompt
- the agent committed + pushed and the last message is a status summary instead of a pan-work-done command

Decide STILL_WORKING only if the tail clearly shows an in-progress task (mid-edit, running tests, reading files) with no wrap-up summary.
Decide STOPPED_FOR_INPUT only if the agent explicitly asked the human a question or flagged a blocker.
Use UNCLEAR only as a last resort when the tail is genuinely ambiguous (e.g. no summary and no active work).

Respond with EXACTLY one word, nothing else: FORGOT_COMPLETION | STILL_WORKING | STOPPED_FOR_INPUT | UNCLEAR"

# Feed the prompt on stdin; the exit status is captured right away because
# the log line below reports it.
RESULT=$(claude -p --model "$COMPLETION_MODEL" 2>>"$CLAUDE_STDERR" <<<"$ANALYSIS_PROMPT")
CLAUDE_RC=$?
# A failed or silent call is treated like an UNCLEAR verdict.
if [ -z "$RESULT" ] || [ "$CLAUDE_RC" -ne 0 ]; then
  echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID claude -p failed rc=$CLAUDE_RC (see $CLAUDE_STDERR), fallback to UNCLEAR" \
    >> "$LOG_DIR/hooks.log" 2>/dev/null || true
  RESULT="UNCLEAR"
fi
# Drop all whitespace and cap the length so the verdict is a single token.
VERDICT=$(echo "$RESULT" | tr -d '[:space:]' | head -c 30)

echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID ($ISSUE_ID) -> $VERDICT (model: $COMPLETION_MODEL, rc=$CLAUDE_RC)" \
  >> "$LOG_DIR/hooks.log" 2>/dev/null || true

# Act on the verdict. Values without an arm here (STILL_WORKING, or anything
# unexpected) emit nothing.
case "$VERDICT" in
  FORGOT_COMPLETION)
    emit_resolution "done"
    date +%s > "$NUDGE_FILE" 2>/dev/null || true
    nudge_agent "You stopped without calling pan work done. If your implementation is complete, you MUST run this command now:

pan work done $ISSUE_ID -c \"Implementation complete\"

If you still have remaining tasks, continue working on them. Do NOT stop until all work is done AND you have called pan work done."
    ;;
  STOPPED_FOR_INPUT)
    emit_resolution "needs_input"
    ;;
  UNCLEAR)
    # A second consecutive UNCLEAR escalates to stuck: two looks at this
    # agent's tail without being able to tell what it is doing is itself a
    # signal. The decision reads the local resolution cache rather than the
    # dashboard, so it stays deterministic when the dashboard is unreachable.
    escalate_to_stuck=false
    esc_prev_resolution=""
    esc_prev_count=0
    esc_cached=$(read_cached_resolution)
    if [ -n "$esc_cached" ]; then
      read -r esc_prev_resolution esc_prev_count _ <<<"$esc_cached"
      case "$esc_prev_resolution" in
        stuck)
          # Once stuck, stay stuck on further UNCLEAR verdicts instead of
          # ratcheting back to unclear — deacon's poke logic counts
          # consecutive stuck heartbeats.
          escalate_to_stuck=true
          ;;
        unclear)
          [ "$esc_prev_count" -ge 1 ] && escalate_to_stuck=true
          ;;
      esac
    fi
    echo "[$(date -Iseconds)] work-agent-stop-hook: $AGENT_ID UNCLEAR gate prev=$esc_prev_resolution/$esc_prev_count escalate=$escalate_to_stuck" \
      >> "$LOG_DIR/hooks.log" 2>/dev/null || true
    case "$escalate_to_stuck" in
      true) emit_resolution "stuck" ;;
      *) emit_resolution "unclear" ;;
    esac
    ;;
esac

exit 0
