#!/usr/bin/env bash
# activation-scan — probe reachability of registered ARQERA providers.
# Habitat-portable; runs on Mac, Homebase, DGX, GH Actions, customer Mac.
#
# Per operator direction 2026-05-18: "ARQERA already possesses far more
# capability than it operationally activates." This script metabolizes
# part of that capacity by:
#   - probing each registered provider's free/public endpoint
#   - classifying each as reachable / needs-credential / unreachable
#   - emitting JSON for substrate consumption
#
# Probes are READ-ONLY GETs. No paid calls. No credentials sent. If a
# provider requires auth to probe further, classification is
# "needs-credential" — that signals a capability the substrate can
# unlock by adding (governed) credentials, NOT a dead capability.
#
# Usage:
#   scripts/activation-scan              # human-readable table
#   scripts/activation-scan --json       # JSON output for substrate emit
#   scripts/activation-scan --emit       # emit arq://body/activation_map/v1 to substrate (requires twin)

set -euo pipefail

# Output mode defaults: human-readable table, nothing emitted to substrate.
FORMAT="human"
EMIT="false"

# Consume flags left to right. --emit implies JSON output; any unrecognised
# argument aborts with exit status 2.
while (( $# > 0 )); do
  case "$1" in
    --json)
      FORMAT="json"
      ;;
    --emit)
      EMIT="true"
      FORMAT="json"
      ;;
    -h|--help)
      # The usage text is the header comment of this file (lines 2-20).
      sed -n '2,20p' "$0"
      exit 0
      ;;
    *)
      echo "unknown arg: $1" >&2
      exit 2
      ;;
  esac
  shift
done

# The probes shell out to curl and the timing/JSON steps to python3, so stop
# with exit status 2 as soon as either one cannot be found on PATH.
for required in curl python3; do
  command -v "$required" >/dev/null 2>&1 && continue
  echo "error: $required not on PATH" >&2
  exit 2
done

# Provider registry — minimal v1 set, tier-tagged. One pipe-delimited row
# per provider:
#   provider | tier | url | classification_on_2xx | notes
#
# NOTE(review): probe_one reads the classification_on_2xx column but derives
# the reported class from the HTTP status alone, so today the column is
# informational only.
#
# tier semantics (matches arq://doc/protocol/capability-declaration-contract-v1):
#   0 = self-hosted / free-substrate (Modal Tier-0, Homebase vLLM)
#   0.5 = vendor-free (GH Actions free min, Cloudflare workers free)
#   1 = budget paid (Cerebras, Groq, DeepSeek)
#   2 = mid paid (OpenAI, Mistral)
#   3 = premium paid (Anthropic Claude Opus, Gemini Ultra)
#
# Probe URLs target free/public endpoints. When an endpoint demands auth even
# for a reachability check, the 401/403 it returns is what classifies the row
# as needs-credential.
#
# The quoted heredoc delimiter keeps every row literal (no expansion), and
# command substitution drops the trailing newline.
PROVIDERS=$(cat <<'EOF'
modal-tier0|0|https://gashiru--arqera-inference-api.modal.run/|reachable|self-hosted Qwen3 endpoint
addressing-service|0.5|https://addressing.arqera.io/|reachable|ARQERA substrate addressing
github-actions|0.5|https://api.github.com/zen|reachable|free runner habitat
cloudflare-workers|0.5|https://workers.cloudflare.com/|reachable|edge runtime free tier
huggingface|0.5|https://huggingface.co/api/whoami-v2|needs-credential|HF inference; auth probe
groq|1|https://api.groq.com/openai/v1/models|needs-credential|Groq inference; OpenAI-compat
cerebras|1|https://api.cerebras.ai/v1/models|needs-credential|Cerebras inference; OpenAI-compat
deepseek|1|https://api.deepseek.com/v1/models|needs-credential|DeepSeek inference; OpenAI-compat
openai|2|https://api.openai.com/v1/models|needs-credential|OpenAI inference
mistral|2|https://api.mistral.ai/v1/models|needs-credential|Mistral inference
anthropic|3|https://api.anthropic.com/v1/messages|needs-credential|Anthropic inference; POST not GET
gemini|3|https://generativelanguage.googleapis.com/v1beta/models|needs-credential|Gemini inference
lambda-labs|1|https://api.lambdalabs.com/api/v1/instances|needs-credential|Lambda Labs inference
nango|0.5|https://api.nango.dev/|needs-credential|Nango integration proxy
sentry|0.5|https://sentry.io/|reachable|Sentry web (auth not required for landing)
figma|0.5|https://api.figma.com/v1/me|needs-credential|Figma API
github|0.5|https://api.github.com/zen|reachable|GitHub API anonymous
modal-platform|0.5|https://modal.com/|reachable|Modal platform landing
EOF
)

#######################################
# Probe one registry row with an unauthenticated GET and classify the result.
# Arguments: $1 - registry row "provider|tier|url|default_class|notes"
# Outputs:   one line on stdout:
#            "provider|tier|url|http_code|ms|class|notes"
#            where class is reachable, needs-credential or unreachable.
# Returns:   0; a failed probe is reported as http_code 000 / unreachable.
#######################################
probe_one() {
  local row="$1"
  # Declared local so the fields filled in by `read` do not leak into the
  # caller's scope.
  local provider tier url default_class notes
  local t0 ms code class
  # notes is the last name, so it keeps any further '|' characters intact.
  IFS='|' read -r provider tier url default_class notes <<< "$row"
  # NOTE(review): default_class is parsed for row alignment only; the class
  # below is derived purely from the HTTP status. Confirm whether 2xx
  # responses were meant to take the registry's class instead.

  t0=$(python3 -c 'import time;print(time.time())')
  # Any curl failure (DNS, connect, timeout, even after a status line was
  # received) is recorded as 000. Assigning the fallback explicitly avoids
  # appending a second "000" to whatever curl already printed via -w.
  code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 "$url" 2>/dev/null) || code="000"
  # Elapsed wall-clock milliseconds since t0. t0 is handed over as an
  # argument rather than spliced into the Python source. The figure includes
  # interpreter start-up, so treat it as approximate.
  ms=$(python3 -c 'import sys,time;print(int((time.time()-float(sys.argv[1]))*1000))' "$t0")

  # Anything that is not exactly three digits is treated as "no response".
  if ! [[ "$code" =~ ^[0-9]{3}$ ]]; then code="000"; fi
  case "$code" in
    000)     class="unreachable" ;;      # DNS/connect failure
    5*)      class="unreachable" ;;      # server error
    401|403) class="needs-credential" ;; # endpoint up, auth required
    404)     class="reachable" ;;        # endpoint exists; probe path doesn't match
    405)     class="reachable" ;;        # method-not-allowed = endpoint up, wrong verb
    2*|3*)   class="reachable" ;;
    *)       class="reachable" ;;        # 4xx other (rate limit, payload-required, etc.) = endpoint up
  esac
  echo "$provider|$tier|$url|$code|$ms|$class|$notes"
}

# Probe every non-blank registry row in order, collecting one
# newline-terminated result row per provider.
RESULTS=""
while IFS= read -r registry_line; do
  if [[ -n "$registry_line" ]]; then
    RESULTS="${RESULTS}$(probe_one "$registry_line")"$'\n'
  fi
done <<< "$PROVIDERS"

# `python3 -` takes its program from stdin, and the heredoc below occupies
# stdin, so the probe rows cannot also be piped in. They travel through the
# RAW_RESULTS environment variable instead.
PROBED_JSON=$(RAW_RESULTS="$RESULTS" python3 - <<'PY'
import datetime
import json
import os

# Column order of one probe row; also the key order of each JSON probe.
FIELDS = ("provider", "tier", "url", "http", "ms", "classification", "notes")
# Classification label in a row -> counter name inside "totals".
COUNTER_FOR = {
    "reachable": "reachable",
    "needs-credential": "needs_credential",
    "unreachable": "unreachable",
}

probes = []
totals = {"total": 0, "reachable": 0, "unreachable": 0, "needs_credential": 0}

for record in os.environ.get("RAW_RESULTS", "").strip().splitlines():
    # maxsplit=6 keeps any further pipe characters inside the notes column.
    columns = record.split("|", 6)
    if len(columns) != len(FIELDS):
        continue  # blank or malformed row
    probe = dict(zip(FIELDS, columns))
    probe["ms"] = int(probe["ms"])
    probes.append(probe)
    totals["total"] += 1
    counter = COUNTER_FOR.get(probe["classification"])
    if counter is not None:
        totals[counter] += 1

# Sentry HIGH on #3913: datetime.timezone.utc (Python 3.7+) is used instead
# of datetime.UTC (3.11+) because GH Actions runners may be older.
scanned_at = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

report = {
    "schema_version": 1,
    "scanned_at": scanned_at,
    "probes": probes,
    "totals": totals,
}
print(json.dumps(report, indent=2))
PY
)

# --emit: push the scan to the substrate through the twin CLI before printing.
if [[ "$EMIT" == "true" ]]; then
  command -v twin >/dev/null 2>&1 || {
    echo "error: --emit needs twin CLI on PATH" >&2
    exit 2
  }
  # Re-serialise without indentation so the payload is a single line.
  PAYLOAD=$(python3 -c "import json,sys;print(json.dumps(json.load(sys.stdin)))" <<< "$PROBED_JSON")
  twin --use-keychain act emit body activation_map v1 \
    --payload "$PAYLOAD" \
    --source twin-activation-scan \
    --sync
fi

# Both --emit and --json finish by printing the pretty JSON and exiting 0.
if [[ "$EMIT" == "true" || "$FORMAT" == "json" ]]; then
  echo "$PROBED_JSON"
  exit 0
fi

# Human format

# Print one counter ($1) from the "totals" object of the scan JSON.
totals_field() {
  echo "$PROBED_JSON" \
    | python3 -c 'import json,sys;print(json.load(sys.stdin)["totals"][sys.argv[1]])' "$1"
}

printf '%s\n' "=== ARQERA provider activation scan ===" ""
TOTAL=$(totals_field total)
REACHABLE=$(totals_field reachable)
NEEDSCRED=$(totals_field needs_credential)
UNREACH=$(totals_field unreachable)
printf '  reachable:        %s / %s\n' "$REACHABLE" "$TOTAL"
printf '  needs-credential: %s\n' "$NEEDSCRED"
printf '  unreachable:      %s\n' "$UNREACH"
printf '\n'

# Header row, then one row per probe using the same column widths.
printf "  %-25s %-5s %-6s %-8s %s\n" "provider" "tier" "http" "ms" "classification"
echo "$PROBED_JSON" | python3 -c '
import json, sys

for probe in json.load(sys.stdin)["probes"]:
    print("  {provider:<25} {tier:<5} {http:<6} {ms:<8} {classification}".format(**probe))
'
