#!/usr/bin/env bash
# check-substrate-survivability — habitat-portable substrate health probe.
#
# Same probes as .github/workflows/substrate-survivability-probe.yml.
# Runnable on any habitat (Mac, Homebase, DGX, GH Actions, a customer
# Mac). Proves substrate addressing-service + staging + Modal are
# reachable from this habitat's network position.
#
# Per arq://doc/plan/decentralization-execution-wave-1: a substrate-
# survivability probe that ONLY runs on Mac is itself centralization.
# The same probe must be runnable from anywhere.
#
# Usage:
#   scripts/check-substrate-survivability                  # default probes
#   scripts/check-substrate-survivability --json           # JSON output
#   scripts/check-substrate-survivability --include-modal  # include Modal probe (off by default for cost)
#
# Exit codes:
#   0 — all critical probes PASS
#   1 — at least one critical probe FAIL
#   2 — usage or environment issue (unknown argument; missing curl/python3)

# Strict mode: abort on command failure, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -euo pipefail

# Command-line parsing. FORMAT selects the report style ("human" or
# "json"); INCLUDE_MODAL opts in to the Modal probe.
FORMAT="human"
INCLUDE_MODAL="false"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --json) FORMAT="json"; shift ;;
    --include-modal) INCLUDE_MODAL="true"; shift ;;
    # Print the header comment (file lines 2-21) as usage text. The range
    # ends at 21 so the complete exit-code table is shown; keep it in sync
    # with the header if lines are added or removed there.
    --help|-h) sed -n '2,21p' "$0"; exit 0 ;;
    # Anything else is a usage error.
    *) echo "unknown arg: $1" >&2; exit 2 ;;
  esac
done

# Preflight: the probes shell out to curl and python3. If either one cannot
# be resolved on PATH, name the first one missing on stderr and exit 2.
for required in curl python3; do
  command -v "$required" >/dev/null 2>&1 && continue
  echo "error: $required not on PATH" >&2
  exit 2
done

# Base URLs of the probed services. Each can be overridden from the
# environment (TWIN_ADDRESSING_BASE, ARQERA_STAGING_URL). One trailing
# slash is dropped so that the "${BASE}/path" joins used when building
# probe URLs do not produce "//" when an override ends in "/".
ADDR_BASE="${TWIN_ADDRESSING_BASE:-https://addressing.arqera.io}"
ADDR_BASE="${ADDR_BASE%/}"
STG_BASE="${ARQERA_STAGING_URL:-https://staging.arqera.io}"
STG_BASE="${STG_BASE%/}"

#######################################
# Probe one URL with curl and report its HTTP status and elapsed time.
# Arguments:
#   $1 - label, echoed back as the first output field
#   $2 - URL to request
#   $3 - curl --max-time value in seconds (default 10)
# Outputs:
#   One line on stdout: "<label>|<http_code>|<elapsed_ms>".
#   <http_code> is "000" whenever curl exits non-zero (for example DNS
#   failure, refused connection, or timeout). <elapsed_ms> is wall-clock
#   time between two python3 timestamps taken around the curl call.
#######################################
probe_http() {
  local label="$1" url="$2" max="${3:-10}"
  local t0 t1 ms code
  t0=$(python3 -c 'import time;print(time.time())')
  # The response body is never read, so it is discarded instead of being
  # written to a predictable file under /tmp.
  # On failure the code is ASSIGNED rather than echoed inside the
  # substitution: curl has already written its own -w output by then, and
  # an extra `echo 000` would be appended to it ("000000", or "200000"
  # when a transfer times out after the status line arrived).
  if ! code=$(curl -s -o /dev/null -w '%{http_code}' --max-time "$max" "$url" 2>/dev/null); then
    code="000"
  fi
  t1=$(python3 -c 'import time;print(time.time())')
  ms=$(python3 -c "print(int(($t1-$t0)*1000))")
  printf '%s|%s|%s\n' "$label" "$code" "$ms"
}

# Addressing service and staging. The addressing service has no `/_health`
# endpoint (it returns 404), so its liveness is shown two ways: a request
# to the service root (H1) and a read of a known body address (H3).
addr_root_url="${ADDR_BASE}/"
stg_health_url="${STG_BASE}/api/health"
H1=$(probe_http "addressing-root" "$addr_root_url" 10)
H2=$(probe_http "staging-api-health" "$stg_health_url" 15)

# Percent-encode the whole arq:// address (safe='' also encodes "/") before
# placing it in the request path.
ENC=$(python3 -c "import urllib.parse;print(urllib.parse.quote('arq://body/sovereignty_metrics/aggregate-current',safe=''))")
body_read_url="${ADDR_BASE}/address/${ENC}?full=1"
H3=$(probe_http "substrate-body-read" "$body_read_url" 10)

# Modal is probed only when requested; otherwise a placeholder record with
# the code "skipped" stands in for it.
H4="modal-endpoint|skipped|0"
if [[ "$INCLUDE_MODAL" == "true" ]]; then
  H4=$(probe_http "modal-endpoint" "https://gashiru--arqera-inference-api.modal.run/" 10)
fi

# Habitat identification.
#
# PEER_FP is taken from the first line of `twin --use-keychain status` that
# starts with "fingerprint" (leading whitespace allowed): the text after
# the last ':' with all whitespace removed. It is "unknown" when twin is
# unavailable or prints no such line.
#
# The status text is captured first and the fallback is applied afterwards.
# A `|| echo unknown` inside the same command substitution does not replace
# the output, it appends to it: under pipefail a non-zero exit from twin
# would turn an already-printed fingerprint into "<fingerprint>unknown".
# [[:space:]] is used instead of \s, which is not a POSIX ERE escape.
TWIN_STATUS="$(twin --use-keychain status 2>/dev/null || true)"
PEER_FP="$(
  printf '%s\n' "$TWIN_STATUS" |
    grep -E '^[[:space:]]*fingerprint' |
    head -1 |
    sed -E 's/.*:[[:space:]]*//' |
    tr -d '[:space:]' || true
)"
[[ -n "$PEER_FP" ]] || PEER_FP="unknown"
# NOTE: this overwrites bash's own HOSTNAME variable with the short name.
HOSTNAME="$(hostname -s 2>/dev/null || echo unknown)"
UNAME_STR="$(uname -srm 2>/dev/null || echo unknown)"

# Parse the "label|code|ms" probe records and tally the results.
# Critical probes (addressing-root, substrate-body-read) decide EXIT_CODE;
# staging and Modal are informational and only move the counters.
# Use plain variables — macOS default bash 3.2 lacks associative arrays.
IFS='|' read -r _ ADDR_CODE ADDR_MS <<< "$H1"
IFS='|' read -r _ STG_CODE STG_MS <<< "$H2"
IFS='|' read -r _ SUB_CODE SUB_MS <<< "$H3"
IFS='|' read -r _ MODAL_CODE MODAL_MS <<< "$H4"

EXIT_CODE=0
PASS_PROBES=0
FAIL_PROBES=0

#######################################
# Count one probe result.
# Globals:   PASS_PROBES, FAIL_PROBES, EXIT_CODE (written)
# Arguments: $1 - HTTP status code reported by the probe
#            $2 - accepted leading digits, e.g. "234" for 2xx/3xx/4xx
#            $3 - "true" if a failure must set EXIT_CODE=1
# The code must be exactly three digits, so malformed values such as
# "200000" or an empty string count as failures.
#######################################
tally_probe() {
  local code="$1" classes="$2" critical="$3"
  local pattern="^[${classes}][0-9][0-9]\$"
  if [[ "$code" =~ $pattern ]]; then
    PASS_PROBES=$((PASS_PROBES + 1))
  else
    FAIL_PROBES=$((FAIL_PROBES + 1))
    if [[ "$critical" == "true" ]]; then
      EXIT_CODE=1
    fi
  fi
}

# addressing-root: any 2xx/3xx/4xx (404 on the bare root) proves the
# service is reachable; only 000/5xx = service down.
tally_probe "$ADDR_CODE" 234 true
# substrate-body-read: must be 2xx — proves we can serve a known body.
# A 3xx is not accepted: the probe does not follow redirects, so a
# redirect means no body was actually read.
tally_probe "$SUB_CODE" 2 true
# staging behind auth — 2xx/3xx are OK
tally_probe "$STG_CODE" 23 false
if [[ "$MODAL_CODE" != "skipped" ]]; then
  tally_probe "$MODAL_CODE" 234 false
fi

# JSON report. Every value is handed to Python through the environment of
# that one command (so nothing is interpolated into Python source), and
# json.dumps takes care of string escaping. HTTP codes stay strings
# (MODAL_CODE may be "skipped"); latencies and counters become integers.
if [[ "$FORMAT" == "json" ]]; then
  ADDR_CODE="$ADDR_CODE" ADDR_MS="$ADDR_MS" STG_CODE="$STG_CODE" STG_MS="$STG_MS" \
  SUB_CODE="$SUB_CODE" SUB_MS="$SUB_MS" MODAL_CODE="$MODAL_CODE" MODAL_MS="$MODAL_MS" \
  PEER_FP="$PEER_FP" HOSTNAME_STR="$HOSTNAME" UNAME_STR="$UNAME_STR" \
  PASS_PROBES="$PASS_PROBES" FAIL_PROBES="$FAIL_PROBES" \
  python3 - <<'PY'
import json, os


def as_int(name):
    """Integer value of env var `name`; 0 when unset, empty or non-numeric.

    A bare int() raises ValueError on an empty string, which would discard
    the whole report because of a single missing latency value.
    """
    try:
        return int(os.environ.get(name, "0"))
    except ValueError:
        return 0


print(json.dumps({
  "habitat_peer_fingerprint": os.environ.get("PEER_FP",""),
  "hostname_short":           os.environ.get("HOSTNAME_STR",""),
  "uname":                    os.environ.get("UNAME_STR",""),
  "addressing_health_http":   os.environ.get("ADDR_CODE",""),
  "addressing_health_ms":     as_int("ADDR_MS"),
  "staging_api_health_http":  os.environ.get("STG_CODE",""),
  "staging_api_health_ms":    as_int("STG_MS"),
  "substrate_body_read_http": os.environ.get("SUB_CODE",""),
  "substrate_body_read_ms":   as_int("SUB_MS"),
  "modal_endpoint_http":      os.environ.get("MODAL_CODE",""),
  "modal_endpoint_ms":        as_int("MODAL_MS"),
  "pass_probes":              as_int("PASS_PROBES"),
  "fail_probes":              as_int("FAIL_PROBES"),
}, indent=2))
PY
  # The exit status still reflects the critical-probe verdict.
  exit $EXIT_CODE
fi

# Human-readable report: banner, the first 12 characters of the peer
# fingerprint, one table row per probe, the tally, and the verdict.
echo "=== substrate survivability probe ($HOSTNAME — $UNAME_STR) ==="
echo "  habitat peer fingerprint: ${PEER_FP:0:12}…"
echo ""
# printf re-applies its format to each further group of three arguments,
# so one call prints the header row followed by the four probe rows.
printf "  %-25s %-8s %-10s\n" \
  "probe"               "http"        "latency_ms" \
  "addressing-health"   "$ADDR_CODE"  "$ADDR_MS" \
  "staging-api-health"  "$STG_CODE"   "$STG_MS" \
  "substrate-body-read" "$SUB_CODE"   "$SUB_MS" \
  "modal-endpoint"      "$MODAL_CODE" "$MODAL_MS"
echo ""
echo "  pass: $PASS_PROBES   fail: $FAIL_PROBES"
if [[ $EXIT_CODE -eq 0 ]]; then
  echo "  verdict: PASS (critical probes green)"
else
  echo "  verdict: FAIL (critical probe down)"
fi
exit $EXIT_CODE
