#!/bin/bash
# harness-memd
# Harness memory daemon lifecycle manager
#
# Usage:
#   scripts/harness-memd start [--quiet]
#   scripts/harness-memd stop [--quiet]
#   scripts/harness-memd restart [--quiet]
#   scripts/harness-memd status
#   scripts/harness-memd cleanup-stale
#   scripts/harness-memd doctor

# Strict mode: abort on unhandled command failures, on use of unset
# variables, and when any stage of a pipeline fails.
set -euo pipefail

# Follow symlinks so that SCRIPT_DIR is the directory of the real script file
# rather than the directory of a link pointing at it.
SCRIPT_SOURCE="${BASH_SOURCE[0]}"
while [ -L "$SCRIPT_SOURCE" ]; do
  SCRIPT_SOURCE_DIR="$(cd -P "$(dirname "$SCRIPT_SOURCE")" && pwd)"
  SCRIPT_TARGET="$(readlink "$SCRIPT_SOURCE")"
  # A relative link target is resolved against the directory holding the link.
  if [[ "$SCRIPT_TARGET" != /* ]]; then
    SCRIPT_SOURCE="${SCRIPT_SOURCE_DIR}/${SCRIPT_TARGET}"
  else
    SCRIPT_SOURCE="$SCRIPT_TARGET"
  fi
done
SCRIPT_DIR="$(cd -P "$(dirname "$SCRIPT_SOURCE")" && pwd)"
# The repository root is the parent of the directory containing this script.
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
# Runtime state (pid, lock, log, heartbeat, config, database) lives under
# STATE_DIR; HARNESS_MEM_HOME overrides the default location.
STATE_DIR="${HARNESS_MEM_HOME:-$HOME/.harness-mem}"
# DEFAULT_* values are compared against the effective settings by
# should_delegate_daemon_start_to_launchctl.
DEFAULT_STATE_DIR="$HOME/.harness-mem"
PID_FILE="${STATE_DIR}/daemon.pid"
LOCK_FILE="${STATE_DIR}/daemon.lock"
LOG_FILE="${STATE_DIR}/daemon.log"
UI_LOG_FILE="${STATE_DIR}/harness-mem-ui.log"
HEARTBEAT_FILE="${STATE_DIR}/daemon.heartbeat"
UI_PID_FILE="${STATE_DIR}/harness-mem-ui.pid"
DB_PATH="${HARNESS_MEM_DB_PATH:-$STATE_DIR/harness-mem.db}"
DEFAULT_DB_PATH="${DEFAULT_STATE_DIR}/harness-mem.db"
HOST="${HARNESS_MEM_HOST:-127.0.0.1}"
PORT="${HARNESS_MEM_PORT:-37888}"
DEFAULT_HOST="127.0.0.1"
DEFAULT_PORT="37888"
UI_PORT="${HARNESS_MEM_UI_PORT:-37901}"
# Probe URLs: is_health_reachable tries READY_URL first and falls back to
# HEALTH_URL; is_ui_reachable probes UI_CONTEXT_URL.
HEALTH_URL="http://${HOST}:${PORT}/health"
READY_URL="http://${HOST}:${PORT}/health/ready"
UI_CONTEXT_URL="http://${HOST}:${UI_PORT}/api/context"
# Timeouts are in seconds; each can be overridden through the environment.
HEALTH_PROBE_TIMEOUT_SEC="${HARNESS_MEM_HEALTH_PROBE_TIMEOUT_SEC:-3}"
START_TIMEOUT_SEC="${HARNESS_MEM_START_TIMEOUT_SEC:-20}"
STOP_TIMEOUT_SEC="${HARNESS_MEM_STOP_TIMEOUT_SEC:-5}"
# Log rotation: size threshold in bytes and number of rotated files to keep.
# Both are validated by normalize_rotation_settings before use.
LOG_MAX_BYTES="${HARNESS_MEM_LOG_MAX_BYTES:-5242880}"
LOG_ROTATE_KEEP="${HARNESS_MEM_LOG_ROTATE_KEEP:-5}"
# Raw UI toggle; interpreted by is_ui_enabled (1/true/yes/on, case-insensitive).
UI_ENABLED_RAW="${HARNESS_MEM_ENABLE_UI:-true}"
# Set to "true" by --quiet; silences log and warn.
QUIET="false"
DAEMON_LAUNCHD_LABEL="${HARNESS_MEM_DAEMON_LAUNCHD_LABEL:-com.harness-mem.daemon}"
UI_LAUNCHD_LABEL="${HARNESS_MEM_UI_LAUNCHD_LABEL:-com.harness-mem.ui}"

# Entry points that are run with `bun run`.
DAEMON_ENTRY="${REPO_ROOT}/memory-server/src/index.ts"
UI_ENTRY="${REPO_ROOT}/harness-mem-ui/src/server.ts"
# Passed to the daemon as HARNESS_MEM_CODEX_PROJECT_ROOT; defaults to the
# directory the script was invoked from.
PROJECT_ROOT="${HARNESS_MEM_CODEX_PROJECT_ROOT:-$PWD}"
CONFIG_PATH="${STATE_DIR}/config.json"

#######################################
# Ensure the JSON config file exists and carries every expected key.
# A missing file is created with defaults; an existing file is rewritten
# through jq so that absent keys receive their default values.
# Globals:
#   STATE_DIR   (read)  directory that is created if missing
#   CONFIG_PATH (read)  file that is created or rewritten
# Returns:
#   0 on success. Non-zero when the existing config cannot be normalised
#   (jq missing or failing, or the final rename failing); in that case the
#   existing config is left untouched and the temporary file is removed.
#######################################
ensure_config() {
  mkdir -p "$STATE_DIR"
  if [ ! -f "$CONFIG_PATH" ]; then
    cat > "$CONFIG_PATH" <<'CONF'
{
  "backend_mode": "local",
  "embedding_provider": "auto",
  "embedding_model": "multilingual-e5",
  "managed": {
    "endpoint": "",
    "api_key": ""
  }
}
CONF
  else
    local tmp="${CONFIG_PATH}.tmp.$$"
    local status=0
    jq '
      .backend_mode = (.backend_mode // "local")
      | .embedding_provider = (.embedding_provider // "auto")
      | .embedding_model = (.embedding_model // "multilingual-e5")
      | .managed = ((.managed // {}) + {
          endpoint: (.managed.endpoint // ""),
          api_key: (.managed.api_key // "")
        })
    ' "$CONFIG_PATH" > "$tmp" || status=$?
    if [ "$status" -eq 0 ]; then
      mv "$tmp" "$CONFIG_PATH" || status=$?
    fi
    if [ "$status" -ne 0 ]; then
      # The redirection creates the temp file even when jq is missing or
      # rejects the input; remove it so failed runs do not pile up
      # config.json.tmp.* files in the state directory.
      rm -f "$tmp"
      return "$status"
    fi
  fi
}

# Print $1 converted to lower case (no trailing newline is added).
to_lower() {
  local text="$1"
  tr '[:upper:]' '[:lower:]' < <(printf '%s' "$text")
}

# Succeed when $1 is one of 1, true, yes or on (compared case-insensitively).
is_trueish() {
  local normalized
  normalized="$(to_lower "$1")"
  if [ "$normalized" = "1" ] || [ "$normalized" = "true" ] \
    || [ "$normalized" = "yes" ] || [ "$normalized" = "on" ]; then
    return 0
  fi
  return 1
}

# Succeed when the UI toggle (UI_ENABLED_RAW) holds a true-ish value.
is_ui_enabled() {
  if is_trueish "$UI_ENABLED_RAW"; then
    return 0
  fi
  return 1
}

# Decide whether `start` should hand the daemon over to launchctl.
# Returns 1 when HARNESS_MEM_DISABLE_LAUNCHCTL_DELEGATION is true-ish,
# 0 when HARNESS_MEM_DAEMON_LAUNCHD_LABEL is set explicitly, and otherwise 0
# only if label, state dir, db path, host and port all equal their defaults.
should_delegate_daemon_start_to_launchctl() {
  local opt_out="${HARNESS_MEM_DISABLE_LAUNCHCTL_DELEGATION:-false}"
  if is_trueish "$opt_out"; then
    return 1
  fi

  # An explicitly configured label always opts in.
  [ -z "${HARNESS_MEM_DAEMON_LAUNCHD_LABEL:-}" ] || return 0

  # Otherwise every setting must match the stock configuration.
  if [ "$DAEMON_LAUNCHD_LABEL" != "com.harness-mem.daemon" ]; then
    return 1
  fi
  if [ "$STATE_DIR" != "$DEFAULT_STATE_DIR" ]; then
    return 1
  fi
  if [ "$DB_PATH" != "$DEFAULT_DB_PATH" ]; then
    return 1
  fi
  if [ "$HOST" != "$DEFAULT_HOST" ]; then
    return 1
  fi
  if [ "$PORT" != "$DEFAULT_PORT" ]; then
    return 1
  fi
  return 0
}

# Print the configured backend mode: local, managed or hybrid.
# Anything else (missing file, jq failure, unknown value) yields "local".
read_backend_mode() {
  ensure_config

  local configured="local"
  if [ -f "$CONFIG_PATH" ]; then
    configured="$(jq -r '.backend_mode // "local"' "$CONFIG_PATH" 2>/dev/null || echo "local")"
  fi

  if [ "$configured" = "managed" ] || [ "$configured" = "hybrid" ]; then
    echo "$configured"
  else
    echo "local"
  fi
}

# Print the configured embedding provider when it is one of the known names
# (auto, adaptive, fallback, openai, ollama, local); otherwise print "auto".
read_embedding_provider() {
  ensure_config

  local configured="auto"
  if [ -f "$CONFIG_PATH" ]; then
    configured="$(jq -r '.embedding_provider // "auto"' "$CONFIG_PATH" 2>/dev/null || echo "auto")"
  fi

  local known
  for known in auto adaptive fallback openai ollama local; do
    if [ "$configured" = "$known" ]; then
      echo "$configured"
      return 0
    fi
  done
  echo "auto"
}

# Print the configured embedding model name, or "multilingual-e5" when the
# config file is missing or yields an empty or "null" value.
read_embedding_model() {
  ensure_config

  local configured=""
  if [ -f "$CONFIG_PATH" ]; then
    configured="$(jq -r '.embedding_model // "multilingual-e5"' "$CONFIG_PATH" 2>/dev/null || echo "multilingual-e5")"
  fi

  if [ -z "$configured" ] || [ "$configured" = "null" ]; then
    configured="multilingual-e5"
  fi
  echo "$configured"
}

# Succeed when a launchctl command can be found.
has_launchctl() {
  if command -v launchctl >/dev/null 2>&1; then
    return 0
  fi
  return 1
}

# Print the launchctl service target "gui/<uid>/<label>" for label $1.
# Returns 1 (printing nothing) when the label is empty or the uid is unknown.
launchctl_target() {
  local label="${1:-}"
  [ -n "$label" ] || return 1

  local uid
  uid="$(id -u 2>/dev/null || true)"
  [ -n "$uid" ] || return 1

  printf 'gui/%s/%s' "$uid" "$label"
}

# Succeed when launchctl exists and `launchctl print` accepts the service
# target derived from label $1.
is_launchctl_job_loaded() {
  has_launchctl || return 1

  local service_target
  if ! service_target="$(launchctl_target "${1:-}")"; then
    return 1
  fi
  launchctl print "$service_target" >/dev/null 2>&1
}

# Run `launchctl kickstart -k` on the service target derived from label $1.
# Returns 1 when no target can be built, else launchctl's exit status.
kickstart_launchctl_job() {
  local service_target
  if ! service_target="$(launchctl_target "${1:-}")"; then
    return 1
  fi
  launchctl kickstart -k "$service_target" >/dev/null 2>&1
}

# Echo the arguments to stdout unless QUIET is "true".
log() {
  [ "$QUIET" = "true" ] || echo "$@"
}

# Print a "[WARN] "-prefixed message to stderr unless QUIET is "true".
# Arguments: the message words; they are joined with single spaces.
warn() {
  if [ "$QUIET" = "true" ]; then
    return
  fi
  # "$*" joins the arguments into one string; mixing literal text with "$@"
  # inside a single quoted word is an array/string mix-up (ShellCheck SC2145).
  printf '[WARN] %s\n' "$*" >&2
}

# Succeed when $1 is a non-empty string consisting only of the digits 0-9.
is_uint() {
  local candidate="$1"
  [ -n "$candidate" ] || return 1
  case "$candidate" in
    *[!0-9]*) return 1 ;;
  esac
  return 0
}

#######################################
# Validate and canonicalise the log rotation settings.
# Globals:
#   LOG_MAX_BYTES   (read/written) reset to 5242880 unless it is an unsigned
#                   integer of at least 1024
#   LOG_ROTATE_KEEP (read/written) reset to 5 unless it is an unsigned
#                   integer of at least 1
# Accepted values are rewritten in plain base 10 without leading zeros.
#######################################
normalize_rotation_settings() {
  # Values longer than 18 digits may not fit 64-bit shell arithmetic and make
  # `[ a -lt b ]` fail with an error, so they are treated as invalid.
  # The 10# prefix forces base 10: without it a value such as "08" passes the
  # digit check but is later rejected as invalid octal by `for ((...))`.
  if is_uint "$LOG_MAX_BYTES" && [ "${#LOG_MAX_BYTES}" -le 18 ]; then
    LOG_MAX_BYTES=$((10#$LOG_MAX_BYTES))
  else
    LOG_MAX_BYTES=5242880
  fi
  if [ "$LOG_MAX_BYTES" -lt 1024 ]; then
    LOG_MAX_BYTES=5242880
  fi

  if is_uint "$LOG_ROTATE_KEEP" && [ "${#LOG_ROTATE_KEEP}" -le 18 ]; then
    LOG_ROTATE_KEEP=$((10#$LOG_ROTATE_KEEP))
  else
    LOG_ROTATE_KEEP=5
  fi
  if [ "$LOG_ROTATE_KEEP" -lt 1 ]; then
    LOG_ROTATE_KEEP=5
  fi
}

# Print the size of file $1 in bytes.
# Tries GNU stat (Linux, MINGW64/Git Bash), then BSD stat (macOS), then wc.
# Each stat attempt is captured before printing so that output from a failed
# attempt never leaks (MINGW64 `stat -f` prints filesystem info to stdout
# even on failure).
file_size_bytes() {
  local file="$1"
  local size

  if size="$(stat -c "%s" "$file" 2>/dev/null)"; then
    echo "$size"
    return 0
  fi
  if size="$(stat -f "%z" "$file" 2>/dev/null)"; then
    echo "$size"
    return 0
  fi
  wc -c < "$file"
}

#######################################
# Rotate a log file once it has reached LOG_MAX_BYTES.
# Existing rotations are shifted up by one (file.N-1 -> file.N, up to
# LOG_ROTATE_KEEP), the live file is copied to file.1 and then truncated in
# place, so the live path itself is never replaced.
# Arguments:
#   $1 - path of the log file
#   $2 - label used in the "rotated ..." message
# Returns: 0 when nothing needs to be done.
#######################################
rotate_log_file_if_needed() {
  local file="$1"
  local label="$2"

  [ -f "$file" ] || return 0
  if [ "$LOG_MAX_BYTES" -le 0 ]; then
    return 0
  fi

  local size
  size="$(file_size_bytes "$file" | tr -dc '0-9' || true)"
  is_uint "$size" || return 0
  if [ "$size" -lt "$LOG_MAX_BYTES" ]; then
    return 0
  fi

  # Shift older rotations first, highest index first, so nothing is
  # overwritten before it has been moved.
  local slot=$((LOG_ROTATE_KEEP))
  local older
  while ((slot >= 2)); do
    older="${file}.$((slot - 1))"
    if [ -f "$older" ]; then
      mv -f "$older" "${file}.${slot}" 2>/dev/null || true
    fi
    slot=$((slot - 1))
  done

  cp "$file" "${file}.1" 2>/dev/null || cat "$file" > "${file}.1" 2>/dev/null || true
  : > "$file"
  log "rotated ${label} log (size=${size}, max=${LOG_MAX_BYTES})"
}

# Run size-based rotation for the daemon log and then for the UI log.
apply_log_rotation() {
  local spec
  # Each spec is "<label>:<path>"; the label never contains a colon.
  for spec in "daemon:${LOG_FILE}" "ui:${UI_LOG_FILE}"; do
    rotate_log_file_if_needed "${spec#*:}" "${spec%%:*}"
  done
}

#######################################
# Start a command in the background, detached from this script.
# setsid is used when available, nohup otherwise. stdin is /dev/null and
# stdout/stderr are appended to the log file.
# Arguments:
#   $1  - file that receives the pid of the background job
#   $2  - log file to append output to
#   $3+ - command and its arguments
#######################################
spawn_detached_command() {
  local pid_file="$1"
  local log_file="$2"
  shift 2

  local launcher="nohup"
  if command -v setsid >/dev/null 2>&1; then
    launcher="setsid"
  fi

  "$launcher" "$@" >> "$log_file" 2>&1 < /dev/null &

  echo $! > "$pid_file"
}

# Succeed when $1 is non-empty and `kill -0` can signal that pid.
is_pid_running() {
  local pid="$1"
  [ -n "$pid" ] && kill -0 "$pid" >/dev/null 2>&1
}

# Print the digits found in PID_FILE; print nothing when the file is missing.
read_pid_file() {
  [ -f "$PID_FILE" ] || return 0
  tr -dc '0-9' < "$PID_FILE" 2>/dev/null || true
}

# Print the digits found in LOCK_FILE; print nothing when the file is missing.
read_lock_pid() {
  [ -f "$LOCK_FILE" ] || return 0
  tr -dc '0-9' < "$LOCK_FILE" 2>/dev/null || true
}

# Print the digits found in UI_PID_FILE; print nothing when the file is
# missing.
read_ui_pid_file() {
  [ -f "$UI_PID_FILE" ] || return 0
  tr -dc '0-9' < "$UI_PID_FILE" 2>/dev/null || true
}

# Create STATE_DIR if needed and try to restrict it to its owner (mode 700).
# A failing chmod is ignored.
ensure_state_dir() {
  mkdir -p "$STATE_DIR"
  if ! chmod 700 "$STATE_DIR" 2>/dev/null; then
    : # best effort only
  fi
}

# Remove PID_FILE (with a warning) when it names a pid that is not running.
cleanup_stale_pid() {
  local recorded_pid
  recorded_pid="$(read_pid_file)"

  [ -n "$recorded_pid" ] || return 0
  if is_pid_running "$recorded_pid"; then
    return 0
  fi

  rm -f "$PID_FILE"
  warn "Removed stale pid file (pid=${recorded_pid})"
}

# Remove LOCK_FILE unless it names a running pid. A lock without a readable
# pid is removed silently; a lock naming a dead pid is removed with a warning.
cleanup_stale_lock() {
  local holder
  holder="$(read_lock_pid)"

  if [ -n "$holder" ] && is_pid_running "$holder"; then
    return 0
  fi

  rm -f "$LOCK_FILE"
  [ -z "$holder" ] || warn "Removed stale lock file (pid=${holder})"
}

# Delete the daemon pid file and the operation lock file.
cleanup_runtime_artifacts() {
  local -a artifacts=("$PID_FILE" "$LOCK_FILE")
  rm -f "${artifacts[@]}"
}

#######################################
# Take the operation lock by writing this script's pid to LOCK_FILE.
# Stale locks are cleared first. The file is created under noclobber, so the
# write fails if another process created the lock in the meantime.
# Returns: 0 when the lock was taken; 1 (after a warning) when a running
#          process holds it or the lock file could not be created.
#######################################
acquire_lock() {
  ensure_state_dir
  cleanup_stale_lock

  if [ -f "$LOCK_FILE" ]; then
    local holder_pid
    holder_pid="$(read_lock_pid)"
    if [ -n "$holder_pid" ] && is_pid_running "$holder_pid"; then
      warn "Another harness-memd operation is in progress (pid=${holder_pid})"
      return 1
    fi
    rm -f "$LOCK_FILE"
  fi

  if ! ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null; then
    warn "Failed to acquire daemon lock"
    return 1
  fi
  return 0
}

# Remove LOCK_FILE, but only when it records this script's own pid.
release_lock() {
  local owner
  owner="$(read_lock_pid)"
  [ "$owner" != "$$" ] || rm -f "$LOCK_FILE"
}

# Poll the daemon health probe every 0.5s, for at most START_TIMEOUT_SEC * 2
# attempts. Returns 0 as soon as a probe succeeds, 1 when all attempts fail.
wait_for_health() {
  local max_attempts=$((START_TIMEOUT_SEC * 2))
  local tries

  for ((tries = 0; tries < max_attempts; tries++)); do
    if is_health_reachable; then
      return 0
    fi
    sleep 0.5
  done

  return 1
}

# Poll the UI probe every 0.5s, for at most START_TIMEOUT_SEC * 2 attempts.
# Returns 0 as soon as a probe succeeds, 1 when all attempts fail.
wait_for_ui() {
  local max_attempts=$((START_TIMEOUT_SEC * 2))
  local tries

  for ((tries = 0; tries < max_attempts; tries++)); do
    if is_ui_reachable; then
      return 0
    fi
    sleep 0.5
  done

  return 1
}

#######################################
# Probe the daemon over HTTP.
# READY_URL is tried first; HEALTH_URL is tried only when the first probe
# produced no output at all. With jq available the body must have
# .ok == true and an array .items; without jq a textual `"ok": true` match
# is accepted.
# Returns: 0 when the daemon answered with an acceptable body, else non-zero.
#######################################
is_health_reachable() {
  local body=""
  local probe_url

  for probe_url in "$READY_URL" "$HEALTH_URL"; do
    body="$(curl --silent --show-error --max-time "$HEALTH_PROBE_TIMEOUT_SEC" "$probe_url" 2>/dev/null || true)"
    [ -z "$body" ] || break
  done

  [ -n "$body" ] || return 1

  if command -v jq >/dev/null 2>&1; then
    printf '%s' "$body" | jq -e '.ok == true and (.items | type == "array")' >/dev/null 2>&1
    return $?
  fi

  printf '%s' "$body" | grep -Eq '"ok"[[:space:]]*:[[:space:]]*true'
}

# Probe UI_CONTEXT_URL (1s timeout). Succeeds when the body reports
# ok == true (checked with jq when available, otherwise with a text match).
is_ui_reachable() {
  local body
  body="$(curl --silent --show-error --max-time 1 "$UI_CONTEXT_URL" 2>/dev/null || true)"
  [ -n "$body" ] || return 1

  if ! command -v jq >/dev/null 2>&1; then
    printf '%s' "$body" | grep -Eq '"ok"[[:space:]]*:[[:space:]]*true'
    return $?
  fi

  printf '%s' "$body" | jq -e '.ok == true' >/dev/null 2>&1
  return $?
}

# Succeed when pid $1 is running and its command line looks like the memory
# daemon: it contains DAEMON_ENTRY, or it contains both
# "memory-server/src/index.ts" and "harness-mem".
is_expected_daemon_pid() {
  local pid="$1"
  [ -n "$pid" ] || return 1
  is_pid_running "$pid" || return 1

  local cmdline
  cmdline="$(ps -p "$pid" -o args= 2>/dev/null || true)"
  [ -n "$cmdline" ] || return 1

  case "$cmdline" in
    *"$DAEMON_ENTRY"*)
      return 0
      ;;
    *"memory-server/src/index.ts"*)
      case "$cmdline" in
        *"harness-mem"*) return 0 ;;
      esac
      ;;
  esac

  return 1
}

# Print the pid of the first process listening on PORT that is recognised as
# the memory daemon. Returns 1 when lsof is unavailable or nothing matches.
discover_daemon_pid_from_port() {
  command -v lsof >/dev/null 2>&1 || return 1

  local listeners candidate
  listeners="$(lsof -nP -tiTCP:"$PORT" -sTCP:LISTEN 2>/dev/null || true)"

  while IFS= read -r candidate; do
    if [ -z "$candidate" ]; then
      continue
    fi
    if is_expected_daemon_pid "$candidate"; then
      echo "$candidate"
      return 0
    fi
  done <<< "$listeners"

  return 1
}

# Succeed when pid $1 is running and its command line looks like the UI
# server: it contains UI_ENTRY, or it contains both
# "harness-mem-ui/src/server.ts" and "harness-mem".
is_expected_ui_pid() {
  local pid="$1"
  [ -n "$pid" ] || return 1
  is_pid_running "$pid" || return 1

  local cmdline
  cmdline="$(read_process_args "$pid")"
  [ -n "$cmdline" ] || return 1

  case "$cmdline" in
    *"$UI_ENTRY"*)
      return 0
      ;;
    *"harness-mem-ui/src/server.ts"*)
      case "$cmdline" in
        *"harness-mem"*) return 0 ;;
      esac
      ;;
  esac

  return 1
}

# Print the pid of the first process listening on UI_PORT that is recognised
# as the UI server. Returns 1 when lsof is unavailable or nothing matches.
discover_ui_pid_from_port() {
  command -v lsof >/dev/null 2>&1 || return 1

  local listeners candidate
  listeners="$(lsof -nP -tiTCP:"$UI_PORT" -sTCP:LISTEN 2>/dev/null || true)"

  while IFS= read -r candidate; do
    if [ -z "$candidate" ]; then
      continue
    fi
    if is_expected_ui_pid "$candidate"; then
      echo "$candidate"
      return 0
    fi
  done <<< "$listeners"

  return 1
}

# Print the first pid that lsof reports as listening on TCP port $1,
# whatever that process is. Returns 1 when lsof is unavailable or no
# listener is found.
discover_listener_pid_for_port() {
  local target_port="$1"
  command -v lsof >/dev/null 2>&1 || return 1

  local all_pids
  all_pids="$(lsof -nP -tiTCP:"$target_port" -sTCP:LISTEN 2>/dev/null || true)"

  # Keep only the first line of the listing.
  local first_pid="${all_pids%%$'\n'*}"
  [ -n "$first_pid" ] || return 1

  echo "$first_pid"
  return 0
}

# Print the first pid listening on the daemon port (PORT), if any.
discover_listener_pid_from_port() {
  local daemon_port="$PORT"
  discover_listener_pid_for_port "$daemon_port"
}

# Print the command line of pid $1 as reported by ps. Prints nothing, and
# still succeeds, when the pid is empty or ps fails.
read_process_args() {
  local pid="$1"
  [ -n "$pid" ] || return 0
  ps -p "$pid" -o args= 2>/dev/null || true
}

#######################################
# Reconcile UI_PID_FILE with the UI process listening on the UI port.
# When a UI listener is found, its pid is written to the file (if different)
# and printed, and the function returns 0. Otherwise a pid file naming a dead
# process is removed and the function returns 1.
#######################################
sync_ui_pid_file() {
  local live_pid
  live_pid="$(discover_ui_pid_from_port || true)"

  if [ -z "$live_pid" ]; then
    local recorded_pid
    recorded_pid="$(read_ui_pid_file)"
    if [ -n "$recorded_pid" ] && ! is_pid_running "$recorded_pid"; then
      rm -f "$UI_PID_FILE"
    fi
    return 1
  fi

  if [ "$(read_ui_pid_file)" != "$live_pid" ]; then
    echo "$live_pid" > "$UI_PID_FILE"
  fi
  echo "$live_pid"
  return 0
}

# Succeed when lsof reports pid $1 as a TCP listener on PORT.
# Returns 1 when the pid is empty or lsof is unavailable.
is_pid_listening_on_port() {
  local pid="$1"
  [ -n "$pid" ] || return 1
  command -v lsof >/dev/null 2>&1 || return 1
  lsof -nP -a -p "$pid" -iTCP:"$PORT" -sTCP:LISTEN >/dev/null 2>&1
}

# Find the daemon listening on the daemon port, record its pid in PID_FILE
# (when the file holds something else) and print it.
# Returns 1 without touching the file when no daemon listener is found.
refresh_pid_file_from_runtime() {
  local live_pid
  live_pid="$(discover_daemon_pid_from_port || true)"
  [ -n "$live_pid" ] || return 1

  local recorded_pid
  recorded_pid="$(read_pid_file)"
  if [ "$recorded_pid" != "$live_pid" ]; then
    echo "$live_pid" > "$PID_FILE"
  fi

  echo "$live_pid"
  return 0
}

#######################################
# Start the harness-mem-ui server unless it is disabled or already running.
# An existing UI listener is adopted (its pid is recorded) instead of spawning
# a duplicate; a foreign listener on the UI port is reported as a conflict.
# A freshly spawned UI that never becomes reachable is terminated again.
# Returns: 0 when the UI is disabled, already running, or started and
#          reachable; 1 otherwise.
#######################################
start_ui() {
  if ! is_ui_enabled; then
    rm -f "$UI_PID_FILE"
    log "harness-mem-ui startup disabled (HARNESS_MEM_ENABLE_UI=false)"
    return 0
  fi

  # Preconditions: the entry file and the bun runtime must exist.
  [ -f "$UI_ENTRY" ] || {
    echo "[ERR] harness-mem-ui entry not found: $UI_ENTRY" >&2
    return 1
  }
  command -v bun >/dev/null 2>&1 || {
    echo "[ERR] bun is required but not found (for harness-mem-ui)" >&2
    return 1
  }

  # A recognised UI process already owns the port: adopt it.
  local existing_ui_pid
  existing_ui_pid="$(discover_ui_pid_from_port || true)"
  if [ -n "$existing_ui_pid" ]; then
    echo "$existing_ui_pid" > "$UI_PID_FILE"
    if ! wait_for_ui; then
      warn "UI listener exists on port ${UI_PORT} (pid=${existing_ui_pid}) but /api/context is unreachable"
      warn "Refusing to spawn duplicate UI process. Check runtime with: scripts/harness-memd doctor"
      return 1
    fi
    log "harness-mem-ui already running (pid=${existing_ui_pid}, port=${UI_PORT})"
    return 0
  fi

  # Some other process owns the port: report the conflict and give up.
  local foreign_pid
  foreign_pid="$(discover_listener_pid_for_port "$UI_PORT" || true)"
  if [ -n "$foreign_pid" ]; then
    local foreign_args
    foreign_args="$(read_process_args "$foreign_pid")"
    warn "UI port ${UI_PORT} is already in use by another process (pid=${foreign_pid})"
    [ -z "$foreign_args" ] || warn "Process args: ${foreign_args}"
    warn "Resolve the UI port conflict before starting harness-mem-ui"
    return 1
  fi

  log "Starting harness-mem-ui... (port=${UI_PORT})"
  (
    # Subshell keeps the directory change local to the spawn.
    cd "$REPO_ROOT"
    spawn_detached_command "$UI_PID_FILE" "$UI_LOG_FILE" env \
      HARNESS_MEM_HOST="$HOST" \
      HARNESS_MEM_PORT="$PORT" \
      HARNESS_MEM_UI_PORT="$UI_PORT" \
      bun run "$UI_ENTRY"
  )

  local spawned_pid
  spawned_pid="$(read_ui_pid_file)"
  if [ -z "$spawned_pid" ]; then
    echo "[ERR] Failed to create UI pid file" >&2
    return 1
  fi

  if wait_for_ui; then
    log "harness-mem-ui started (pid=${spawned_pid}, url=http://${HOST}:${UI_PORT})"
    return 0
  fi

  # The UI never became reachable: stop it, politely first, then by force.
  warn "UI health check failed after start timeout"
  if is_pid_running "$spawned_pid"; then
    kill -TERM "$spawned_pid" >/dev/null 2>&1 || true
    sleep 1
    if is_pid_running "$spawned_pid"; then
      kill -KILL "$spawned_pid" >/dev/null 2>&1 || true
    fi
  fi
  rm -f "$UI_PID_FILE"
  return 1
}

#######################################
# Stop the UI server.
# The pid comes from UI_PID_FILE, falling back to discovery via the UI port.
# SIGTERM is sent first; SIGKILL follows if the process is still alive after
# STOP_TIMEOUT_SEC seconds. The pid file is removed in every case.
# Returns: 0 always.
#######################################
stop_ui() {
  local target_pid
  target_pid="$(read_ui_pid_file)"
  [ -n "$target_pid" ] || target_pid="$(discover_ui_pid_from_port || true)"

  # Nothing to stop: no pid known, or the recorded process is already gone.
  if [ -z "$target_pid" ] || ! is_pid_running "$target_pid"; then
    rm -f "$UI_PID_FILE"
    return 0
  fi

  log "Stopping harness-mem-ui (pid=${target_pid})..."
  kill -TERM "$target_pid" >/dev/null 2>&1 || true

  local elapsed=0
  while is_pid_running "$target_pid" && [ "$elapsed" -lt "$STOP_TIMEOUT_SEC" ]; do
    sleep 1
    elapsed=$((elapsed + 1))
  done

  if is_pid_running "$target_pid"; then
    warn "UI did not exit in ${STOP_TIMEOUT_SEC}s, sending SIGKILL"
    kill -KILL "$target_pid" >/dev/null 2>&1 || true
    sleep 0.5
  fi

  rm -f "$UI_PID_FILE"
  log "harness-mem-ui stopped"
  return 0
}

#######################################
# Start the memory daemon (and then the UI) unless it is already running.
#
# Order of checks:
#   1. Entry file and bun must exist.
#   2. When launchctl delegation applies and the LaunchAgent is loaded, the
#      daemon is adopted or kickstarted through launchctl; nothing is spawned
#      from here.
#   3. A reachable health endpoint means "already running": the pid file is
#      re-synced and only the UI is started.
#   4. A recorded, running pid that listens on the port but is unhealthy is
#      restarted; one that does not listen on the port is left alone.
#   5. A daemon listener without health, or a foreign listener on the port,
#      aborts the start instead of spawning a duplicate.
#   6. Otherwise the daemon is spawned detached and must become healthy
#      within the start timeout, or it is terminated again.
#
# Globals (read): DAEMON_ENTRY, DAEMON_LAUNCHD_LABEL, PORT, HOST, DB_PATH,
#   PROJECT_ROOT, CONFIG_PATH, REPO_ROOT, PID_FILE, LOG_FILE,
#   HARNESS_MEM_EMBEDDING_PROVIDER / HARNESS_MEM_EMBEDDING_MODEL (optional
#   overrides of the configured embedding settings)
# Returns: 0 when daemon and UI are up; 1 otherwise.
#######################################
start_daemon() {
  ensure_state_dir
  apply_log_rotation

  if [ ! -f "$DAEMON_ENTRY" ]; then
    echo "[ERR] memory daemon entry not found: $DAEMON_ENTRY" >&2
    return 1
  fi

  if ! command -v bun >/dev/null 2>&1; then
    echo "[ERR] bun is required but not found" >&2
    return 1
  fi

  # When the daemon is LaunchAgent-managed, launchd must own the long-running
  # server process. Spawning a detached daemon here makes launchd think the job
  # exited, which can lead to periodic start attempts and EADDRINUSE loops.
  if should_delegate_daemon_start_to_launchctl && is_launchctl_job_loaded "$DAEMON_LAUNCHD_LABEL"; then
    local launchd_runtime_pid
    launchd_runtime_pid="$(discover_daemon_pid_from_port || true)"
    if [ -n "$launchd_runtime_pid" ]; then
      echo "$launchd_runtime_pid" > "$PID_FILE"
      if wait_for_health; then
        log "harness-memd already running (pid=${launchd_runtime_pid})"
        if start_ui; then
          return 0
        fi
        return 1
      fi
      warn "LaunchAgent is loaded and daemon listener exists on port ${PORT} (pid=${launchd_runtime_pid}) but health endpoint is unreachable"
      warn "Refusing to spawn detached daemon under LaunchAgent management"
      return 1
    fi

    if ! kickstart_launchctl_job "$DAEMON_LAUNCHD_LABEL"; then
      warn "launchctl kickstart failed for ${DAEMON_LAUNCHD_LABEL}"
      return 1
    fi
    if wait_for_health; then
      local runtime_pid
      runtime_pid="$(refresh_pid_file_from_runtime || true)"
      if [ -n "$runtime_pid" ]; then
        log "harness-memd started via launchctl (pid=${runtime_pid}, port=${PORT})"
      else
        log "harness-memd started via launchctl (port=${PORT})"
      fi
      if start_ui; then
        return 0
      fi
      return 1
    fi
    warn "launchctl start did not restore health in time"
    return 1
  fi

  local pid
  pid="$(read_pid_file)"

  # If another manager (e.g. launchctl) already runs the daemon on this port,
  # trust runtime health and re-sync the pid file instead of spawning duplicates.
  if is_health_reachable; then
    local runtime_pid
    runtime_pid="$(refresh_pid_file_from_runtime || true)"
    if [ -n "$runtime_pid" ]; then
      log "harness-memd already running (pid=${runtime_pid})"
    else
      log "harness-memd already running (health endpoint reachable)"
    fi
    if start_ui; then
      return 0
    fi
    return 1
  fi

  if [ -n "$pid" ] && is_pid_running "$pid"; then
    if is_health_reachable; then
      log "harness-memd already running (pid=${pid})"
      if start_ui; then
        return 0
      fi
      return 1
    fi
    if is_pid_listening_on_port "$pid"; then
      warn "Pid exists but health check failed. restarting stale daemon..."
      stop_daemon || true
    else
      warn "Pid file points to running process not listening on target port ${PORT}; preserving existing process"
    fi
  fi

  cleanup_stale_pid

  # Guard against duplicate spawn when listener exists but health probe is
  # temporarily lagging. This avoids noisy EADDRINUSE restart storms.
  local runtime_pid
  runtime_pid="$(discover_daemon_pid_from_port || true)"
  if [ -n "$runtime_pid" ]; then
    echo "$runtime_pid" > "$PID_FILE"
    if wait_for_health; then
      log "harness-memd already running (pid=${runtime_pid})"
      if start_ui; then
        return 0
      fi
      return 1
    fi
    warn "Daemon listener exists on port ${PORT} (pid=${runtime_pid}) but health endpoint is unreachable"
    warn "Refusing to spawn duplicate daemon. Check runtime with: scripts/harness-memd status"
    return 1
  fi

  # A process that is not our daemon owns the port: report it and give up.
  local listener_pid
  listener_pid="$(discover_listener_pid_from_port || true)"
  if [ -n "$listener_pid" ]; then
    local listener_args
    listener_args="$(read_process_args "$listener_pid")"
    warn "Port ${PORT} is already in use by another process (pid=${listener_pid})"
    if [ -n "$listener_args" ]; then
      warn "Process args: ${listener_args}"
    fi
    warn "Resolve the port conflict before starting harness-memd"
    return 1
  fi

  # Environment overrides win over the values stored in the config file.
  local backend_mode managed_endpoint managed_api_key embedding_provider embedding_model
  backend_mode="$(read_backend_mode)"
  managed_endpoint="$(jq -r '.managed.endpoint // ""' "$CONFIG_PATH" 2>/dev/null || echo "")"
  managed_api_key="$(jq -r '.managed.api_key // ""' "$CONFIG_PATH" 2>/dev/null || echo "")"
  embedding_provider="${HARNESS_MEM_EMBEDDING_PROVIDER:-$(read_embedding_provider)}"
  embedding_model="${HARNESS_MEM_EMBEDDING_MODEL:-$(read_embedding_model)}"
  log "Starting harness-memd... (backend: ${backend_mode}, embedding: ${embedding_provider}:${embedding_model})"
  if [ "$backend_mode" != "local" ] && [ -n "$managed_endpoint" ]; then
    log "  managed endpoint: ${managed_endpoint}"
  fi
  (
    cd "$REPO_ROOT"
    # Hand the settings to the daemon through the environment of this
    # subshell rather than as `env NAME=value ...` arguments, so the managed
    # API key never shows up in a process argument list (visible via `ps`).
    # The exports end with the subshell and do not leak into this script.
    export HARNESS_MEM_DB_PATH="$DB_PATH"
    export HARNESS_MEM_HOST="$HOST"
    export HARNESS_MEM_PORT="$PORT"
    export HARNESS_MEM_CODEX_PROJECT_ROOT="$PROJECT_ROOT"
    export HARNESS_MEM_BACKEND_MODE="$backend_mode"
    export HARNESS_MEM_MANAGED_ENDPOINT="$managed_endpoint"
    export HARNESS_MEM_MANAGED_API_KEY="$managed_api_key"
    export HARNESS_MEM_EMBEDDING_PROVIDER="$embedding_provider"
    export HARNESS_MEM_EMBEDDING_MODEL="$embedding_model"
    spawn_detached_command "$PID_FILE" "$LOG_FILE" bun run "$DAEMON_ENTRY"
  )

  local new_pid
  new_pid="$(read_pid_file)"
  if [ -z "$new_pid" ]; then
    echo "[ERR] Failed to create pid file" >&2
    return 1
  fi

  if wait_for_health; then
    log "harness-memd started (pid=${new_pid}, port=${PORT})"
    if start_ui; then
      return 0
    fi
    return 1
  fi

  # The daemon never became healthy: stop it, politely first, then by force.
  warn "Health check failed after start timeout"
  if is_pid_running "$new_pid"; then
    kill -TERM "$new_pid" >/dev/null 2>&1 || true
    sleep 1
    if is_pid_running "$new_pid"; then
      kill -KILL "$new_pid" >/dev/null 2>&1 || true
    fi
  fi

  cleanup_runtime_artifacts
  return 1
}

#######################################
# Stop the UI and then the memory daemon named in PID_FILE.
# SIGTERM is sent first; SIGKILL follows if the process is still alive after
# STOP_TIMEOUT_SEC seconds. Pid and lock files are removed in every case.
# Returns: 0 always.
#######################################
stop_daemon() {
  ensure_state_dir

  stop_ui || true

  local daemon_pid
  daemon_pid="$(read_pid_file)"

  # No pid recorded: nothing to signal.
  [ -n "$daemon_pid" ] || {
    cleanup_runtime_artifacts
    log "harness-memd is not running"
    return 0
  }

  # A pid is recorded but the process is gone.
  is_pid_running "$daemon_pid" || {
    cleanup_runtime_artifacts
    log "harness-memd pid file was stale and has been cleaned"
    return 0
  }

  log "Stopping harness-memd (pid=${daemon_pid})..."
  kill -TERM "$daemon_pid" >/dev/null 2>&1 || true

  local elapsed=0
  while is_pid_running "$daemon_pid" && [ "$elapsed" -lt "$STOP_TIMEOUT_SEC" ]; do
    sleep 1
    elapsed=$((elapsed + 1))
  done

  if is_pid_running "$daemon_pid"; then
    warn "Daemon did not exit in ${STOP_TIMEOUT_SEC}s, sending SIGKILL"
    kill -KILL "$daemon_pid" >/dev/null 2>&1 || true
    sleep 0.5
  fi

  cleanup_runtime_artifacts
  log "harness-memd stopped"
  return 0
}

#######################################
# Print a one-line daemon status.
# Output is one of: "running ...", "running (health endpoint reachable, pid
# unresolved)", "degraded (...)", "stale (...)" or "stopped".
# The pid file is re-synced from the live listener when the recorded pid is
# missing or dead but the health endpoint answers.
# Returns: 0 when the daemon is running and healthy; 1 otherwise.
#######################################
status_daemon() {
  ensure_state_dir
  apply_log_rotation
  sync_ui_pid_file >/dev/null 2>&1 || true

  local pid
  pid="$(read_pid_file)"

  if [ -z "$pid" ] || ! is_pid_running "$pid"; then
    if is_health_reachable; then
      local runtime_pid
      runtime_pid="$(refresh_pid_file_from_runtime || true)"
      if [ -n "$runtime_pid" ]; then
        pid="$runtime_pid"
      else
        echo "running (health endpoint reachable, pid unresolved)"
        return 0
      fi
    else
      if [ -z "$pid" ]; then
        echo "stopped"
      else
        echo "stale (pid file exists but process is dead)"
      fi
      return 1
    fi
  fi

  if is_health_reachable; then
    local heartbeat_info=""
    if [ -f "$HEARTBEAT_FILE" ]; then
      # Best effort: under `set -e` with pipefail, a failing read inside this
      # assignment would abort the script before the status line is printed
      # (e.g. when the heartbeat file is unreadable or vanishes after the -f
      # test). 2>/dev/null comes first so it also hides the redirection error.
      heartbeat_info=" heartbeat=$(tr '\n' ' ' 2>/dev/null < "$HEARTBEAT_FILE" || true)"
    fi
    echo "running pid=${pid} port=${PORT}${heartbeat_info}"
    return 0
  fi

  echo "degraded (pid=${pid} running but health endpoint unreachable)"
  return 1
}

#######################################
# Remove stale runtime artifacts: rotate oversized logs, drop a pid file or
# lock file that names a dead process, and re-sync the UI pid file.
# Returns: 0 on completion.
#######################################
cleanup_stale() {
  ensure_state_dir
  apply_log_rotation
  # These two helpers already remove a pid or lock file whose recorded
  # process is dead; a second inline pass over the same files could never
  # find anything left to remove, so none is made.
  cleanup_stale_pid
  cleanup_stale_lock
  sync_ui_pid_file >/dev/null 2>&1 || true
  log "stale artifacts cleaned"
}

#######################################
# Print a diagnostic report: tool availability (bun, curl), pid file state,
# daemon health and, when the UI is enabled, UI reachability and pid state.
# A stale or missing pid file is refreshed from the live listener when the
# health endpoint answers.
# Returns: 0 when every check passed, 1 when at least one check failed.
#######################################
doctor_daemon() {
  ensure_state_dir
  apply_log_rotation
  sync_ui_pid_file >/dev/null 2>&1 || true

  # exit_status becomes 1 as soon as any check fails.
  local exit_status=0
  local daemon_healthy=0

  echo "== harness-memd doctor =="
  echo "state_dir: $STATE_DIR"
  echo "db_path: $DB_PATH"
  echo "ready_url: $READY_URL"
  echo "health_url: $HEALTH_URL"

  # Required tools.
  if ! command -v bun >/dev/null 2>&1; then
    echo "[ng] bun is not installed"
    exit_status=1
  else
    echo "[ok] bun available: $(bun --version 2>/dev/null || echo unknown)"
  fi

  if ! command -v curl >/dev/null 2>&1; then
    echo "[ng] curl is not installed"
    exit_status=1
  else
    echo "[ok] curl available"
  fi

  # Probe health once and reuse the result below.
  if is_health_reachable; then
    daemon_healthy=1
  fi

  # Daemon pid file.
  if [ -f "$PID_FILE" ]; then
    local recorded_pid
    recorded_pid="$(read_pid_file)"
    if [ -n "$recorded_pid" ] && is_pid_running "$recorded_pid"; then
      echo "[ok] pid file points to running process: $recorded_pid"
    elif [ "$daemon_healthy" -eq 1 ]; then
      local refreshed_pid
      refreshed_pid="$(refresh_pid_file_from_runtime || true)"
      if [ -z "$refreshed_pid" ]; then
        echo "[warn] pid file is stale"
        exit_status=1
      else
        echo "[ok] refreshed stale pid file from active runtime: $refreshed_pid"
      fi
    else
      echo "[warn] pid file is stale"
      exit_status=1
    fi
  elif [ "$daemon_healthy" -eq 1 ]; then
    local rebuilt_pid
    rebuilt_pid="$(refresh_pid_file_from_runtime || true)"
    if [ -z "$rebuilt_pid" ]; then
      echo "[info] pid file not found"
    else
      echo "[ok] reconstructed pid file from active runtime: $rebuilt_pid"
    fi
  else
    echo "[info] pid file not found"
  fi

  # Daemon health.
  if [ "$daemon_healthy" -ne 1 ]; then
    echo "[warn] health endpoint not reachable"
    exit_status=1
  else
    echo "[ok] health endpoint reachable"
  fi

  # UI checks apply only when the UI is enabled.
  if ! is_ui_enabled; then
    echo "[info] ui startup disabled (HARNESS_MEM_ENABLE_UI=false)"
    return "$exit_status"
  fi

  if is_ui_reachable; then
    echo "[ok] ui endpoint reachable: http://${HOST}:${UI_PORT}"
  else
    echo "[warn] ui endpoint not reachable: http://${HOST}:${UI_PORT}"
    exit_status=1
  fi

  local ui_pid
  ui_pid="$(read_ui_pid_file)"
  if [ -z "$ui_pid" ]; then
    echo "[warn] ui pid not found"
    exit_status=1
  elif is_pid_running "$ui_pid"; then
    echo "[ok] ui pid file points to running process: $ui_pid (port=${UI_PORT})"
  else
    echo "[warn] ui pid file is stale"
    exit_status=1
  fi

  return "$exit_status"
}

#######################################
# Command-line entry point.
# Arguments:
#   $1  - command: start, stop, restart, status, cleanup-stale or doctor
#         (defaults to status when omitted)
#   $2+ - options; only --quiet is recognised, anything else is ignored
# start, stop, restart and cleanup-stale run under the operation lock, which
# is released by an EXIT trap. status and doctor do not take the lock.
# An unknown command prints the usage line to stderr and exits with 1.
#######################################
main() {
  local cmd="${1:-status}"
  # `shift` fails when no argument was given; that is not an error here.
  shift || true

  while [ "$#" -gt 0 ]; do
    case "$1" in
      --quiet)
        QUIET="true"
        ;;
      *)
        # Unrecognised options are ignored.
        ;;
    esac
    shift || true
  done

  normalize_rotation_settings

  case "$cmd" in
    start)
      acquire_lock || exit 1
      trap release_lock EXIT
      start_daemon
      ;;
    stop)
      acquire_lock || exit 1
      trap release_lock EXIT
      stop_daemon
      ;;
    restart)
      acquire_lock || exit 1
      trap release_lock EXIT
      if is_launchctl_job_loaded "$DAEMON_LAUNCHD_LABEL"; then
        # launchd will invoke this script's start path, which owns the real
        # daemon transition and needs the operation lock itself.
        release_lock
        trap - EXIT
        if ! kickstart_launchctl_job "$DAEMON_LAUNCHD_LABEL"; then
          warn "launchctl kickstart failed for ${DAEMON_LAUNCHD_LABEL}"
          exit 1
        fi
        if wait_for_health; then
          local runtime_pid
          runtime_pid="$(refresh_pid_file_from_runtime || true)"
          if [ -n "$runtime_pid" ]; then
            log "harness-memd restarted via launchctl (pid=${runtime_pid}, port=${PORT})"
          else
            log "harness-memd restarted via launchctl (port=${PORT})"
          fi
          # Restart the UI job as well when it is enabled and loaded in
          # launchctl; a failing kickstart is ignored.
          if is_ui_enabled && is_launchctl_job_loaded "$UI_LAUNCHD_LABEL"; then
            kickstart_launchctl_job "$UI_LAUNCHD_LABEL" || true
            sync_ui_pid_file >/dev/null 2>&1 || true
          fi
          exit 0
        fi
        warn "launchctl restart did not restore health in time"
        exit 1
      fi
      # Not managed by launchctl: plain stop followed by start. A failing
      # stop does not prevent the start attempt.
      stop_daemon || true
      start_daemon
      ;;
    status)
      status_daemon
      ;;
    cleanup-stale)
      acquire_lock || exit 1
      trap release_lock EXIT
      cleanup_stale
      ;;
    doctor)
      doctor_daemon
      ;;
    *)
      echo "Usage: $0 {start|stop|restart|status|cleanup-stale|doctor} [--quiet]" >&2
      exit 1
      ;;
  esac
}

# Run the CLI with the arguments given to the script.
main "$@"
