# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

# do/benchmark — Run SageMaker AI Benchmark against deployed endpoint
# Uses NVIDIA AIPerf via the SageMaker AI Benchmarking service to measure
# LLM endpoint performance: throughput, latency, TTFT, and ITL.

# Strict mode: abort on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail

# ── Source project configuration ──────────────────────────────────────────────
# Resolve the directory that contains this script so do/config is found no
# matter what the caller's working directory is.
# CDPATH is cleared for the cd: when CDPATH is set, cd may resolve a relative
# directory through it and print the result on stdout, which would be captured
# together with pwd's output and corrupt SCRIPT_DIR.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/config"

# ── Parse flags ───────────────────────────────────────────────────────────────

# Abort with a clear message when a value-taking flag is the last argument.
# Without this check the trailing `shift` fails and `set -e` terminates the
# script without any diagnostic.
#   $1 - flag name (used in the message)
#   $2 - number of arguments remaining, counting the flag itself
_require_flag_value() {
    if [ "$2" -lt 2 ]; then
        echo "❌ $1 requires a value" >&2
        exit 1
    fi
}

# Print the usage/help text to stdout.
_print_usage() {
    echo "Usage: ./do/benchmark --workload <name> [--ic <name>] [--adapter <name>] [--force] [--clean] [--no-stale-warning]"
    echo ""
    echo "Run SageMaker AI Benchmark against the deployed endpoint."
    echo ""
    echo "Options:"
    echo "  --workload <name>   Workload profile to benchmark with (required)"
    echo "  --ic <name>         Benchmark a specific inference component"
    echo "  --adapter <name>    Benchmark a specific LoRA adapter IC"
    echo "  --force             Create a new benchmark job even if one is already running"
    echo "  --clean             Delete workload config and benchmark job after displaying results"
    echo "  --no-stale-warning  Suppress schema registry staleness warning"
    echo ""
    echo "IC resolution:"
    echo "  --adapter <name> Use ADAPTER_IC_NAME from do/adapters/<name>.conf"
    echo "  --ic <name>      Use IC_DEPLOYED_NAME from do/ic/<name>.conf"
    echo "  (no flag)        Use first IC in do/ic/ alphabetically, or legacy config"
    echo ""
    echo "Idempotency:"
    echo "  If a benchmark job is already in progress, re-running without --force"
    echo "  will resume waiting for the existing job and display its results."
    echo ""
    echo "Prerequisites:"
    echo "  • Endpoint must be deployed and InService (run ./do/deploy first)"
    echo "  • AWS credentials must be configured"
}

# Parse command-line flags into the global flag variables:
#   CLEAN_AFTER, FORCE, ARG_NO_STALE_WARNING  - "true"/"false"
#   ARG_WORKLOAD, IC_ARG, ADAPTER_ARG         - flag values ("" when absent)
# Every variable is reset to its default first. Unrecognised arguments are
# ignored. --help/-h prints usage and exits 0; a value-taking flag with no
# value exits 1 with an error on stderr.
_parse_flags() {
    CLEAN_AFTER=false
    FORCE=false
    IC_ARG=""
    ADAPTER_ARG=""
    ARG_NO_STALE_WARNING=false
    ARG_WORKLOAD=""
    while [ $# -gt 0 ]; do
        case "$1" in
            --clean) CLEAN_AFTER=true; shift ;;
            --force) FORCE=true; shift ;;
            --no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
            --workload) _require_flag_value "$1" "$#"; ARG_WORKLOAD="$2"; shift 2 ;;
            --ic) _require_flag_value "$1" "$#"; IC_ARG="$2"; shift 2 ;;
            --adapter) _require_flag_value "$1" "$#"; ADAPTER_ARG="$2"; shift 2 ;;
            --help|-h)
                _print_usage
                exit 0
                ;;
            *) shift ;;  # unknown arguments are ignored
        esac
    done
}

_parse_flags "$@"


# ── Require --workload flag ───────────────────────────────────────────────────
# --workload is mandatory. When it is missing, print the error, list the
# workloads found in the workload-picker catalog (if one can be located), show
# a usage example and exit 1.
if [ -z "${ARG_WORKLOAD}" ]; then
    echo "❌ --workload <name> is required"
    echo ""
    # Locate the catalog the same way the workload resolver does: project-local
    # node_modules first, then the global npm root.
    _CATALOG_REL="@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
    _CATALOG_FOR_HELP=""
    if [ -f "${SCRIPT_DIR}/../node_modules/${_CATALOG_REL}" ]; then
        _CATALOG_FOR_HELP="${SCRIPT_DIR}/../node_modules/${_CATALOG_REL}"
    elif command -v npm &>/dev/null; then
        _NPM_ROOT=$(npm root -g 2>/dev/null) || _NPM_ROOT=""
        if [ -n "${_NPM_ROOT}" ] && [ -f "${_NPM_ROOT}/${_CATALOG_REL}" ]; then
            _CATALOG_FOR_HELP="${_NPM_ROOT}/${_CATALOG_REL}"
        fi
    fi
    if [ -n "${_CATALOG_FOR_HELP}" ]; then
        echo "   Available workloads:"
        # The Python program is single-quoted so bash leaves its string
        # literals intact, and the catalog path travels as argv instead of
        # being spliced into the source text.
        python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    catalog = json.load(f)
for name, wl in catalog.get("workloads", {}).items():
    description = wl.get("description", "")[:50]
    print(f"     {name:30s} {description}")
' "${_CATALOG_FOR_HELP}" 2>/dev/null || echo "   (could not read workload catalog)"
    else
        echo "   Run 'ml-container-creator mcp init' to install workload profiles"
    fi
    echo ""
    echo "   Usage: ./do/benchmark --workload multi_turn_chat"
    exit 1
fi

# ── Workload Resolution (from workload-picker MCP server catalog) ─────────────
# If --workload names a workload other than "manual", resolve its parameters
# from the MCP server's catalog file. This overrides BENCHMARK_INPUT_TOKENS_MEAN,
# BENCHMARK_OUTPUT_TOKENS_MEAN, BENCHMARK_STREAMING, BENCHMARK_CONCURRENCY and
# (unless already set) BENCHMARK_CONCURRENCY_LEVELS from do/config.

# Read one workload profile from a catalog file.
#   $1 - path to the workload-profiles.json catalog
#   $2 - workload name
# Outputs five lines on stdout: input_tokens_mean, output_tokens_mean,
# streaming (lower-cased), comma-joined concurrency_levels, first level.
# Returns 0 on success, 3 when the catalog is unreadable or the workload is
# not in it, 4 when the workload exists but lacks a required field.
# Path and name are passed as argv so quotes or other special characters in
# them cannot alter the Python program.
_read_workload_profile() {
    python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as f:
        catalog = json.load(f)
    wl = catalog.get("workloads", {}).get(sys.argv[2])
except Exception:
    sys.exit(3)
if not wl:
    sys.exit(3)
try:
    levels = [str(x) for x in wl["concurrency_levels"]]
    fields = [
        str(wl["input_tokens_mean"]),
        str(wl["output_tokens_mean"]),
        str(wl["streaming"]).lower(),
        ",".join(levels),
        levels[0],
    ]
except (KeyError, IndexError, TypeError):
    sys.exit(4)
print("\n".join(fields))
' "$1" "$2"
}

BENCHMARK_WORKLOAD="${ARG_WORKLOAD:-manual}"

if [ "${BENCHMARK_WORKLOAD}" != "manual" ]; then
    # Locate the workload catalog (project-local node_modules, then npm global)
    _WORKLOAD_CATALOG_REL="@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
    _WORKLOAD_CATALOG=""
    if [ -f "$(dirname "${BASH_SOURCE[0]}")/../node_modules/${_WORKLOAD_CATALOG_REL}" ]; then
        _WORKLOAD_CATALOG="$(dirname "${BASH_SOURCE[0]}")/../node_modules/${_WORKLOAD_CATALOG_REL}"
    elif command -v npm &>/dev/null; then
        _NPM_ROOT=$(npm root -g 2>/dev/null) || _NPM_ROOT=""
        if [ -n "${_NPM_ROOT}" ] && [ -f "${_NPM_ROOT}/${_WORKLOAD_CATALOG_REL}" ]; then
            _WORKLOAD_CATALOG="${_NPM_ROOT}/${_WORKLOAD_CATALOG_REL}"
        fi
    fi

    if [ -n "${_WORKLOAD_CATALOG}" ]; then
        _WL_STATUS=0
        _WL_FIELDS=$(_read_workload_profile "${_WORKLOAD_CATALOG}" "${BENCHMARK_WORKLOAD}" 2>/dev/null) || _WL_STATUS=$?

        case "${_WL_STATUS}" in
            0)
                echo "📋 Workload profile: ${BENCHMARK_WORKLOAD}"
                _WL_LEVELS=""
                _WL_FIRST_LEVEL=""
                {
                    IFS= read -r BENCHMARK_INPUT_TOKENS_MEAN
                    IFS= read -r BENCHMARK_OUTPUT_TOKENS_MEAN
                    IFS= read -r BENCHMARK_STREAMING
                    IFS= read -r _WL_LEVELS
                    IFS= read -r _WL_FIRST_LEVEL
                } <<< "${_WL_FIELDS}"
                # Set concurrency levels for multi-level mode if not already overridden
                if [ -z "${BENCHMARK_CONCURRENCY_LEVELS:-}" ]; then
                    BENCHMARK_CONCURRENCY_LEVELS="${_WL_LEVELS}"
                fi
                # Override single-level BENCHMARK_CONCURRENCY with the first
                # level from the workload — except in a single-level child run
                # (_BENCHMARK_SINGLE_LEVEL set by the multi-level driver), where
                # the parent exported the level to test; overwriting it here
                # would make every level run at the same concurrency.
                if [ -z "${_BENCHMARK_SINGLE_LEVEL:-}" ] || [ -z "${BENCHMARK_CONCURRENCY:-}" ]; then
                    BENCHMARK_CONCURRENCY="${_WL_FIRST_LEVEL}"
                fi
                echo "   Input tokens: ${BENCHMARK_INPUT_TOKENS_MEAN}, Output tokens: ${BENCHMARK_OUTPUT_TOKENS_MEAN}"
                echo "   Streaming: ${BENCHMARK_STREAMING}, Concurrency: ${BENCHMARK_CONCURRENCY_LEVELS:-${BENCHMARK_CONCURRENCY}}"
                echo ""
                ;;
            4)
                # A profile with missing fields is a broken catalog entry; stop
                # with a readable message instead of a Python traceback.
                echo "❌ Workload profile '${BENCHMARK_WORKLOAD}' is malformed in ${_WORKLOAD_CATALOG}" >&2
                echo "   Required fields: input_tokens_mean, output_tokens_mean, streaming, concurrency_levels (non-empty)" >&2
                exit 1
                ;;
            *)
                echo "⚠️  Unknown workload '${BENCHMARK_WORKLOAD}' — using do/config defaults"
                ;;
        esac
    else
        echo "⚠️  Workload catalog not found — using do/config defaults"
    fi
fi

# ── Resolve profile-level values ──────────────────────────────────────────────
# Read the active bootstrap profile from ~/.ml-container-creator/config.json as
# a JSON object. Any problem (missing file, bad JSON, missing keys) yields "{}".
_PROFILE_JSON=""
if command -v python3 &>/dev/null; then
    _PROFILE_JSON=$(python3 -c '
import json, os
config_path = os.path.expanduser("~/.ml-container-creator/config.json")
try:
    with open(config_path) as f:
        config = json.load(f)
    profile = config["profiles"][config["activeProfile"]]
    print(json.dumps(profile))
except Exception:
    print("{}")
' 2>/dev/null) || _PROFILE_JSON="{}"
fi

# Extract benchmark-relevant profile values.
# Output location: s3://<benchmarkS3Bucket>/<project>/, falling back to a
# bucket named mlcc-benchmark-<accountId>-<awsRegion> when the profile has no
# benchmarkS3Bucket. The project name is passed as argv rather than spliced
# into the Python source. On failure the path is left empty.
BENCHMARK_S3_OUTPUT_PATH=$(printf '%s\n' "${_PROFILE_JSON}" | python3 -c '
import sys, json
p = json.load(sys.stdin)
bucket = p.get("benchmarkS3Bucket", "")
if not bucket:
    acct = p.get("accountId", "unknown")
    region = p.get("awsRegion", "us-east-1")
    bucket = f"mlcc-benchmark-{acct}-{region}"
print(f"s3://{bucket}/{sys.argv[1]}/")
' "${PROJECT_NAME}" 2>/dev/null) || BENCHMARK_S3_OUTPUT_PATH=""

CI_BENCHMARK_RESULTS_BUCKET=$(printf '%s\n' "${_PROFILE_JSON}" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("ciBenchmarkResultsBucket", ""))' 2>/dev/null) || CI_BENCHMARK_RESULTS_BUCKET=""

# Derive job names at runtime.
# BENCHMARK_JOB_NAME is only generated when do/config (or the environment) did
# not already provide one: the idempotency check further down looks up the
# configured job, and unconditionally overwriting the name here would make
# that check query a job that cannot exist. A fresh name is still assigned
# later whenever a new job is actually created.
# NOTE(review): assumes do/config persists BENCHMARK_JOB_NAME between runs —
# confirm against the part of the script that writes it.
_BENCHMARK_TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
BENCHMARK_JOB_NAME="${BENCHMARK_JOB_NAME:-${PROJECT_NAME}-benchmark-${_BENCHMARK_TIMESTAMP}}"
BENCHMARK_WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-${_BENCHMARK_TIMESTAMP}"

# Ensure benchmark params have defaults (in case workload catalog wasn't found)
BENCHMARK_CONCURRENCY=${BENCHMARK_CONCURRENCY:-10}
BENCHMARK_INPUT_TOKENS_MEAN=${BENCHMARK_INPUT_TOKENS_MEAN:-550}
BENCHMARK_OUTPUT_TOKENS_MEAN=${BENCHMARK_OUTPUT_TOKENS_MEAN:-150}
BENCHMARK_STREAMING=${BENCHMARK_STREAMING:-true}



# ── Multi-level concurrency support (CI Stage 2) ─────────────────────────────
# When BENCHMARK_CONCURRENCY_LEVELS is set (comma-separated integers, e.g. "1,4,8"
# or JSON array string, e.g. "[1,4,8]"), and we are NOT already in single-level
# execution mode (_BENCHMARK_SINGLE_LEVEL), the script iterates over each level,
# re-invoking itself once per level with BENCHMARK_CONCURRENCY exported.
# Results from all levels are aggregated into a combined JSON for the benchmark writer.
# This supports Requirement 1.5: configurable concurrency levels per config.
# This branch always terminates the script: exit 1 when no level ran or every
# level that ran failed, exit 0 otherwise.
if [ -n "${BENCHMARK_CONCURRENCY_LEVELS:-}" ] && [ -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]; then
    # Normalize: strip brackets and spaces, convert to comma-separated
    _NORMALIZED_LEVELS=$(echo "${BENCHMARK_CONCURRENCY_LEVELS}" | tr -d '[] ' )

    # Skip if empty after normalization
    if [ -n "${_NORMALIZED_LEVELS}" ]; then
        echo "📊 Multi-level benchmark: running concurrency levels [${_NORMALIZED_LEVELS}]"
        echo ""

        IFS=',' read -ra _LEVELS <<< "${_NORMALIZED_LEVELS}"
        _ALL_RESULTS_DIR="${SCRIPT_DIR}/../benchmarks/multi-level-$(date +%Y%m%d-%H%M%S)"
        mkdir -p "${_ALL_RESULTS_DIR}"
        _LEVEL_FAILURES=0
        # Number of levels actually executed. Invalid entries are skipped and
        # must not count toward the denominator of the all-failed check.
        _LEVELS_RUN=0

        for _LEVEL in "${_LEVELS[@]}"; do
            _LEVEL=$(echo "${_LEVEL}" | tr -d ' ')
            # Skip non-numeric values
            if ! [[ "${_LEVEL}" =~ ^[0-9]+$ ]]; then
                echo "⚠️  Skipping invalid concurrency level: ${_LEVEL}"
                continue
            fi
            _LEVELS_RUN=$((_LEVELS_RUN + 1))
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            echo "  Running benchmark at concurrency level: ${_LEVEL}"
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            echo ""

            # Re-invoke self with overridden concurrency and single-level flag
            export BENCHMARK_CONCURRENCY="${_LEVEL}"
            export _BENCHMARK_SINGLE_LEVEL=1
            # Build the argument list as an array so values containing spaces
            # or glob characters reach the child as single, unmodified words.
            _REINVOKE_ARGS=(--force)
            if [ "${CLEAN_AFTER}" = true ]; then _REINVOKE_ARGS+=(--clean); fi
            if [ "${ARG_NO_STALE_WARNING}" = true ]; then _REINVOKE_ARGS+=(--no-stale-warning); fi
            if [ -n "${ARG_WORKLOAD}" ]; then _REINVOKE_ARGS+=(--workload "${ARG_WORKLOAD}"); fi
            if [ -n "${IC_ARG}" ]; then _REINVOKE_ARGS+=(--ic "${IC_ARG}"); fi
            if [ -n "${ADAPTER_ARG}" ]; then _REINVOKE_ARGS+=(--adapter "${ADAPTER_ARG}"); fi

            if "${BASH_SOURCE[0]}" "${_REINVOKE_ARGS[@]}"; then
                # Copy results to aggregation directory — find the child's results
                # Try the marker file first (set by child), then fall back to ls -td
                _LATEST_JOB_DIR=""
                if [ -f "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" ]; then
                    _LATEST_JOB_DIR=$(cat "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" 2>/dev/null)
                fi
                if [ -z "${_LATEST_JOB_DIR}" ] || [ ! -d "${_LATEST_JOB_DIR}" ]; then
                    _LATEST_JOB_DIR=$(ls -td "${SCRIPT_DIR}/../benchmarks/${PROJECT_NAME}-benchmark-"* 2>/dev/null | head -1)
                fi
                # Prefer the per-request JSONL export; fall back to the summary JSON.
                if [ -n "${_LATEST_JOB_DIR}" ] && [ -d "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export.jsonl" ]; then
                    cp "${_LATEST_JOB_DIR}/output/profile_export.jsonl" "${_ALL_RESULTS_DIR}/profile-concurrency-${_LEVEL}.jsonl"
                elif [ -n "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" ]; then
                    cp "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" "${_ALL_RESULTS_DIR}/results-concurrency-${_LEVEL}.json"
                fi
            else
                echo "⚠️  Benchmark at concurrency ${_LEVEL} failed (non-fatal, continuing)"
                _LEVEL_FAILURES=$((_LEVEL_FAILURES + 1))
            fi
            unset _BENCHMARK_SINGLE_LEVEL
            echo ""
        done

        # Aggregate results into a combined JSON file for the benchmark writer
        # Reads per-level JSONL files and computes aggregate metrics per concurrency level
        echo "📊 Aggregating multi-level results..."
        _COMBINED_FILE="${_ALL_RESULTS_DIR}/results.json"
        python3 -c "
import json, glob, sys, os, math

def percentile(sorted_vals, pct):
    if not sorted_vals:
        return 0.0
    idx = (pct / 100.0) * (len(sorted_vals) - 1)
    lower = int(math.floor(idx))
    upper = int(math.ceil(idx))
    if lower == upper:
        return sorted_vals[lower]
    frac = idx - lower
    return sorted_vals[lower] * (1 - frac) + sorted_vals[upper] * frac

def get_val(metrics, key):
    m = metrics.get(key)
    if isinstance(m, dict):
        return m.get('value')
    return m

results_dir = '${_ALL_RESULTS_DIR}'
combined = {'metrics': []}

# Process JSONL files (preferred)
for f in sorted(glob.glob(os.path.join(results_dir, 'profile-concurrency-*.jsonl'))):
    try:
        level = int(os.path.basename(f).replace('profile-concurrency-', '').replace('.jsonl', ''))
        records = []
        with open(f) as fp:
            for line in fp:
                line = line.strip()
                if line:
                    records.append(json.loads(line))

        if not records:
            continue

        # Aggregate per-request metrics
        latencies, ttfts, itls, ttsts, out_tokens = [], [], [], [], []
        start_times, end_times, in_tokens = [], [], []
        prefill_tps, output_tps = [], []

        for rec in records:
            meta = rec.get('metadata', {})
            metrics = rec.get('metrics', {})
            lat = get_val(metrics, 'request_latency')
            if lat is not None: latencies.append(lat)
            ttft = get_val(metrics, 'time_to_first_token') or get_val(metrics, 'time_to_first_output_token')
            if ttft is not None: ttfts.append(ttft)
            itl = get_val(metrics, 'inter_token_latency')
            if itl is not None: itls.append(itl)
            ttst = get_val(metrics, 'time_to_second_token')
            if ttst is not None: ttsts.append(ttst)
            otc = get_val(metrics, 'output_token_count')
            if otc is not None: out_tokens.append(otc)
            isl = get_val(metrics, 'input_sequence_length')
            if isl is not None: in_tokens.append(isl)
            ptps = get_val(metrics, 'prefill_throughput_per_user')
            if ptps is not None: prefill_tps.append(ptps)
            otps = get_val(metrics, 'output_token_throughput_per_user')
            if otps is not None: output_tps.append(otps)
            rs = meta.get('request_start_ns')
            re_ = meta.get('request_end_ns')
            if rs: start_times.append(rs)
            if re_: end_times.append(re_)

        # Sort for percentiles
        latencies.sort()
        ttfts.sort()
        itls.sort()
        ttsts.sort()
        prefill_tps.sort()
        output_tps.sort()

        # Compute throughput
        duration_s = (max(end_times) - min(start_times)) / 1e9 if start_times and end_times else 1.0
        duration_s = max(duration_s, 0.001)
        req_throughput = len(records) / duration_s
        token_throughput = sum(out_tokens) / duration_s if out_tokens else 0.0

        entry = {
            'concurrency': level,
            'request_throughput': req_throughput,
            'output_token_throughput': token_throughput,
            'total_requests': len(records),
            'duration_seconds': duration_s,
            'time_to_first_token': {
                'avg': sum(ttfts)/len(ttfts) if ttfts else 0.0,
                'p50': percentile(ttfts, 50),
                'p90': percentile(ttfts, 90),
                'p99': percentile(ttfts, 99),
            },
            'inter_token_latency': {
                'avg': sum(itls)/len(itls) if itls else 0.0,
                'p50': percentile(itls, 50),
                'p90': percentile(itls, 90),
                'p99': percentile(itls, 99),
            },
            'e2e_latency': {
                'avg': sum(latencies)/len(latencies) if latencies else 0.0,
                'p50': percentile(latencies, 50),
                'p90': percentile(latencies, 90),
                'p99': percentile(latencies, 99),
            },
            'time_to_second_token': {
                'p50': percentile(ttsts, 50),
                'p90': percentile(ttsts, 90),
            },
            'prefill_throughput': {
                'avg': sum(prefill_tps)/len(prefill_tps) if prefill_tps else 0.0,
                'p50': percentile(prefill_tps, 50),
            },
            'output_token_throughput_detail': {
                'avg': sum(output_tps)/len(output_tps) if output_tps else 0.0,
                'p50': percentile(output_tps, 50),
                'p90': percentile(output_tps, 90),
            },
            'total_token_throughput': (sum(out_tokens) + sum(in_tokens)) / duration_s if (out_tokens or in_tokens) else 0.0,
            'output_sequence_length': sum(out_tokens)/len(out_tokens) if out_tokens else 0.0,
            'input_sequence_length': sum(in_tokens)/len(in_tokens) if in_tokens else 0.0,
            'request_count': len(records),
            'input_tokens_mean': ${BENCHMARK_INPUT_TOKENS_MEAN:-0},
            'output_tokens_mean': ${BENCHMARK_OUTPUT_TOKENS_MEAN:-0},
        }
        combined['metrics'].append(entry)
    except Exception as e:
        print(f'Warning: Could not parse {f}: {e}', file=sys.stderr)

# Fallback: process old-style JSON files if no JSONL found
if not combined['metrics']:
    for f in sorted(glob.glob(os.path.join(results_dir, 'results-concurrency-*.json'))):
        try:
            with open(f) as fp:
                data = json.load(fp)
            level = int(os.path.basename(f).replace('results-concurrency-', '').replace('.json', ''))
            if isinstance(data, dict):
                data['concurrency'] = level
                combined['metrics'].append(data)
        except Exception as e:
            print(f'Warning: Could not parse {f}: {e}', file=sys.stderr)

with open('${_COMBINED_FILE}', 'w') as fp:
    try:
        json.dump(combined, fp, indent=2)
    except TypeError as te:
        print(f'Warning: JSON serialize error: {str(te)}', file=sys.stderr)
        fp.write(json.dumps({'metrics': []}, indent=2))
n_metrics = len(combined.get('metrics', []))
print(f'Combined {n_metrics} concurrency level results')
" 2>&1

        # Persist to Athena if CI mode is active
        if [ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" ] && [ -f "${_COMBINED_FILE}" ]; then
            echo ""
            echo "📊 Persisting multi-level benchmark results to Athena..."

            # First 16 hex characters of the SHA-256 of the deployment identity
            # string; uses sha256sum when available, otherwise shasum -a 256.
            _compute_config_id() {
                local input="${DEPLOYMENT_CONFIG}:${MODEL_NAME:-none}:${INSTANCE_TYPE}:${AWS_REGION}:${DEPLOYMENT_TARGET}:ic${IC_COUNT:-1}:adapt${ADAPTER_COUNT:-0}"
                if command -v sha256sum &> /dev/null; then
                    echo -n "$input" | sha256sum | cut -c1-16
                else
                    echo -n "$input" | shasum -a 256 | cut -c1-16
                fi
            }
            CONFIG_ID=$(_compute_config_id)

            if python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
                --results-file "${_COMBINED_FILE}" \
                --config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
                --project-name "${PROJECT_NAME}" \
                --workload "${BENCHMARK_WORKLOAD:-manual}" \
                --bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
                --region "${AWS_REGION:-${REGION}}"; then
                echo "✅ Multi-level benchmark results persisted to S3"
            else
                echo "⚠️  Failed to persist multi-level benchmark results to Athena (non-fatal)"
            fi
        fi

        echo ""
        echo "📋 Multi-level Summary:"
        echo "   Levels tested: ${_NORMALIZED_LEVELS}"
        echo "   Failures: ${_LEVEL_FAILURES} / ${_LEVELS_RUN}"
        echo "   Results: ${_ALL_RESULTS_DIR}/"

        # Compare against the number of levels that actually ran, so skipped
        # invalid entries cannot hide the fact that every real run failed.
        if [ "${_LEVELS_RUN}" -eq 0 ]; then
            echo "❌ No valid concurrency levels to run"
            exit 1
        fi
        if [ "${_LEVEL_FAILURES}" -ge "${_LEVELS_RUN}" ]; then
            echo "❌ All concurrency levels failed"
            exit 1
        fi
        exit 0
    fi
fi

# ── _check_schema_registry_staleness() ────────────────────────────────────────
# Warn if the schema registry manifest's lastSynced timestamp is older than threshold.
# Configurable via MCC_CATALOG_STALENESS_DAYS (default: 90).
# Suppressed by --no-stale-warning flag or MCC_NO_STALE_WARNING=true env var.
#
# Globals:  MCC_NO_STALE_WARNING, ARG_NO_STALE_WARNING,
#           MCC_CATALOG_STALENESS_DAYS, HOME (all read)
# Outputs:  one warning line on stdout when the manifest is older than the
#           threshold; nothing otherwise
# Returns:  always 0 — this is a best-effort hint. A missing manifest, an
#           unparsable manifest/timestamp/threshold, or a failing python3 all
#           result in no warning rather than an error.
_check_schema_registry_staleness() {
    if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
        return 0
    fi
    local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
    local manifest_file="${HOME}/.ml-container-creator/schemas/manifest.json"
    if [ ! -f "${manifest_file}" ]; then
        return 0
    fi
    # The manifest path and threshold are passed as argv (not spliced into the
    # Python source), so a quote in HOME cannot break the program. The trailing
    # `||` keeps a failing python3 from aborting the script under `set -e`.
    local days_old
    days_old=$(python3 -c '
import json, sys
from datetime import datetime, timezone
try:
    with open(sys.argv[1]) as f:
        manifest = json.load(f)
    ls = manifest.get("lastSynced", "")
    if ls:
        synced = datetime.fromisoformat(ls.replace("Z", "+00:00"))
        days = (datetime.now(timezone.utc) - synced).days
        if days > int(sys.argv[2]):
            print(days)
except Exception:
    pass
' "${manifest_file}" "${threshold}" 2>/dev/null) || days_old=""
    if [ -n "${days_old}" ]; then
        echo "⚠️  Schema registry is ${days_old} days old. Run 'ml-container-creator bootstrap sync-schemas' to update."
    fi
    return 0
}

_check_schema_registry_staleness

# ── Verify AWS CLI v2 ─────────────────────────────────────────────────────────
# Capture the version banner once, then require it to contain "aws-cli/2".
# On mismatch, report the first line of whatever `aws --version` printed and
# exit 1.
_AWS_CLI_VERSION=$(aws --version 2>&1) || true
case "${_AWS_CLI_VERSION}" in
    *aws-cli/2*)
        ;;
    *)
        printf '%s\n' \
            "❌ AWS CLI v2 is required for benchmarking." \
            "   The SageMaker AI Benchmarking API is only available in CLI v2." \
            "   Detected: $(head -n 1 <<< "${_AWS_CLI_VERSION}")" \
            "" \
            "   Install CLI v2: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html"
        exit 1
        ;;
esac

# ── Resolve inference component name ──────────────────────────────────────────
# Sets IC_NAME from the first source that applies:
#   1. --adapter <name>  → ADAPTER_IC_NAME from do/adapters/<name>.conf
#   2. --ic <name>       → IC_DEPLOYED_NAME from do/ic/<name>.conf
#   3. do/ic/ exists     → first conf (glob order) with a non-empty IC_DEPLOYED_NAME
#   4. otherwise         → INFERENCE_COMPONENT_NAME from do/config (may be empty)
# Cases 1–3 exit 1 with a message when the conf or the expected variable is missing.
IC_NAME=""
if [[ -n "${ADAPTER_ARG}" ]]; then
    # Case 1: adapter IC selected with --adapter
    ADAPTER_CONF="${SCRIPT_DIR}/adapters/${ADAPTER_ARG}.conf"
    if [[ ! -f "${ADAPTER_CONF}" ]]; then
        printf '%s\n' \
            "❌ Adapter config not found: do/adapters/${ADAPTER_ARG}.conf" \
            "   Available adapters:"
        if [[ -d "${SCRIPT_DIR}/adapters" ]]; then
            for conf in "${SCRIPT_DIR}"/adapters/*.conf; do
                # An unmatched glob stays literal; only list real files
                if [[ -f "${conf}" ]]; then
                    echo "     • $(basename "${conf}" .conf)"
                fi
            done
        else
            echo "     (none)"
        fi
        exit 1
    fi
    # Reset first so a value left over from do/config is not mistaken for this adapter's
    ADAPTER_IC_NAME=""
    source "${ADAPTER_CONF}"
    if [[ -z "${ADAPTER_IC_NAME}" ]]; then
        echo "❌ Adapter '${ADAPTER_ARG}' conf is missing ADAPTER_IC_NAME."
        exit 1
    fi
    IC_NAME="${ADAPTER_IC_NAME}"
elif [[ -n "${IC_ARG}" ]]; then
    # Case 2: inference component selected with --ic
    IC_CONF="${SCRIPT_DIR}/ic/${IC_ARG}.conf"
    if [[ ! -f "${IC_CONF}" ]]; then
        echo "❌ IC config not found: do/ic/${IC_ARG}.conf"
        exit 1
    fi
    IC_DEPLOYED_NAME=""
    source "${IC_CONF}"
    if [[ -z "${IC_DEPLOYED_NAME}" ]]; then
        echo "❌ IC '${IC_ARG}' has not been deployed yet. Run ./do/deploy --ic ${IC_ARG} first."
        exit 1
    fi
    IC_NAME="${IC_DEPLOYED_NAME}"
elif [[ -d "${SCRIPT_DIR}/ic" ]]; then
    # Case 3: no flag given — take the first deployed IC in glob (alphabetical) order
    for conf in "${SCRIPT_DIR}"/ic/*.conf; do
        if [[ ! -f "${conf}" ]]; then
            continue
        fi
        IC_DEPLOYED_NAME=""
        source "${conf}"
        if [[ -z "${IC_DEPLOYED_NAME}" ]]; then
            continue
        fi
        IC_NAME="${IC_DEPLOYED_NAME}"
        break
    done
    if [[ -z "${IC_NAME}" ]]; then
        echo "❌ No ICs deployed. Run ./do/deploy first."
        exit 1
    fi
else
    # Case 4 (legacy): no do/ic/ directory, use INFERENCE_COMPONENT_NAME from do/config
    IC_NAME="${INFERENCE_COMPONENT_NAME:-}"
fi

# ── Helper: update a variable in do/config ────────────────────────────────────
# Set `export NAME="value"` in ${SCRIPT_DIR}/config: rewrite the existing
# `export NAME=` line in place if there is one, otherwise append a new line.
#   $1 - variable name
#   $2 - value (single line)
# The value is escaped before being used as a sed replacement: an unescaped
# `&` would expand to the matched text, `|` would end the expression (it is the
# delimiter), and `\` would start an escape sequence.
# NOTE: the value is written inside double quotes verbatim; values containing
# `"`, `$` or backticks would not survive being sourced back.
_update_benchmark_var() {
    local var_name="$1"
    local var_value="$2"
    local config_file="${SCRIPT_DIR}/config"

    if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
        local escaped_value
        escaped_value=$(printf '%s' "${var_value}" | sed 's/[\\&|]/\\&/g')
        # -i.bak (suffix attached) is accepted by both GNU and BSD sed
        sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${escaped_value}\"|" "${config_file}"
        rm -f "${config_file}.bak"
    else
        echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
    fi
}

# ── Idempotency: Check for existing benchmark job ─────────────────────────────
# Without --force, look up the job named by BENCHMARK_JOB_NAME and decide:
#   InProgress/Starting/Pending → resume waiting for it (RESUME_EXISTING=true)
#   Completed                   → reuse it (RESUME_EXISTING=true, JOB_STATUS=Completed)
#   Failed/Stopped              → report and exit 1
#   anything else / lookup fail → fall through and create a new job
RESUME_EXISTING=false

if [[ "${FORCE}" == false && -n "${BENCHMARK_JOB_NAME:-}" ]]; then
    if ! EXISTING_STATUS=$(aws sagemaker describe-ai-benchmark-job \
        --ai-benchmark-job-name "${BENCHMARK_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query 'AIBenchmarkJobStatus' \
        --output text 2>/dev/null); then
        # Job doesn't exist or can't be described — proceed with new job
        EXISTING_STATUS=""
    fi

    if [[ "${EXISTING_STATUS}" == "InProgress" || "${EXISTING_STATUS}" == "Starting" || "${EXISTING_STATUS}" == "Pending" ]]; then
        printf '%s\n' \
            "📊 Resuming existing benchmark job: ${BENCHMARK_JOB_NAME}" \
            "   Status: ${EXISTING_STATUS}" \
            "   (use --force to start a new benchmark instead)" \
            ""
        RESUME_EXISTING=true
    elif [[ "${EXISTING_STATUS}" == "Completed" ]]; then
        printf '%s\n' \
            "📊 Previous benchmark job already completed: ${BENCHMARK_JOB_NAME}" \
            "   (use --force to start a new benchmark)" \
            ""
        RESUME_EXISTING=true
        JOB_STATUS="Completed"
    elif [[ "${EXISTING_STATUS}" == "Failed" || "${EXISTING_STATUS}" == "Stopped" ]]; then
        if ! FAILURE_REASON=$(aws sagemaker describe-ai-benchmark-job \
            --ai-benchmark-job-name "${BENCHMARK_JOB_NAME}" \
            --region "${AWS_REGION}" \
            --query 'FailureReason' \
            --output text 2>/dev/null); then
            FAILURE_REASON="unknown"
        fi
        echo "⚠️  Previous benchmark job ${EXISTING_STATUS}: ${BENCHMARK_JOB_NAME}"
        # The reason is shown only for Failed jobs with a real value;
        # "None" and the empty string are skipped.
        if [[ "${EXISTING_STATUS}" == "Failed" && -n "${FAILURE_REASON}" && "${FAILURE_REASON}" != "None" ]]; then
            echo "   Reason: ${FAILURE_REASON}"
        fi
        echo "   Use --force to start a new benchmark."
        exit 1
    fi
fi

# ── Configuration ─────────────────────────────────────────────────────────────
# Fresh, timestamped resource names for this invocation. The job name is kept
# as-is when an existing job is being resumed.
WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
case "${RESUME_EXISTING}" in
    false) BENCHMARK_JOB_NAME="${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)" ;;
esac
# Polling budget: 60 attempts, 30 seconds apart
POLL_INTERVAL=30
MAX_POLL_ATTEMPTS=60  # 30 minutes max (60 * 30s)

# Summary banner of the effective settings
printf '%s\n' \
    "📊 SageMaker AI Benchmark" \
    "   Project: ${PROJECT_NAME}" \
    "   Endpoint: ${ENDPOINT_NAME:-not set}" \
    "   Inference Component: ${IC_NAME:-not set}" \
    "   Concurrency: ${BENCHMARK_CONCURRENCY}" \
    "   Input tokens (mean): ${BENCHMARK_INPUT_TOKENS_MEAN}" \
    "   Output tokens (mean): ${BENCHMARK_OUTPUT_TOKENS_MEAN}" \
    "   Streaming: ${BENCHMARK_STREAMING}"
# The request count line appears only when a count is configured
if [[ -n "${BENCHMARK_REQUEST_COUNT:-}" ]]; then
    printf '%s\n' "   Request count: ${BENCHMARK_REQUEST_COUNT}"
fi
printf '%s\n' \
    "   S3 output: ${BENCHMARK_S3_OUTPUT_PATH}" \
    ""

# ── Pre-flight check: Verify endpoint is InService ────────────────────────────
if [ "${RESUME_EXISTING}" = false ]; then

# Pre-flight: the endpoint named by ENDPOINT_NAME must exist and be InService.
# Each failure mode prints its own message and exits 1.
echo "🔍 Pre-flight: Verifying endpoint status..."

if [[ -z "${ENDPOINT_NAME:-}" ]]; then
    printf '%s\n' \
        "❌ ENDPOINT_NAME is not set in do/config" \
        "   Deploy your endpoint first: ./do/deploy"
    exit 1
fi

# A failing describe call (missing endpoint, bad credentials) is reported here;
# the CLI's own stderr is discarded.
if ! ENDPOINT_STATUS=$(aws sagemaker describe-endpoint \
    --endpoint-name "${ENDPOINT_NAME}" \
    --region "${AWS_REGION}" \
    --query 'EndpointStatus' \
    --output text 2>/dev/null); then
    printf '%s\n' \
        "❌ Failed to describe endpoint: ${ENDPOINT_NAME}" \
        "   Check that the endpoint exists and your AWS credentials are valid."
    exit 1
fi

case "${ENDPOINT_STATUS}" in
    InService)
        ;;
    *)
        printf '%s\n' \
            "❌ Endpoint is not InService (current status: ${ENDPOINT_STATUS})" \
            "   The endpoint must be InService before running a benchmark." \
            "   Check status: aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION}"
        exit 1
        ;;
esac

echo "✅ Endpoint is InService: ${ENDPOINT_NAME}"

# ── Pre-flight check: Ensure S3 output bucket exists ──────────────────────────
# Derive the bucket name from BENCHMARK_S3_OUTPUT_PATH (s3://<bucket>/<key...>)
# and create the bucket in AWS_REGION when head-bucket cannot reach it.
echo "🔍 Pre-flight: Checking S3 output bucket..."

# Strip the scheme, then everything from the first "/" on
BENCHMARK_S3_BUCKET="${BENCHMARK_S3_OUTPUT_PATH#s3://}"
BENCHMARK_S3_BUCKET="${BENCHMARK_S3_BUCKET%%/*}"

# The output path is left empty when it cannot be derived from the bootstrap
# profile; fail here with an actionable message instead of attempting to
# create a bucket with an empty name.
if [ -z "${BENCHMARK_S3_BUCKET}" ]; then
    echo "❌ Could not determine the benchmark S3 bucket (BENCHMARK_S3_OUTPUT_PATH='${BENCHMARK_S3_OUTPUT_PATH}')" >&2
    echo "   Check the active profile in ~/.ml-container-creator/config.json" >&2
    exit 1
fi

# head-bucket may print bucket metadata on success; only its exit status matters
if aws s3api head-bucket --bucket "${BENCHMARK_S3_BUCKET}" --region "${AWS_REGION}" >/dev/null 2>&1; then
    echo "✅ S3 bucket exists: ${BENCHMARK_S3_BUCKET}"
else
    echo "📦 Creating S3 bucket: ${BENCHMARK_S3_BUCKET}"
    _CREATE_BUCKET_ARGS=(--bucket "${BENCHMARK_S3_BUCKET}" --region "${AWS_REGION}")
    # us-east-1 is created without a location constraint; every other region
    # is created with LocationConstraint set to the region name
    if [ "${AWS_REGION}" != "us-east-1" ]; then
        _CREATE_BUCKET_ARGS+=(--create-bucket-configuration "LocationConstraint=${AWS_REGION}")
    fi
    if ! aws s3api create-bucket "${_CREATE_BUCKET_ARGS[@]}"; then
        echo "❌ Failed to create S3 bucket: ${BENCHMARK_S3_BUCKET}"
        exit 1
    fi
    echo "✅ S3 bucket created: ${BENCHMARK_S3_BUCKET}"
fi

# ── Pre-flight check: Ensure Secrets Manager secret for HF token ──────────────
# The benchmarking service requires a Secrets Manager ARN for tokenizer access.
# Sets SECRET_ARN to:
#   • HF_TOKEN_ARN when it is provided, or
#   • the ARN of ml-container-creator/<project>/hf-token after creating or
#     updating that secret from the plaintext HF_TOKEN, or
#   • "" when neither is available (a warning is printed).
SECRET_ARN=""

if [ -n "${HF_TOKEN_ARN:-}" ]; then
    # Already using Secrets Manager ARN — use it directly
    SECRET_ARN="${HF_TOKEN_ARN}"
    echo "✅ Using existing Secrets Manager ARN for HF token: ${SECRET_ARN}"
elif [ -n "${HF_TOKEN:-}" ]; then
    # Plaintext HF token provided — store in Secrets Manager for the benchmark service
    SECRET_NAME="ml-container-creator/${PROJECT_NAME}/hf-token"
    echo "🔐 Pre-flight: Ensuring Secrets Manager secret for HF token..."

    # NOTE(review): the token is passed on the aws command line below and is
    # therefore visible in the process list while the call runs; consider
    # --secret-string file://<path> with a private temp file.

    # Existence probe only: discard stdout as well as stderr so the secret's
    # metadata JSON is neither printed nor sent to the CLI pager.
    if ! aws secretsmanager describe-secret --secret-id "$SECRET_NAME" --region "$AWS_REGION" >/dev/null 2>&1; then
        echo "   Creating Secrets Manager secret: ${SECRET_NAME}"
        aws secretsmanager create-secret \
            --name "$SECRET_NAME" \
            --secret-string "$HF_TOKEN" \
            --region "$AWS_REGION" > /dev/null || {
            echo "❌ Failed to create Secrets Manager secret"
            exit 1
        }
    else
        echo "   Updating Secrets Manager secret: ${SECRET_NAME}"
        aws secretsmanager put-secret-value \
            --secret-id "$SECRET_NAME" \
            --secret-string "$HF_TOKEN" \
            --region "$AWS_REGION" > /dev/null || {
            echo "❌ Failed to update Secrets Manager secret"
            exit 1
        }
    fi

    # Report a failed lookup explicitly instead of exiting without a message
    SECRET_ARN=$(aws secretsmanager describe-secret \
        --secret-id "$SECRET_NAME" \
        --region "$AWS_REGION" \
        --query 'ARN' \
        --output text) || {
        echo "❌ Failed to look up the ARN of Secrets Manager secret: ${SECRET_NAME}"
        exit 1
    }
    echo "✅ HF token stored in Secrets Manager: ${SECRET_ARN}"
else
    echo "⚠️  No HF_TOKEN provided — tokenizer-based metrics (TTFT, ITL) may be unavailable"
fi

echo ""

# ── Step 1: Create AI Workload Config ─────────────────────────────────────────
# Assemble the inline workload spec as JSON text from the do/config variables:
# benchmark type, parameters, tooling and (optionally) secrets.
echo "⚙️  Step 1: Creating AI Workload Config: ${WORKLOAD_CONFIG_NAME}"

# Parameters object — left unterminated so request_count can still be appended
printf -v PARAMS_JSON \
    '{"prompt_input_tokens_mean":%s,"output_tokens_mean":%s,"concurrency":%s,"streaming":%s,"tokenizer":"%s"' \
    "${BENCHMARK_INPUT_TOKENS_MEAN}" \
    "${BENCHMARK_OUTPUT_TOKENS_MEAN}" \
    "${BENCHMARK_CONCURRENCY}" \
    "${BENCHMARK_STREAMING}" \
    "${MODEL_NAME}"

# request_count is optional; only emitted when configured
if [[ -n "${BENCHMARK_REQUEST_COUNT:-}" ]]; then
    PARAMS_JSON+=",\"request_count\":${BENCHMARK_REQUEST_COUNT}"
fi

# Close the parameters object
PARAMS_JSON+='}'

# Secrets member (leading comma included) — present only when an HF token ARN exists
if [[ -n "${SECRET_ARN}" ]]; then
    printf -v SECRETS_JSON ',"secrets":{"hf_token":"%s"}' "${SECRET_ARN}"
else
    SECRETS_JSON=""
fi

# Full workload spec: the text that goes into the WorkloadSpec.Inline field
printf -v WORKLOAD_SPEC \
    '{"benchmark":{"type":"aiperf"},"parameters":%s,"tooling":{"api_standard":"openai"}%s}' \
    "${PARAMS_JSON}" \
    "${SECRETS_JSON}"

# API wrapper: --ai-workload-configs '{"WorkloadSpec":{"Inline":"..."}}'
# Inline carries the spec as a JSON-encoded *string*, so python3 does the escaping.
_INLINE_ENCODED=$(python3 -c 'import sys,json; print(json.dumps(sys.stdin.read().strip()))' <<<"${WORKLOAD_SPEC}")
WORKLOAD_CONFIGS="{\"WorkloadSpec\":{\"Inline\":${_INLINE_ENCODED}}}"

# Workload config idempotency: reuse if params match, recreate if they differ.
#
# A single describe call serves as both the existence probe (exit status) and the
# fetch of the stored inline spec (stdout). Outcome is CREATE_WORKLOAD_CONFIG:
#   false → an existing config with an identical spec was found and is reused
#   true  → no config exists, or the existing one differs / cannot be compared
#           (in which case it is deleted first so the create call does not collide)
WORKLOAD_CONFIG_EXISTS=false
if EXISTING_CONFIG_SPEC=$(aws sagemaker describe-ai-workload-config \
    --ai-workload-config-name "${WORKLOAD_CONFIG_NAME}" \
    --region "${AWS_REGION}" \
    --query 'AIWorkloadConfigs.WorkloadSpec.Inline' \
    --output text 2>/dev/null); then
    WORKLOAD_CONFIG_EXISTS=true
else
    EXISTING_CONFIG_SPEC=""
fi

CREATE_WORKLOAD_CONFIG=true
if [ "${WORKLOAD_CONFIG_EXISTS}" = true ]; then
    # Compare existing spec with desired spec (normalize key order for comparison).
    # A spec that fails to parse normalizes to the empty string.
    EXISTING_NORMALIZED=$(printf '%s' "${EXISTING_CONFIG_SPEC}" | python3 -c "import sys,json; print(json.dumps(json.loads(sys.stdin.read()), sort_keys=True))" 2>/dev/null) || EXISTING_NORMALIZED=""
    DESIRED_NORMALIZED=$(printf '%s' "${WORKLOAD_SPEC}" | python3 -c "import sys,json; print(json.dumps(json.loads(sys.stdin.read()), sort_keys=True))" 2>/dev/null) || DESIRED_NORMALIZED=""

    # Require a non-empty normalized form: two failed normalizations ("" = "")
    # must not be mistaken for a match.
    if [ -n "${DESIRED_NORMALIZED}" ] && [ "${EXISTING_NORMALIZED}" = "${DESIRED_NORMALIZED}" ]; then
        echo "   ✅ Existing workload config matches current parameters — reusing"
        CREATE_WORKLOAD_CONFIG=false
    else
        echo "   ⚠️  Workload config parameters changed — recreating..."
        # Best-effort delete; a failure here surfaces as an error from the create call
        aws sagemaker delete-ai-workload-config \
            --ai-workload-config-name "${WORKLOAD_CONFIG_NAME}" \
            --region "${AWS_REGION}" || true
    fi
fi

# Create the workload config unless an identical one is being reused
# (CREATE_WORKLOAD_CONFIG defaults to "true" when unset).
case "${CREATE_WORKLOAD_CONFIG:-true}" in
    true)
        aws sagemaker create-ai-workload-config \
            --ai-workload-config-name "${WORKLOAD_CONFIG_NAME}" \
            --ai-workload-configs "${WORKLOAD_CONFIGS}" \
            --region "${AWS_REGION}" || {
            echo "❌ Failed to create AI Workload Config"
            echo "   This may indicate the SageMaker AI Benchmarking API is not available in region: ${AWS_REGION}"
            echo "   Check: https://docs.aws.amazon.com/sagemaker/latest/dg/regions-quotas.html"
            exit 1
        }
        echo "✅ Workload config created: ${WORKLOAD_CONFIG_NAME}"
        ;;
esac

# Record the config name so a later run can resume
_update_benchmark_var "BENCHMARK_WORKLOAD_CONFIG_NAME" "${WORKLOAD_CONFIG_NAME}"
echo ""

# ── Step 2: Create AI Benchmark Job ──────────────────────────────────────────
# Point the job at the deployed endpoint + inference component and attach the
# workload config from Step 1.
echo "🚀 Step 2: Creating AI Benchmark Job: ${BENCHMARK_JOB_NAME}"

# JSON arguments for the create call
printf -v BENCHMARK_TARGET \
    '{"Endpoint":{"Identifier":"%s","InferenceComponents":[{"Identifier":"%s"}]}}' \
    "${ENDPOINT_NAME}" \
    "${IC_NAME}"
printf -v OUTPUT_CONFIG '{"S3OutputLocation":"%s"}' "${BENCHMARK_S3_OUTPUT_PATH}"

aws sagemaker create-ai-benchmark-job \
    --ai-benchmark-job-name "${BENCHMARK_JOB_NAME}" \
    --benchmark-target "${BENCHMARK_TARGET}" \
    --output-config "${OUTPUT_CONFIG}" \
    --ai-workload-config-identifier "${WORKLOAD_CONFIG_NAME}" \
    --role-arn "${ROLE_ARN}" \
    --region "${AWS_REGION}" || {
    echo "❌ Failed to create AI Benchmark Job"
    echo "   Check that:"
    echo "   • The execution role has sagemaker:CreateAIBenchmarkJob permission"
    echo "   • The endpoint and inference component are valid"
    echo "   • The S3 output path is accessible: ${BENCHMARK_S3_OUTPUT_PATH}"
    exit 1
}

echo "✅ Benchmark job created: ${BENCHMARK_JOB_NAME}"

# Remember the job name in do/config so a re-run picks up this job
_update_benchmark_var "BENCHMARK_JOB_NAME" "${BENCHMARK_JOB_NAME}"

echo ""

fi  # end of RESUME_EXISTING=false block

# ── Step 3: Poll for completion ───────────────────────────────────────────────
# Poll describe-ai-benchmark-job every POLL_INTERVAL seconds until terminal state.
# Terminal states: Completed, Failed, Stopped
# Gives up (exit 1) after MAX_POLL_ATTEMPTS non-terminal polls, or immediately if
# the describe call itself fails. On normal exit JOB_STATUS holds the terminal state.

# Skip polling if we already know the job completed (resumed a finished job)
if [ "${JOB_STATUS:-}" != "Completed" ] && [ "${JOB_STATUS:-}" != "Failed" ] && [ "${JOB_STATUS:-}" != "Stopped" ]; then

# Derive the wait budget from the configured values (rounded to the nearest
# minute) so the messages stay correct when either setting is changed.
POLL_TIMEOUT_MIN=$(( (POLL_INTERVAL * MAX_POLL_ATTEMPTS + 30) / 60 ))

echo "⏳ Step 3: Waiting for benchmark to complete..."
echo "   Polling every ${POLL_INTERVAL}s (max ${MAX_POLL_ATTEMPTS} attempts = ${POLL_TIMEOUT_MIN} min)"
echo ""

POLL_COUNT=0
JOB_STATUS=""

while [ "${POLL_COUNT}" -lt "${MAX_POLL_ATTEMPTS}" ]; do
    JOB_STATUS=$(aws sagemaker describe-ai-benchmark-job \
        --ai-benchmark-job-name "${BENCHMARK_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query 'AIBenchmarkJobStatus' \
        --output text 2>/dev/null) || {
        echo "⚠️  Failed to describe benchmark job (credentials may have expired)"
        echo "   Re-run to check status manually:"
        echo "   aws sagemaker describe-ai-benchmark-job --ai-benchmark-job-name ${BENCHMARK_JOB_NAME} --region ${AWS_REGION}"
        exit 1
    }

    case "${JOB_STATUS}" in
        Completed)
            echo "✅ Benchmark completed successfully!"
            break
            ;;
        Failed)
            echo "❌ Benchmark job failed"
            break
            ;;
        Stopped)
            echo "⚠️  Benchmark job was stopped"
            break
            ;;
        *)
            # Any other status counts as "still in progress": log it and wait
            POLL_COUNT=$((POLL_COUNT + 1))
            ELAPSED=$((POLL_COUNT * POLL_INTERVAL))
            echo "   $(date +%H:%M:%S) Status: ${JOB_STATUS} (${ELAPSED}s elapsed)"
            sleep "${POLL_INTERVAL}"
            ;;
    esac
done

# Check for timeout: the counter only advances on non-terminal polls, so
# reaching the limit means no terminal state was ever observed.
if [ "${POLL_COUNT}" -ge "${MAX_POLL_ATTEMPTS}" ]; then
    echo ""
    echo "⚠️  Benchmark timed out after ${POLL_TIMEOUT_MIN} minutes (status: ${JOB_STATUS})"
    echo "   The job may still be running. Re-run ./do/benchmark to resume waiting."
    echo "   Or check status manually:"
    echo "   aws sagemaker describe-ai-benchmark-job --ai-benchmark-job-name ${BENCHMARK_JOB_NAME} --region ${AWS_REGION}"
    exit 1
fi

fi  # end of polling conditional

echo ""

# ── Step 4: Display results ───────────────────────────────────────────────────
if [ "${JOB_STATUS}" = "Completed" ]; then
    # Persist results locally to benchmarks/<job-name>/
    #
    # Sets RESULTS_DOWNLOADED=true when at least one of RESULTS_JSONL / RESULTS_FILE
    # is present locally afterwards, false otherwise. Every lookup below is
    # best-effort: the script runs with `set -e` and `pipefail`, so each command
    # substitution that may legitimately come back empty or fail is guarded to
    # keep the script alive long enough to print the diagnostics that follow.
    PROJECT_ROOT="${SCRIPT_DIR}/.."
    LOCAL_RESULTS_DIR="${PROJECT_ROOT}/benchmarks/${BENCHMARK_JOB_NAME}"
    RESULTS_JSONL="${LOCAL_RESULTS_DIR}/output/profile_export.jsonl"
    RESULTS_FILE="${LOCAL_RESULTS_DIR}/output/profile_export_aiperf.json"

    # Check if results already exist locally (idempotency: skip S3 download)
    if [ -f "${RESULTS_JSONL}" ] || [ -f "${RESULTS_FILE}" ]; then
        echo "📥 Step 4: Results already available locally"
        RESULTS_DOWNLOADED=true
    else
        echo "📥 Step 4: Downloading benchmark results..."

        # Ask the job where it wrote its output; fall back to the configured
        # output path when the lookup fails or returns nothing usable.
        RESULTS_S3_PATH=$(aws sagemaker describe-ai-benchmark-job \
            --ai-benchmark-job-name "${BENCHMARK_JOB_NAME}" \
            --region "${AWS_REGION}" \
            --query 'OutputConfig.S3OutputLocation' \
            --output text 2>/dev/null) || RESULTS_S3_PATH=""
        if [ -z "${RESULTS_S3_PATH}" ] || [ "${RESULTS_S3_PATH}" = "None" ]; then
            RESULTS_S3_PATH="${BENCHMARK_S3_OUTPUT_PATH:-}"
        fi

        # Create local benchmarks directory
        mkdir -p "${LOCAL_RESULTS_DIR}"

        # The benchmark service writes results into a subdirectory (e.g., bmk-prod-<job>-<hash>/)
        # under the S3OutputLocation. We use multiple strategies to locate the results file.
        RESULTS_DOWNLOADED=false

        # Ensure RESULTS_S3_PATH has a trailing slash for consistent path joining
        RESULTS_S3_PATH="${RESULTS_S3_PATH%/}/"

        # Strategy 1: Sync the entire output tree locally, then find results
        # This is the most reliable approach — handles any subdirectory structure
        if [[ "${RESULTS_S3_PATH}" != s3://* ]]; then
            echo "   ⚠️  No S3 output location known for this job — skipping download"
        else
            echo "   Syncing results from S3..."
            if aws s3 sync "${RESULTS_S3_PATH}" "${LOCAL_RESULTS_DIR}/" --region "${AWS_REGION}" 2>/dev/null; then
                # Extract any tar.gz archives (benchmark service packages results as output.tar.gz).
                # NUL-delimited iteration keeps paths containing whitespace intact.
                while IFS= read -r -d '' ARCHIVE; do
                    ARCHIVE_DIR=$(dirname "${ARCHIVE}")
                    tar -xzf "${ARCHIVE}" -C "${ARCHIVE_DIR}" 2>/dev/null || true
                done < <(find "${LOCAL_RESULTS_DIR}" -name "*.tar.gz" -type f -print0 2>/dev/null)

                # Look for specific result files (priority: JSONL > aiperf JSON).
                # `|| true`: a non-zero pipeline status here only means "not found".
                _FOUND_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1) || true
                _FOUND_JSON=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export_aiperf.json" -type f 2>/dev/null | head -1) || true

                if [ -n "${_FOUND_JSONL}" ]; then
                    if [ "${_FOUND_JSONL}" != "${RESULTS_JSONL}" ]; then
                        mkdir -p "$(dirname "${RESULTS_JSONL}")"
                        cp "${_FOUND_JSONL}" "${RESULTS_JSONL}"
                    fi
                    RESULTS_DOWNLOADED=true
                fi
                if [ -n "${_FOUND_JSON}" ]; then
                    if [ "${_FOUND_JSON}" != "${RESULTS_FILE}" ]; then
                        mkdir -p "$(dirname "${RESULTS_FILE}")"
                        cp "${_FOUND_JSON}" "${RESULTS_FILE}"
                    fi
                    RESULTS_DOWNLOADED=true
                fi
            fi
        fi

        # Strategy 2: If sync found nothing, try listing and downloading individual files
        # This handles cases where s3 sync silently fails (permissions, empty prefix match)
        if [ "${RESULTS_DOWNLOADED}" = false ] && [[ "${RESULTS_S3_PATH}" == s3://* ]]; then
            echo "   Searching for results files..."
            # Split s3://<bucket>/<prefix> into its two parts
            RESULTS_BUCKET="${RESULTS_S3_PATH#s3://}"
            RESULTS_BUCKET="${RESULTS_BUCKET%%/*}"
            RESULTS_PREFIX="${RESULTS_S3_PATH#"s3://${RESULTS_BUCKET}/"}"

            # List all objects and look for our target files
            _ALL_KEYS=$(aws s3api list-objects-v2 \
                --bucket "${RESULTS_BUCKET}" \
                --prefix "${RESULTS_PREFIX}" \
                --region "${AWS_REGION}" \
                --query 'Contents[].Key' \
                --output text 2>/dev/null | tr '\t' '\n') || _ALL_KEYS=""

            # grep exits 1 on "no match"; with pipefail + set -e that would abort
            # the script, so treat it as an empty result instead.
            _JSONL_KEY=$(printf '%s\n' "${_ALL_KEYS}" | grep "profile_export\.jsonl$" | head -1) || true
            _JSON_KEY=$(printf '%s\n' "${_ALL_KEYS}" | grep "profile_export_aiperf\.json$" | head -1) || true

            if [ -n "${_JSONL_KEY}" ] && [ "${_JSONL_KEY}" != "None" ]; then
                mkdir -p "$(dirname "${RESULTS_JSONL}")"
                if aws s3 cp "s3://${RESULTS_BUCKET}/${_JSONL_KEY}" "${RESULTS_JSONL}" --region "${AWS_REGION}" 2>/dev/null; then
                    RESULTS_DOWNLOADED=true
                fi
            fi
            if [ -n "${_JSON_KEY}" ] && [ "${_JSON_KEY}" != "None" ]; then
                mkdir -p "$(dirname "${RESULTS_FILE}")"
                if aws s3 cp "s3://${RESULTS_BUCKET}/${_JSON_KEY}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null; then
                    RESULTS_DOWNLOADED=true
                fi
            fi
        fi
    fi

    # Show the results summary when result files are available locally; otherwise
    # print debugging hints for locating the output in S3.
    if [ "${RESULTS_DOWNLOADED}" = true ]; then
        echo "✅ Results downloaded"
        echo ""

        # Display summary table
        echo "╔══════════════════════════════════════════════════════════════════╗"
        echo "║              SageMaker AI Benchmark Results                     ║"
        echo "╠══════════════════════════════════════════════════════════════════╣"
        echo "║  Job: ${BENCHMARK_JOB_NAME}"
        echo "║  Endpoint: ${ENDPOINT_NAME}"
        echo "╠══════════════════════════════════════════════════════════════════╣"

        # Parse and display metrics from profile_export.jsonl (rich per-request data)
        if command -v python3 &>/dev/null; then
            # The program is fed on stdin through a quoted heredoc, so the shell
            # expands nothing inside it; the JSONL path arrives as argv[1] instead
            # of being spliced into the Python source (safe for any path characters).
            python3 - "${RESULTS_JSONL}" <<'PYEOF'
import json, sys, os, math

def percentile(sorted_vals, pct):
    # Linear-interpolated percentile over an already sorted list; None if empty
    if not sorted_vals:
        return None
    idx = (pct / 100.0) * (len(sorted_vals) - 1)
    lower = int(math.floor(idx))
    upper = int(math.ceil(idx))
    if lower == upper:
        return sorted_vals[lower]
    frac = idx - lower
    return sorted_vals[lower] * (1 - frac) + sorted_vals[upper] * frac

def fmt(val, suffix=''):
    # Two-decimal formatting, 'N/A' for missing values
    if val is None:
        return 'N/A'
    return f'{val:.2f}{suffix}'

try:
    jsonl_path = sys.argv[1]
    records = []

    # Primary: read profile_export.jsonl (rich per-request data)
    if os.path.exists(jsonl_path):
        with open(jsonl_path) as f:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError:
                        # Skip malformed lines rather than failing the whole report
                        continue

    if records:
        # Extract scalar values from metric dicts {"value": X, "unit": "..."}
        def get_val(metrics, key):
            m = metrics.get(key)
            if isinstance(m, dict):
                return m.get('value')
            return m

        # Collect per-request metrics
        latencies = []
        ttfts = []
        itls = []
        ttsts = []
        output_tokens = []
        start_times = []
        end_times = []

        for rec in records:
            meta = rec.get('metadata', {})
            metrics = rec.get('metrics', {})

            lat = get_val(metrics, 'request_latency')
            if lat is not None:
                latencies.append(lat)

            ttft = get_val(metrics, 'time_to_first_token')
            if ttft is None:
                ttft = get_val(metrics, 'time_to_first_output_token')
            if ttft is not None:
                ttfts.append(ttft)

            itl = get_val(metrics, 'inter_token_latency')
            if itl is not None:
                itls.append(itl)

            ttst = get_val(metrics, 'time_to_second_token')
            if ttst is not None:
                ttsts.append(ttst)

            otc = get_val(metrics, 'output_token_count')
            if otc is not None:
                output_tokens.append(otc)

            # Track timing for throughput calculation
            rs = meta.get('request_start_ns')
            re_ = meta.get('request_end_ns')
            if rs is not None:
                start_times.append(rs)
            if re_ is not None:
                end_times.append(re_)

        n = len(records)

        # Compute system throughput over the span from first start to last end
        if start_times and end_times:
            duration_ns = max(end_times) - min(start_times)
            # Fall back to 1s for a non-positive span to avoid division by zero
            duration_s = duration_ns / 1e9 if duration_ns > 0 else 1.0
            req_throughput = n / duration_s
            total_out_tokens = sum(output_tokens) if output_tokens else 0
            token_throughput = total_out_tokens / duration_s
        else:
            req_throughput = None
            token_throughput = None

        # Compute percentiles
        latencies.sort()
        ttfts.sort()
        itls.sort()
        ttsts.sort()

        # NOTE(review): values are printed as-is under "(ms)" headings; the unit
        # field of each metric is not inspected — confirm the export is in ms.
        print(f'║  Requests:                {n}')
        print(f'║  Request Throughput:      {fmt(req_throughput)} req/s')
        print(f'║  Output Token Throughput: {fmt(token_throughput)} tokens/s')
        print('║')
        print('║  Time to First Token (ms):')
        print(f'║    Avg: {fmt(sum(ttfts)/len(ttfts) if ttfts else None)}  P50: {fmt(percentile(ttfts, 50))}  P90: {fmt(percentile(ttfts, 90))}  P99: {fmt(percentile(ttfts, 99))}')
        print('║')
        print('║  Inter-Token Latency (ms):')
        print(f'║    Avg: {fmt(sum(itls)/len(itls) if itls else None)}  P50: {fmt(percentile(itls, 50))}  P90: {fmt(percentile(itls, 90))}  P99: {fmt(percentile(itls, 99))}')
        print('║')
        print('║  Request Latency (ms):')
        print(f'║    Avg: {fmt(sum(latencies)/len(latencies) if latencies else None)}  P50: {fmt(percentile(latencies, 50))}  P90: {fmt(percentile(latencies, 90))}  P99: {fmt(percentile(latencies, 99))}')
        print('║')
        print('║  Time to Second Token (ms):')
        print(f'║    Avg: {fmt(sum(ttsts)/len(ttsts) if ttsts else None)}  P50: {fmt(percentile(ttsts, 50))}  P90: {fmt(percentile(ttsts, 90))}  P99: {fmt(percentile(ttsts, 99))}')

    else:
        print('║  ⚠️  No JSONL results found — cannot display metrics')
        print(f'║  Expected: {jsonl_path}')

except Exception as e:
    print(f'║  ⚠️  Could not parse results: {e}')
    import traceback
    traceback.print_exc(file=sys.stderr)
PYEOF
        else
            # Fallback: display raw JSON if python3 is not available
            echo "║  (python3 not available — showing raw results)"
            echo "║"
            if [ -f "${RESULTS_JSONL}" ]; then
                head -3 "${RESULTS_JSONL}"
            elif [ -f "${RESULTS_FILE}" ]; then
                head -50 "${RESULTS_FILE}"
            fi
        fi

        echo "╚══════════════════════════════════════════════════════════════════╝"
        echo ""
        echo "📁 Results saved to: benchmarks/${BENCHMARK_JOB_NAME}/"
        echo "☁️  S3 results: ${RESULTS_S3_PATH:-${BENCHMARK_S3_OUTPUT_PATH}}"

        # Write marker for multi-level parent to find this results dir
        echo "${LOCAL_RESULTS_DIR}" > "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" 2>/dev/null || true
    else
        echo "⚠️  Could not download results from S3"
        echo "   The benchmark completed but results could not be located."
        echo ""
        echo "   Debug — list objects at the output path:"
        echo "   aws s3 ls ${RESULTS_S3_PATH} --recursive --region ${AWS_REGION}"
        echo ""
        echo "   Or list via API:"
        # Split s3://<bucket>/<prefix> for the s3api form of the command
        RESULTS_BUCKET=$(echo "${RESULTS_S3_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
        RESULTS_PREFIX=$(echo "${RESULTS_S3_PATH}" | sed "s|s3://${RESULTS_BUCKET}/||")
        echo "   aws s3api list-objects-v2 --bucket ${RESULTS_BUCKET} --prefix ${RESULTS_PREFIX} --region ${AWS_REGION}"
        echo ""
        # Show what's actually there to help debug
        echo "   Objects found at output path:"
        aws s3api list-objects-v2 \
            --bucket "${RESULTS_BUCKET}" \
            --prefix "${RESULTS_PREFIX}" \
            --region "${AWS_REGION}" \
            --query 'Contents[].{Key: Key, Size: Size}' \
            --output table 2>/dev/null || echo "   (could not list objects)"
    fi

    # ── Persist benchmark results to Athena ──────────────────────────────────
    # With CI_BENCHMARK_RESULTS_BUCKET set (from bootstrap config), hand the
    # results to the benchmark writer, which stores them as Parquet in S3 for
    # Athena querying.
    # Not done when running as a child of multi-level mode — the parent
    # orchestrator handles combined persistence (one row per concurrency level,
    # no duplicates).
    if [[ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" && "${RESULTS_DOWNLOADED}" == true && -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]]; then
        echo ""
        echo "📊 Persisting benchmark results to Athena..."

        # Writer input: the JSONL export when present, otherwise the aiperf JSON
        if [[ -f "${RESULTS_JSONL}" ]]; then
            _WRITER_INPUT="${RESULTS_JSONL}"
        else
            _WRITER_INPUT="${RESULTS_FILE}"
        fi

        # Writer invocation, assembled as an argument vector
        _WRITER_CMD=(
            python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write
            --results-file "${_WRITER_INPUT}"
            --config-file "$(dirname "${BASH_SOURCE[0]}")/config"
            --project-name "${PROJECT_NAME}"
            --workload "${BENCHMARK_WORKLOAD:-manual}"
            --concurrency "${BENCHMARK_CONCURRENCY}"
            --bucket "${CI_BENCHMARK_RESULTS_BUCKET}"
            --region "${AWS_REGION:-${REGION}}"
        )

        # Best-effort: a writer failure is reported but does not fail the script
        if "${_WRITER_CMD[@]}"; then
            echo "✅ Benchmark results persisted to S3"
        else
            echo "⚠️  Failed to persist benchmark results to Athena (non-fatal)"
            echo "   Results remain available locally in: benchmarks/${BENCHMARK_JOB_NAME}/"
        fi
    fi
elif [ "${JOB_STATUS}" = "Failed" ]; then
    # Display failure reason
    echo "❌ Step 4: Benchmark job failed"
    # Best-effort lookup: the script runs under `set -e`, so an unguarded failing
    # describe call would abort here, before the debug hint and summary print.
    FAILURE_REASON=$(aws sagemaker describe-ai-benchmark-job \
        --ai-benchmark-job-name "${BENCHMARK_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query 'FailureReason' \
        --output text 2>/dev/null) || FAILURE_REASON=""
    # The CLI's text output renders a missing field as the literal "None"
    if [ -z "${FAILURE_REASON}" ] || [ "${FAILURE_REASON}" = "None" ]; then
        FAILURE_REASON="(not available)"
    fi
    echo "   Reason: ${FAILURE_REASON}"
    echo ""
    echo "   Debug:"
    echo "   aws sagemaker describe-ai-benchmark-job --ai-benchmark-job-name ${BENCHMARK_JOB_NAME} --region ${AWS_REGION}"

elif [ "${JOB_STATUS}" = "Stopped" ]; then
    echo "⚠️  Step 4: Benchmark job was stopped before completion"
    echo "   No results available."
fi

# ── Optional cleanup (--clean flag) ───────────────────────────────────────────
# Remove the workload config and the benchmark job so resources do not pile up.
# Each deletion is attempted independently; a failure only produces a warning.
if [[ "${CLEAN_AFTER}" == true ]]; then
    echo ""
    echo "🧹 Cleaning up benchmark resources (--clean)..."

    # Workload config
    _CLEAN_RC=0
    aws sagemaker delete-ai-workload-config \
        --ai-workload-config-name "${WORKLOAD_CONFIG_NAME}" \
        --region "${AWS_REGION}" 2>/dev/null || _CLEAN_RC=$?
    if [[ "${_CLEAN_RC}" -ne 0 ]]; then
        echo "   ⚠️  Could not delete workload config: ${WORKLOAD_CONFIG_NAME}"
    else
        echo "   ✓ Deleted workload config: ${WORKLOAD_CONFIG_NAME}"
    fi

    # Benchmark job (must be in terminal state)
    _CLEAN_RC=0
    aws sagemaker delete-ai-benchmark-job \
        --ai-benchmark-job-name "${BENCHMARK_JOB_NAME}" \
        --region "${AWS_REGION}" 2>/dev/null || _CLEAN_RC=$?
    if [[ "${_CLEAN_RC}" -ne 0 ]]; then
        echo "   ⚠️  Could not delete benchmark job: ${BENCHMARK_JOB_NAME}"
    else
        echo "   ✓ Deleted benchmark job: ${BENCHMARK_JOB_NAME}"
    fi

    echo "✅ Cleanup complete"
fi

# Final summary: resource names and last observed job status, plus a cleanup
# hint when --clean was not requested.
printf '%s\n' \
    "" \
    "📋 Summary:" \
    "   Workload Config: ${WORKLOAD_CONFIG_NAME}" \
    "   Benchmark Job:   ${BENCHMARK_JOB_NAME}" \
    "   Status:          ${JOB_STATUS}" \
    ""
if [[ "${CLEAN_AFTER}" == false ]]; then
    printf '%s\n' \
        "🧹 To clean up benchmark resources:" \
        "   ./do/clean benchmark"
fi
