#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

# Strict mode: abort on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -e
set -u
set -o pipefail

# Parse flags
#   FORCE_NEW  - set to true by --force
#   FORCE_IC   - set to true by --force-ic
#   IC_TARGET  - IC name taken from --ic <name> or --force-ic <name>; those two
#                forms are only emitted in the realtime-inference template variant
FORCE_NEW=false
FORCE_IC=false
IC_TARGET=""
while [ $# -gt 0 ]; do
    case "$1" in
        --force) FORCE_NEW=true; shift ;;
        --force-ic)
            FORCE_IC=true
            shift
<% if (deploymentTarget === 'realtime-inference') { %>
            # Optional name argument: --force-ic <name>
            # The following word is consumed as the name only when it does not
            # start with "--", so another flag may directly follow --force-ic.
            if [ $# -gt 0 ] && [[ ! "$1" == --* ]]; then
                IC_TARGET="$1"
                shift
            fi
<% } %>
            ;;
<% if (deploymentTarget === 'realtime-inference') { %>
        --ic)
            # ${2:-} keeps `set -u` from aborting when --ic is the last argument.
            # NOTE(review): unlike --force-ic, a value that starts with "--" is
            # accepted here as the IC name.
            if [ -z "${2:-}" ]; then
                echo "❌ --ic requires a name argument"
                echo "   Usage: ./do/deploy --ic <name>"
                exit 1
            fi
            IC_TARGET="$2"
            shift 2
            ;;
<% } %>
        --help|-h)
            # Print the usage text for the rendered deployment target, then exit 0.
<% if (deploymentTarget === 'realtime-inference') { %>
            echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>]"
            echo ""
            echo "Options:"
            echo "  --force            Create a new endpoint and IC, even if one already exists."
            echo "  --force-ic         Recreate ALL inference components on the existing endpoint."
            echo "  --force-ic <name>  Recreate only the named IC on the existing endpoint."
            echo "  --ic <name>        Deploy only the named IC (from do/ic/<name>.conf)."
            echo ""
            echo "Without flags, deploy resumes from the last run."
<% } else { %>
            echo "Usage: ./do/deploy [--force] [--force-ic]"
            echo ""
            echo "Options:"
            echo "  --force      Create a new endpoint, even if one already exists."
            echo "  --force-ic   Recreate the inference component on the existing endpoint."
            echo ""
            echo "Without flags, deploy resumes from the last run."
<% } %>
            exit 0
            ;;
        *)
            # Any other argument is rejected with exit status 1.
            echo "❌ Unknown option: $1"
            echo "   Run ./do/deploy --help for usage."
            exit 1
            ;;
    esac
done

# Source configuration
# SCRIPT_DIR is the absolute path of the directory that contains this script;
# the config file is loaded from that same directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/config"

# Print the settings used for this run. None of the variables below are
# assigned earlier in this script; presumably they are defined by the sourced
# config (confirm). Under `set -u` an undefined one aborts the script here.
echo "🚀 Deploying to AWS"
echo "   Project: ${PROJECT_NAME}"
echo "   Deployment config: ${DEPLOYMENT_CONFIG}"
echo "   Region: ${AWS_REGION}"
echo "   Build target: ${BUILD_TARGET}"
echo "   Deployment target: ${DEPLOYMENT_TARGET}"
# Target-specific settings: exactly one of the sections below is emitted when
# the template is rendered.
<% if (deploymentTarget === 'realtime-inference') { %>
# With ENDPOINT_EXTERNAL=true the endpoint name is shown instead of the instance type.
if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
    echo "   Endpoint: ${ENDPOINT_NAME} (external)"
else
    echo "   Instance type: ${INSTANCE_TYPE}"
fi
<% } else if (deploymentTarget === 'async-inference') { %>
echo "   Instance type: ${INSTANCE_TYPE}"
echo "   S3 output: ${ASYNC_S3_OUTPUT_PATH}"
echo "   SNS success: ${ASYNC_SNS_SUCCESS_TOPIC}"
echo "   SNS error: ${ASYNC_SNS_ERROR_TOPIC}"
<% if (asyncMaxConcurrentInvocations) { %>
echo "   Max concurrent: ${ASYNC_MAX_CONCURRENT_INVOCATIONS}"
<% } %>
<% } else if (deploymentTarget === 'hyperpod-eks') { %>
echo "   HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
echo "   Namespace: ${HYPERPOD_NAMESPACE}"
echo "   Replicas: ${HYPERPOD_REPLICAS}"
<% } else if (deploymentTarget === 'batch-transform') { %>
echo "   Instance type: ${INSTANCE_TYPE}"
echo "   S3 input: ${BATCH_INPUT_PATH}"
echo "   S3 output: ${BATCH_OUTPUT_PATH}"
echo "   Instance count: ${BATCH_INSTANCE_COUNT}"
echo "   Split type: ${BATCH_SPLIT_TYPE}"
echo "   Strategy: ${BATCH_STRATEGY}"
<% } %>

# Check AWS credentials
# A single STS call both proves that the credentials work and yields the
# account ID. The call is guarded explicitly so that a failure always produces
# the guidance below (and exit status 4) instead of a bare `set -e` abort.
echo "🔍 Validating AWS credentials..."
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text 2>/dev/null) || AWS_ACCOUNT_ID=""
if [ -z "${AWS_ACCOUNT_ID}" ]; then
    echo "❌ AWS credentials not configured"
    echo "   Run: aws configure"
    echo "   Or set AWS_ACCOUNT_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables" >/dev/null
    echo "   Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
    exit 4
fi

echo "✅ AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"

# Construct ECR repository URL
ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"

# Image tag to deploy. Defined once so the lookup and every message agree.
IMAGE_TAG="${PROJECT_NAME}-latest"

# ============================================================
# Shared: Verify ECR image exists
# ============================================================
echo "🔍 Verifying ECR image exists..."
if ! aws ecr describe-images \
    --repository-name "${ECR_REPOSITORY_NAME}" \
    --image-ids imageTag="${IMAGE_TAG}" \
    --region "${AWS_REGION}" &> /dev/null; then

    echo "❌ ECR image not found: ${ECR_REPOSITORY}:${IMAGE_TAG}"
    echo ""
    echo "Please build and push your image first:"
    echo "  ./do/submit"
    echo ""
    echo "After the build completes successfully, run this deploy script again."
    exit 4
fi

echo "✅ ECR image found: ${ECR_REPOSITORY}:${IMAGE_TAG}"

# ============================================================
# Shared: Resolve secrets for container environment
# ============================================================
# CONTAINER_ENV_JSON accumulates a comma-separated list of "KEY":"value" JSON
# members (without surrounding braces). It stays empty when no secret is set.
CONTAINER_ENV_JSON=""

# _deploy_append_container_env <key> <value>
#   Appends "<key>":"<value>" to CONTAINER_ENV_JSON, inserting a comma when the
#   fragment already has content. Backslashes and double quotes in <value> are
#   escaped so that a secret containing them still yields valid JSON.
#   Globals: CONTAINER_ENV_JSON (read and written)
_deploy_append_container_env() {
    local key="$1"
    local value="$2"

    # Escape backslashes first so the backslashes added for quotes are not doubled.
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}

    local entry="\"${key}\":\"${value}\""
    if [ -n "${CONTAINER_ENV_JSON}" ]; then
        CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},${entry}"
    else
        CONTAINER_ENV_JSON="${entry}"
    fi
}

# HuggingFace token: a Secrets Manager ARN takes precedence over a literal HF_TOKEN.
if [ -n "${HF_TOKEN_ARN:-}" ]; then
    echo "🔐 Resolving HuggingFace token from Secrets Manager..."
    RESOLVED_HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
        echo "❌ Failed to resolve HuggingFace token from Secrets Manager"
        exit 3
    }
    _deploy_append_container_env "HF_TOKEN" "${RESOLVED_HF_TOKEN}"
elif [ -n "${HF_TOKEN:-}" ]; then
    _deploy_append_container_env "HF_TOKEN" "${HF_TOKEN}"
fi

# NGC API key: same precedence rule (ARN first, then literal NGC_API_KEY).
if [ -n "${NGC_API_KEY_ARN:-}" ]; then
    echo "🔐 Resolving NGC API key from Secrets Manager..."
    RESOLVED_NGC_KEY=$(aws secretsmanager get-secret-value --secret-id "${NGC_API_KEY_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
        echo "❌ Failed to resolve NGC API key from Secrets Manager"
        exit 3
    }
    _deploy_append_container_env "NGC_API_KEY" "${RESOLVED_NGC_KEY}"
elif [ -n "${NGC_API_KEY:-}" ]; then
    _deploy_append_container_env "NGC_API_KEY" "${NGC_API_KEY}"
fi

<% if (deploymentTarget === 'realtime-inference') { %>
# ============================================================
# SageMaker Real-Time Inference Deployment (Inference Components)
# ============================================================

# Load the helper libraries that live next to this script.
source "${SCRIPT_DIR}/lib/secrets.sh"
source "${SCRIPT_DIR}/lib/wait.sh"
source "${SCRIPT_DIR}/lib/endpoint-config.sh"
source "${SCRIPT_DIR}/lib/inference-component.sh"

# An execution role is mandatory: explain how to supply one and stop (exit 3).
if [[ -z "${ROLE_ARN:-}" ]]; then
    printf '%s\n' \
        "❌ Execution role ARN not provided" \
        "" \
        "Usage:" \
        "  export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE" \
        "  ./do/deploy" \
        "" \
        "Or set ROLE_ARN in do/config" \
        "" \
        "The execution role must have permissions for:" \
        "  • SageMaker endpoint and inference component management" \
        "  • ECR image access" \
        "  • S3 access (if using model artifacts)" \
        "  • CloudWatch Logs"
    exit 3
fi

printf '%s\n' "   Using execution role: ${ROLE_ARN}"

# When an IC name was given (--ic <name> or --force-ic <name>), it must map to
# an existing do/ic/<name>.conf; otherwise report the problem and exit 1.
if [[ -n "${IC_TARGET}" ]]; then
    if [[ ! -d "${SCRIPT_DIR}/ic" ]]; then
        printf '%s\n' \
            "❌ IC name specified but no do/ic/ directory found" \
            "   This project does not use multi-IC configuration." \
            "   Remove --ic/--force-ic <name> to deploy using the legacy single-IC path."
        exit 1
    elif [[ ! -f "${SCRIPT_DIR}/ic/${IC_TARGET}.conf" ]]; then
        printf '%s\n' \
            "❌ IC config not found: do/ic/${IC_TARGET}.conf" \
            "" \
            "   Available ICs:"
        # List every IC config that does exist to help the user pick a name.
        for conf in "${SCRIPT_DIR}"/ic/*.conf; do
            if [[ -f "${conf}" ]]; then
                printf '%s\n' "     • $(basename "${conf}" .conf)"
            fi
        done
        printf '%s\n' \
            "" \
            "   Usage: ./do/deploy --ic <name>"
        exit 1
    fi
fi

# Resolve container secrets (HF_TOKEN, NGC_API_KEY)
resolve_secrets

# ============================================================
# Idempotency: check for existing deployment from a previous run
# ============================================================
# SKIP_TO records where the rest of the script should resume:
#   ""             - create everything from scratch
#   wait_endpoint  - endpoint exists but is still Creating/Updating
#   create_ic      - endpoint is InService; (re)create inference component(s)
#   wait_ic        - the stored inference component is still Creating
#
# NOTE(review): the "already live" exits below look only at the single IC
# stored in INFERENCE_COMPONENT_NAME (or at whatever _find_active_ic_on_endpoint
# reports). Whether that can short-circuit a run that should deploy further ICs
# from do/ic/ depends on helpers not visible here — confirm.
SKIP_TO=""

# _report_live_deployment <endpoint_name> <ic_name>
#   Prints the "nothing to do" summary shown when a deployment is already live.
_report_live_deployment() {
    printf '%s\n' \
        "" \
        "📋 Deployment is already live. Nothing to do." \
        "   Endpoint: $1" \
        "   Inference Component: $2" \
        "" \
        "🧪 Test your endpoint:" \
        "   ./do/test" \
        "" \
        "🧹 Clean up when done:" \
        "   ./do/clean endpoint"
}

if [[ "${FORCE_NEW}" == true ]]; then
    echo "🔄 --force: ignoring previous deployment, creating new resources."
elif [[ -z "${ENDPOINT_NAME:-}" ]]; then
    # No endpoint recorded from an earlier run: nothing to resume.
    :
elif [[ "${FORCE_IC}" == true ]]; then
    # --force-ic only makes sense against an endpoint that is InService.
    EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
    if [[ "${EP_STATUS}" != "InService" ]]; then
        echo "⚠️  --force-ic requires an InService endpoint, but ${ENDPOINT_NAME} is: ${EP_STATUS:-not found}"
        echo "   Use --force to create a new endpoint, or wait for the current one."
        exit 4
    fi
    if [[ -n "${IC_TARGET}" ]]; then
        echo "🔄 --force-ic: recreating IC '${IC_TARGET}' on existing endpoint: ${ENDPOINT_NAME}"
    else
        echo "🔄 --force-ic: recreating ALL inference components on existing endpoint: ${ENDPOINT_NAME}"
    fi
    SKIP_TO="create_ic"
else
    echo "🔍 Checking for existing deployment: ${ENDPOINT_NAME}"

    EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")

    case "${EP_STATUS}" in
        InService)
            echo "✅ Endpoint already InService: ${ENDPOINT_NAME}"

            if [[ -n "${INFERENCE_COMPONENT_NAME:-}" ]]; then
                # An IC name is stored in config: act on its current status.
                IC_STATUS=$(_get_ic_status "${INFERENCE_COMPONENT_NAME}")

                if [[ "${IC_STATUS}" == "InService" ]]; then
                    echo "✅ Inference component already InService: ${INFERENCE_COMPONENT_NAME}"
                    _report_live_deployment "${ENDPOINT_NAME}" "${INFERENCE_COMPONENT_NAME}"
                    exit 0
                elif [[ "${IC_STATUS}" == "Creating" ]]; then
                    echo "⏳ Inference component still creating: ${INFERENCE_COMPONENT_NAME}"
                    SKIP_TO="wait_ic"
                    IC_DEPLOYED_NAME="${INFERENCE_COMPONENT_NAME}"
                elif [[ "${IC_STATUS}" == "Failed" ]]; then
                    echo "⚠️  Inference component failed: ${INFERENCE_COMPONENT_NAME}"
                    echo "   Will create a new inference component on the existing endpoint."
                    SKIP_TO="create_ic"
                elif [[ "${ENDPOINT_EXTERNAL:-false}" == "true" ]]; then
                    # Stored IC not found; on an external endpoint never adopt
                    # inference components this project did not create.
                    echo "   Stored IC not found on external endpoint. Will create a new one."
                    SKIP_TO="create_ic"
                else
                    # Stored IC not found: see whether a different IC is running here.
                    LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
                    if [[ -z "${LIVE_IC}" || "${LIVE_IC}" == "None" ]]; then
                        echo "   No existing inference component found on endpoint. Will create one."
                        SKIP_TO="create_ic"
                    else
                        echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
                        echo "   (config had stale reference: ${INFERENCE_COMPONENT_NAME})"
                        _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
                        _report_live_deployment "${ENDPOINT_NAME}" "${LIVE_IC}"
                        exit 0
                    fi
                fi
            elif [[ "${ENDPOINT_EXTERNAL:-false}" == "true" ]]; then
                # No IC name in config and the endpoint is external: never adopt
                # inference components this project did not create.
                echo "   No previous IC deployed by this project. Will create a new one."
                SKIP_TO="create_ic"
            else
                # No IC name in config: check whether one is already running.
                LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
                if [[ -z "${LIVE_IC}" || "${LIVE_IC}" == "None" ]]; then
                    SKIP_TO="create_ic"
                else
                    echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
                    _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
                    _report_live_deployment "${ENDPOINT_NAME}" "${LIVE_IC}"
                    exit 0
                fi
            fi
            ;;
        Creating|Updating)
            echo "⏳ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
            SKIP_TO="wait_endpoint"
            ;;
        Failed)
            # SKIP_TO stays empty, so new resources are created below.
            echo "⚠️  Previous endpoint failed: ${ENDPOINT_NAME}"
            echo "   Creating a new deployment. Clean up the failed endpoint with:"
            echo "   ./do/clean endpoint"
            echo ""
            ;;
        "")
            echo "   Previous endpoint not found (may have been cleaned up). Creating new deployment."
            ;;
        *)
            echo "   Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
            ;;
    esac
fi

# ============================================================
# Step 1: Create endpoint configuration and endpoint (skip if resuming)
# ============================================================
if [[ -z "${SKIP_TO}" ]]; then
    if [[ "${ENDPOINT_EXTERNAL:-false}" != "true" ]]; then
        # Managed endpoint: derive a fresh, timestamped name and persist it
        # before any resource is created.
        TIMESTAMP=$(date +%s)
        ENDPOINT_NAME="${PROJECT_NAME}-endpoint-${TIMESTAMP}"

        _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"

        # Create endpoint configuration via shared helper
        create_endpoint_config

        _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"

        # Record endpoint config in manifest (non-blocking: failures are ignored)
        ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
        ./do/manifest add \
            --type sagemaker-endpoint-config \
            --id "${ENDPOINT_CONFIG_ARN}" \
            --project "${PROJECT_NAME}" \
            --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
            2>/dev/null || true

        # Step 2: Create endpoint
        printf '%s\n' "🚀 Creating endpoint: ${ENDPOINT_NAME}"
        aws sagemaker create-endpoint \
            --endpoint-name "${ENDPOINT_NAME}" \
            --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
            --region "${AWS_REGION}" || {

            printf '%s\n' \
                "❌ Failed to create endpoint" \
                "   Check that:" \
                "   • Your IAM credentials have sagemaker:CreateEndpoint permission" \
                "   • You have sufficient service quota in region: ${AWS_REGION}"
            exit 4
        }

        printf '%s\n' "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"

        # Record endpoint in manifest (non-blocking: failures are ignored)
        ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
        ./do/manifest add \
            --type sagemaker-endpoint \
            --id "${ENDPOINT_ARN}" \
            --project "${PROJECT_NAME}" \
            --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
            2>/dev/null || true
    else
        # External endpoint: nothing is created here; it only has to exist and
        # be InService before inference components can be attached.
        printf '%s\n' \
            "🔗 Using external endpoint: ${ENDPOINT_NAME}" \
            "   Validating endpoint status..."

        EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")

        case "${EP_STATUS}" in
            "")
                printf '%s\n' \
                    "❌ External endpoint not found: ${ENDPOINT_NAME}" \
                    "   The endpoint may have been deleted. Update ENDPOINT_NAME in do/config" \
                    "   or remove ENDPOINT_EXTERNAL=true to create a new endpoint."
                exit 4
                ;;
            InService)
                ;;
            *)
                printf '%s\n' \
                    "❌ External endpoint not InService: ${ENDPOINT_NAME} (status: ${EP_STATUS})" \
                    "   The endpoint must be InService before attaching inference components." \
                    "   Wait for the endpoint to become InService, or update do/config."
                exit 4
                ;;
        esac

        printf '%s\n' "✅ External endpoint is InService: ${ENDPOINT_NAME}"
        # Skip directly to IC creation — no endpoint config, no endpoint creation, no wait
        SKIP_TO="create_ic"
    fi
fi

# ============================================================
# Wait for endpoint (skip if already InService or external)
# ============================================================
case "${SKIP_TO}" in
    ""|wait_endpoint)
        printf '%s\n' \
            "⏳ Waiting for endpoint to reach InService status..." \
            "   This may take a few minutes..." \
            "   If this times out, re-run ./do/deploy to resume."

        wait_endpoint "${ENDPOINT_NAME}"

        printf '%s\n' "✅ Endpoint is InService: ${ENDPOINT_NAME}"
        ;;
esac

# ============================================================
# Step 3: Deploy inference components (skip if resuming from wait_ic)
# ============================================================
if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then

    if [ -d "${SCRIPT_DIR}/ic" ]; then
        # _check_gpu_capacity
        #   Best-effort capacity guardrail: sums IC_GPU_COUNT across all do/ic/*.conf
        #   and compares the total against a hardcoded GPU count for INSTANCE_TYPE.
        #   Globals:  INSTANCE_TYPE (read, optional), SCRIPT_DIR (read)
        #   Outputs:  a warning on stdout when the total exceeds the instance's GPUs
        #   Returns:  always 0 — this check warns, it never blocks a deployment.
        #   The check is skipped when INSTANCE_TYPE is empty or not in the map.
        #   A config whose IC_GPU_COUNT is missing or cannot be parsed as a
        #   non-negative integer is counted as 1 GPU.
        _check_gpu_capacity() {
            # Skip check if no INSTANCE_TYPE (external endpoints)
            if [ -z "${INSTANCE_TYPE:-}" ]; then
                return 0
            fi

            # NOTE: Only do/ic/*.conf files are counted. Adapter ICs (do/adapters/*.conf)
            # share the base IC's GPU resources and have no ComputeResourceRequirements,
            # so they are intentionally excluded from this capacity check.
            #
            # Hardcoded GPU counts for common SageMaker GPU instance types.
            # NOTE(review): these values are maintained by hand; confirm the newer
            # families (e.g. g7e) against the current instance-type documentation.
            local instance_gpus=""
            case "${INSTANCE_TYPE}" in
                ml.g4dn.xlarge)     instance_gpus=1 ;;
                ml.g4dn.12xlarge)   instance_gpus=4 ;;
                ml.g5.xlarge)       instance_gpus=1 ;;
                ml.g5.2xlarge)      instance_gpus=1 ;;
                ml.g5.4xlarge)      instance_gpus=1 ;;
                ml.g5.8xlarge)      instance_gpus=1 ;;
                ml.g5.12xlarge)     instance_gpus=4 ;;
                ml.g5.48xlarge)     instance_gpus=8 ;;
                ml.g6.xlarge)       instance_gpus=1 ;;
                ml.g6.12xlarge)     instance_gpus=4 ;;
                ml.g6.48xlarge)     instance_gpus=8 ;;
                ml.g6e.xlarge)      instance_gpus=1 ;;
                ml.g6e.2xlarge)     instance_gpus=1 ;;
                ml.g6e.4xlarge)     instance_gpus=1 ;;
                ml.g6e.8xlarge)     instance_gpus=1 ;;
                ml.g6e.12xlarge)    instance_gpus=4 ;;
                ml.g6e.48xlarge)    instance_gpus=8 ;;
                ml.g7e.xlarge)      instance_gpus=1 ;;
                ml.g7e.2xlarge)     instance_gpus=1 ;;
                ml.g7e.4xlarge)     instance_gpus=1 ;;
                ml.g7e.8xlarge)     instance_gpus=1 ;;
                ml.g7e.12xlarge)    instance_gpus=4 ;;
                ml.g7e.48xlarge)    instance_gpus=8 ;;
                ml.p3.2xlarge)      instance_gpus=1 ;;
                ml.p3.8xlarge)      instance_gpus=4 ;;
                ml.p3.16xlarge)     instance_gpus=8 ;;
                ml.p4d.24xlarge)    instance_gpus=8 ;;
                ml.p4de.24xlarge)   instance_gpus=8 ;;
                ml.p5.48xlarge)     instance_gpus=8 ;;
                *)                  instance_gpus="" ;;
            esac

            # Skip check if instance type not in map
            if [ -z "${instance_gpus}" ]; then
                return 0
            fi

            # Sum IC_GPU_COUNT across all IC config files
            local total_gpu_requested=0
            local conf ic_gpus
            for conf in "${SCRIPT_DIR}"/ic/*.conf; do
                [ -f "${conf}" ] || continue

                # Take the last assignment in the file (the one that would win if
                # the file were sourced) and drop quotes, whitespace and CRs.
                # The `||` guard keeps `set -e`/pipefail from aborting the script
                # when the file cannot be read.
                ic_gpus=$(sed -n 's/^export IC_GPU_COUNT=//p' "${conf}" 2>/dev/null \
                    | tail -n 1 \
                    | tr -d "\"' \t\r") || ic_gpus=""
                # Drop a trailing inline comment ("2  # two GPUs" -> "2").
                ic_gpus="${ic_gpus%%#*}"

                # Anything that is not a plain non-negative integer falls back
                # to the default of 1 instead of breaking the arithmetic below.
                case "${ic_gpus}" in
                    ''|*[!0-9]*) ic_gpus=1 ;;
                esac

                # 10# forces base 10 so values such as "08" are not read as octal.
                total_gpu_requested=$(( total_gpu_requested + 10#${ic_gpus} ))
            done

            if [ "${total_gpu_requested}" -gt "${instance_gpus}" ]; then
                echo ""
                echo "⚠️  GPU capacity warning: ICs request ${total_gpu_requested} GPUs total, but ${INSTANCE_TYPE} has ${instance_gpus} GPUs."
                echo "   SageMaker will likely reject IC creation if capacity is exceeded."
                echo "   Consider reducing IC_GPU_COUNT values or using a larger instance type."
                echo ""
            fi
        }

        # Run capacity guardrail before deploying ICs
        _check_gpu_capacity

        # _delete_and_wait_ic <ic_name>
        #   Requests deletion of an inference component, then polls its status
        #   every 15 seconds until the status lookup comes back empty (the IC is
        #   gone) or 10 minutes have passed. Waiting avoids a name conflict when
        #   the IC is recreated right afterwards.
        #   A failed delete call and a timeout are both reported but tolerated.
        _delete_and_wait_ic() {
            local ic_name="$1"
            local max_wait=600  # seconds (10 minutes)

            echo "🗑️  Deleting inference component: ${ic_name}"
            aws sagemaker delete-inference-component \
                --inference-component-name "${ic_name}" \
                --region "${AWS_REGION}" 2>/dev/null || {
                echo "   ⚠️  Delete call failed (IC may already be gone). Continuing..."
                return 0
            }

            echo "   Waiting for deletion to complete..."
            local started_at waited state
            started_at=$(date +%s)

            state=$(_get_ic_status "${ic_name}")
            while [ -n "${state}" ]; do
                waited=$(( $(date +%s) - started_at ))
                if [ "${waited}" -ge "${max_wait}" ]; then
                    echo "   ⚠️  Deletion timed out after ${max_wait}s. IC status: ${state}"
                    echo "   Proceeding anyway — SageMaker may reject the new IC if name conflicts."
                    return 0
                fi

                echo "   $(date +%H:%M:%S) Deleting... (${state}, ${waited}s elapsed)"
                sleep 15
                state=$(_get_ic_status "${ic_name}")
            done

            # An empty status means the component can no longer be found.
            echo "   ✅ Inference component deleted: ${ic_name}"
        }

        # _deploy_single_ic <conf_file>
        #   Deploys a single IC with per-IC idempotency:
        #   - If FORCE_IC is true and the config records a deployed IC: delete it,
        #     clear the recorded state, then create a fresh IC
        #   - If IC_DEPLOYED_NAME is recorded and InService → skip
        #   - If IC_DEPLOYED_NAME is recorded and Creating → wait for it
        #   - If IC_DEPLOYED_NAME is recorded and Failed/unknown → create a new IC
        #   - If IC_DEPLOYED_NAME is not recorded → create a new IC
        #   Globals:  FORCE_IC, AWS_REGION, AWS_ACCOUNT_ID, PROJECT_NAME,
        #             ENDPOINT_NAME, INSTANCE_TYPE (read); IC_DEPLOYED_NAME (written
        #             here on the skip/wait paths and read after
        #             create_inference_component has run)
        #   Returns:  0 when the IC is InService, otherwise the status of the step
        #             that failed.
        #   Every critical step is checked explicitly: when this function is called
        #   as an `if` condition, bash ignores `set -e` inside it, so a failing
        #   helper would otherwise be skipped over and the function would report
        #   success.
        _deploy_single_ic() {
            local ic_conf="$1"
            local ic_basename
            ic_basename=$(basename "${ic_conf}" .conf)

            # Read IC_DEPLOYED_NAME straight from the file instead of sourcing it,
            # so nothing else from the IC config leaks into this shell.
            local existing_ic_name=""
            if grep -q "^export IC_DEPLOYED_NAME=" "${ic_conf}" 2>/dev/null; then
                existing_ic_name=$(grep "^export IC_DEPLOYED_NAME=" "${ic_conf}" | sed 's/^export IC_DEPLOYED_NAME="//' | sed 's/"$//')
            fi

            # --force-ic: delete existing IC before recreating
            if [ "${FORCE_IC}" = true ] && [ -n "${existing_ic_name}" ]; then
                echo "🔄 --force-ic: recreating IC '${ic_basename}'"
                _delete_and_wait_ic "${existing_ic_name}" || return $?

                # Clear deployed state from config before recreating
                _update_config_var "IC_DEPLOYED_NAME" "" "${ic_conf}" || return $?
                _update_config_var "IC_DEPLOYED_AT" "" "${ic_conf}" || return $?
                existing_ic_name=""
            fi

            if [ -n "${existing_ic_name}" ]; then
                # IC was previously deployed (and not force-deleted above) —
                # decide based on its current status.
                local ic_status
                ic_status=$(_get_ic_status "${existing_ic_name}")

                case "${ic_status}" in
                    InService)
                        echo "✅ IC '${ic_basename}' already InService: ${existing_ic_name} — skipping"
                        IC_DEPLOYED_NAME="${existing_ic_name}"
                        return 0
                        ;;
                    Creating)
                        echo "⏳ IC '${ic_basename}' is still Creating: ${existing_ic_name} — waiting..."
                        IC_DEPLOYED_NAME="${existing_ic_name}"
                        wait_ic "${IC_DEPLOYED_NAME}" || return $?
                        echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
                        return 0
                        ;;
                    Failed)
                        echo "⚠️  IC '${ic_basename}' previously Failed: ${existing_ic_name} — recreating..."
                        create_inference_component "${ic_conf}" || return $?
                        ;;
                    *)
                        echo "   IC '${ic_basename}' has unknown/missing status for ${existing_ic_name} — creating new..."
                        create_inference_component "${ic_conf}" || return $?
                        ;;
                esac
            else
                # No previous deployment (or it was just force-deleted) — create new IC
                create_inference_component "${ic_conf}" || return $?
            fi

            echo "⏳ Waiting for inference component to reach InService status..."
            echo "   This may take 5-10 minutes..."

            wait_ic "${IC_DEPLOYED_NAME}" || return $?

            echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"

            # Record inference component in manifest (non-blocking: failures are ignored)
            local ic_arn="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
            ./do/manifest add \
                --type sagemaker-inference-component \
                --id "${ic_arn}" \
                --project "${PROJECT_NAME}" \
                --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
                2>/dev/null || true
        }

        if [[ -z "${IC_TARGET}" ]]; then
            # Multi-IC path: walk every IC config file in glob (alphabetical)
            # order and stop at the first one that fails to deploy.
            # NOTE: bash suppresses `set -e` inside a function invoked as an `if`
            # condition, so failure detection here relies on the return status
            # of _deploy_single_ic.
            IC_SUMMARY=""
            IC_DEPLOY_FAILED=false

            for conf in "${SCRIPT_DIR}"/ic/*.conf; do
                if [[ ! -f "${conf}" ]]; then
                    continue
                fi
                local_ic_basename=$(basename "${conf}" .conf)
                printf '%s\n' "" "── Deploying IC: ${local_ic_basename} ──"

                if _deploy_single_ic "${conf}"; then
                    IC_SUMMARY+="   ${local_ic_basename}: ${IC_DEPLOYED_NAME} [InService]\n"
                else
                    echo "❌ IC '${local_ic_basename}' failed to deploy. Stopping."
                    IC_SUMMARY+="   ${local_ic_basename}: FAILED\n"
                    IC_DEPLOY_FAILED=true
                    break
                fi
            done

            # Print the per-IC summary; -e turns the stored "\n" into line breaks.
            printf '%s\n' "" "📋 IC Deployment Summary:"
            echo -e "${IC_SUMMARY}"

            if [[ "${IC_DEPLOY_FAILED}" == true ]]; then
                echo "❌ Deployment stopped due to IC failure. Fix the issue and re-run ./do/deploy to resume."
                exit 4
            fi
        else
            # Single IC path: deploy only the IC named on the command line
            printf '%s\n' "" "── Deploying IC: ${IC_TARGET} ──"
            _deploy_single_ic "${SCRIPT_DIR}/ic/${IC_TARGET}.conf"
        fi
    else
        # Legacy single-IC path: taken when the project has no do/ic/ directory
        create_inference_component_legacy

        printf '%s\n' \
            "⏳ Waiting for inference component to reach InService status..." \
            "   This may take 5-10 minutes..." \
            "   If this times out, re-run ./do/deploy to resume."

        wait_ic "${IC_DEPLOYED_NAME}"

        printf '%s\n' "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"

        # Record inference component in manifest (non-blocking: failures are ignored)
        IC_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
        ./do/manifest add \
            --type sagemaker-inference-component \
            --id "${IC_ARN}" \
            --project "${PROJECT_NAME}" \
            --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
            2>/dev/null || true
    fi

elif [ "${SKIP_TO}" = "wait_ic" ]; then
    # Resuming: the IC was already being created by an earlier run, so the only
    # thing left to do is wait for it.
    printf '%s\n' \
        "⏳ Waiting for inference component to reach InService status..." \
        "   This may take 5-10 minutes..." \
        "   If this times out, re-run ./do/deploy to resume."

    wait_ic "${IC_DEPLOYED_NAME}"

    printf '%s\n' "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
fi

# Final report: what was deployed, followed by suggested next commands.
echo "✅ Deployment complete!"
echo ""
echo "📋 Deployment Details:"
echo "   Endpoint: ${ENDPOINT_NAME}"
# For an external endpoint the endpoint config and instance type are not
# printed, because this project does not manage them.
if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
    echo "   Endpoint Config: (external — not managed by this project)"
    echo "   Region: ${AWS_REGION}"
else
    # ENDPOINT_CONFIG_NAME falls back to "N/A" when it is unset or empty.
    echo "   Endpoint Config: ${ENDPOINT_CONFIG_NAME:-N/A}"
    echo "   Region: ${AWS_REGION}"
    echo "   Instance Type: ${INSTANCE_TYPE}"
fi
echo "   Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
echo ""
# Next-step hints. The benchmark, LoRA adapter and clean-up lines are included
# or omitted when the template is rendered, based on the template variables
# tested in the directives below.
echo "📋 What's next?"
echo "   • Test your endpoint:         ./do/test"
<% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
echo "   • Benchmark performance:      ./do/benchmark"
<% } %>
<% if (typeof enableLora !== 'undefined' && enableLora) { %>
echo "   • Add a LoRA adapter:         ./do/adapter add <name> --weights s3://..."
<% } %>
echo "   • View endpoint status:       ./do/status"
echo "   • Register this deployment:   ./do/register"
echo "   • View logs:                  ./do/logs"
<% if (!(typeof existingEndpointName !== 'undefined' && existingEndpointName)) { %>
echo "   • Clean up when done:         ./do/clean endpoint"
<% } %>

<% } else if (deploymentTarget === 'async-inference') { %>
# ============================================================
# SageMaker Async Inference Deployment (Model-Based)
# SageMaker async inference does NOT support Inference Components.
# Flow: create-model → create-endpoint-config (with AsyncInferenceConfig) → create-endpoint
# ============================================================

# Load the shared helper libraries that live next to this script
. "${SCRIPT_DIR}/lib/secrets.sh"
. "${SCRIPT_DIR}/lib/wait.sh"

# Resolve container secrets (HF_TOKEN, NGC_API_KEY)
resolve_secrets

# The execution role is mandatory: stop with usage guidance when it is missing
if [[ -z "${ROLE_ARN:-}" ]]; then
    printf '%s\n' \
        "❌ Execution role ARN not provided" \
        "" \
        "Usage:" \
        "  export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE" \
        "  ./do/deploy" \
        "" \
        "Or set ROLE_ARN in do/config" \
        "" \
        "The execution role must have permissions for:" \
        "  • SageMaker model and endpoint management" \
        "  • ECR image access"
    echo "  • S3 write access for async output path: ${ASYNC_S3_OUTPUT_PATH}"
    printf '%s\n' \
        "  • SNS publish permissions (optional, for notifications)" \
        "  • CloudWatch Logs"
    exit 3
fi

echo "   Using execution role: ${ROLE_ARN}"

# ============================================================
# Bootstrap async infrastructure (S3 bucket + SNS topics)
# ============================================================

# Extract bucket name from S3 output path (text between "s3://" and the first "/")
ASYNC_S3_BUCKET=$(echo "${ASYNC_S3_OUTPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)

<% if (!asyncS3OutputPath) { %>
# Bootstrap default S3 bucket (check-and-create).
# Only the exit status of head-bucket matters, so stdout AND stderr are discarded:
# recent AWS CLI releases print a JSON document (bucket region) on success, which
# would otherwise clutter the deploy log.
echo "🔍 Checking if S3 bucket exists: ${ASYNC_S3_BUCKET}"
if ! aws s3api head-bucket --bucket "${ASYNC_S3_BUCKET}" --region "${AWS_REGION}" >/dev/null 2>&1; then
    echo "📦 Creating S3 bucket: ${ASYNC_S3_BUCKET}"
    # us-east-1 is created without a LocationConstraint; every other region passes one.
    ASYNC_S3_CREATE_ARGS=(--bucket "${ASYNC_S3_BUCKET}" --region "${AWS_REGION}")
    if [ "${AWS_REGION}" != "us-east-1" ]; then
        ASYNC_S3_CREATE_ARGS+=(--create-bucket-configuration "LocationConstraint=${AWS_REGION}")
    fi
    if ! aws s3api create-bucket "${ASYNC_S3_CREATE_ARGS[@]}"; then
        echo "❌ Failed to create S3 bucket: ${ASYNC_S3_BUCKET}"
        echo ""
        echo "   Check that:"
        echo "   • Your IAM credentials have s3:CreateBucket permission"
        echo "   • The bucket name is not already taken globally"
        exit 4
    fi
    echo "✅ S3 bucket created: ${ASYNC_S3_BUCKET}"
else
    echo "✅ S3 bucket exists: ${ASYNC_S3_BUCKET}"
fi
<% } else { %>
# Custom S3 output path provided — skip bucket creation
echo "✅ Using custom S3 output path: ${ASYNC_S3_OUTPUT_PATH}"
<% } %>

# Extract topic name from SNS success topic ARN (last ':'-separated field)
ASYNC_SNS_SUCCESS_TOPIC_NAME=$(echo "${ASYNC_SNS_SUCCESS_TOPIC}" | awk -F: '{print $NF}')

<% if (!asyncSnsSuccessTopic) { %>
# Bootstrap default SNS success topic (check-and-create).
# get-topic-attributes is used purely as an existence probe, so both stdout and
# stderr are discarded; otherwise the full attribute JSON is dumped into the
# deploy log every time the topic already exists.
echo "🔍 Checking if SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_SUCCESS_TOPIC}" --region "${AWS_REGION}" >/dev/null 2>&1; then
    echo "📦 Creating SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
    if ! aws sns create-topic \
        --name "${ASYNC_SNS_SUCCESS_TOPIC_NAME}" \
        --region "${AWS_REGION}" > /dev/null; then
        echo "❌ Failed to create SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
        echo ""
        echo "   Check that:"
        echo "   • Your IAM credentials have sns:CreateTopic permission"
        exit 4
    fi
    echo "✅ SNS success topic created: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
else
    echo "✅ SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
fi
<% } else { %>
# Custom SNS success topic ARN provided — skip topic creation
echo "✅ Using custom SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC}"
<% } %>

# Record SNS success topic in manifest (non-blocking: failures are ignored).
# Runs for both the bootstrapped and the custom topic.
./do/manifest add \
    --type sns-topic \
    --id "${ASYNC_SNS_SUCCESS_TOPIC}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"topicName\":\"${ASYNC_SNS_SUCCESS_TOPIC_NAME}\",\"purpose\":\"async-success\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true


# Extract topic name from SNS error topic ARN (last ':'-separated field)
ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $NF}')

<% if (!asyncSnsErrorTopic) { %>
# Bootstrap default SNS error topic (check-and-create).
# get-topic-attributes is used purely as an existence probe, so both stdout and
# stderr are discarded; otherwise the full attribute JSON is dumped into the
# deploy log every time the topic already exists.
echo "🔍 Checking if SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_ERROR_TOPIC}" --region "${AWS_REGION}" >/dev/null 2>&1; then
    echo "📦 Creating SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
    if ! aws sns create-topic \
        --name "${ASYNC_SNS_ERROR_TOPIC_NAME}" \
        --region "${AWS_REGION}" > /dev/null; then
        echo "❌ Failed to create SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
        echo ""
        echo "   Check that:"
        echo "   • Your IAM credentials have sns:CreateTopic permission"
        exit 4
    fi
    echo "✅ SNS error topic created: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
else
    echo "✅ SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
fi
<% } else { %>
# Custom SNS error topic ARN provided — skip topic creation
echo "✅ Using custom SNS error topic: ${ASYNC_SNS_ERROR_TOPIC}"
<% } %>

# Record SNS error topic in manifest (non-blocking: failures are ignored).
# Runs for both the bootstrapped and the custom topic.
./do/manifest add \
    --type sns-topic \
    --id "${ASYNC_SNS_ERROR_TOPIC}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"topicName\":\"${ASYNC_SNS_ERROR_TOPIC_NAME}\",\"purpose\":\"async-error\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true


# ============================================================
# Create async endpoint (classic model-based flow)
# SageMaker async inference does NOT support Inference Components.
# Flow: create-model → create-endpoint-config (with AsyncInferenceConfig) → create-endpoint
# ============================================================

# ============================================================
# Idempotency: decide whether a previous run's endpoint can be reused
# SKIP_TO stays empty when new resources must be created, and is set to
# "wait_endpoint" when an endpoint from an earlier run is still coming up.
# ============================================================
SKIP_TO=""

if [[ "${FORCE_NEW}" == true ]]; then
    echo "🔄 --force: ignoring previous deployment, creating new resources."
elif [[ -n "${ENDPOINT_NAME:-}" ]]; then
    echo "🔍 Checking for existing deployment: ${ENDPOINT_NAME}"

    EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")

    if [[ "${EP_STATUS}" == "InService" ]]; then
        # Already live: report and stop successfully without touching anything
        echo "✅ Async endpoint already InService: ${ENDPOINT_NAME}"
        echo ""
        echo "📋 Deployment is already live. Nothing to do."
        echo "   Endpoint: ${ENDPOINT_NAME}"
        echo ""
        echo "🧪 Test your async endpoint:"
        echo "   ./do/test"
        echo ""
        echo "🧹 Clean up when done:"
        echo "   ./do/clean endpoint"
        exit 0
    elif [[ "${EP_STATUS}" == "Creating" || "${EP_STATUS}" == "Updating" ]]; then
        # Still in flight: skip creation and go straight to the wait step
        echo "⏳ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
        SKIP_TO="wait_endpoint"
    elif [[ "${EP_STATUS}" == "Failed" ]]; then
        echo "⚠️  Previous endpoint failed: ${ENDPOINT_NAME}"
        echo "   Creating a new deployment. Clean up the failed endpoint with:"
        echo "   ./do/clean endpoint"
        echo ""
    elif [[ -z "${EP_STATUS}" ]]; then
        echo "   Previous endpoint not found (may have been cleaned up). Creating new deployment."
    else
        echo "   Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
    fi
fi

# ============================================================
# Create async resources (skip if resuming from wait)
#
# Runs only when SKIP_TO is empty. Creates, in order: a SageMaker model, an
# endpoint configuration carrying the AsyncInferenceConfig, and the endpoint.
# Any AWS CLI failure prints troubleshooting hints and exits with status 4.
# Each created resource is also recorded via ./do/manifest on a best-effort
# basis (errors from the manifest tool are suppressed).
# ============================================================
if [ -z "${SKIP_TO}" ]; then
    # A single epoch timestamp is shared by all three resource names of this run
    TIMESTAMP=$(date +%s)
    MODEL_NAME_SM="${PROJECT_NAME}-async-model-${TIMESTAMP}"
    ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-async-epc-${TIMESTAMP}"
    ENDPOINT_NAME="${PROJECT_NAME}-async-ep-${TIMESTAMP}"

    # Persist the generated names before any resource is created.
    # NOTE(review): presumably these are read back on the next run (the
    # idempotency check reads ENDPOINT_NAME) — confirm where config is loaded.
    _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
    _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
    _update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"

    # Step 1: Create SageMaker model
    # Build primary container spec
    # The JSON object is left open here and closed after the optional Environment member.
    PRIMARY_CONTAINER="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\""
    # CONTAINER_ENV_JSON is spliced in as the inner members of the Environment object.
    # NOTE(review): assumed to be set (possibly empty) before this point, likely by
    # resolve_secrets — under `set -u` an unset value would abort here; confirm.
    if [ -n "${CONTAINER_ENV_JSON}" ]; then
        PRIMARY_CONTAINER="${PRIMARY_CONTAINER},\"Environment\":{${CONTAINER_ENV_JSON}}"
    fi
    PRIMARY_CONTAINER="${PRIMARY_CONTAINER}}"

    echo "📦 Creating SageMaker model: ${MODEL_NAME_SM}"
    if ! aws sagemaker create-model \
        --model-name "${MODEL_NAME_SM}" \
        --primary-container "${PRIMARY_CONTAINER}" \
        --execution-role-arn "${ROLE_ARN}" \
        --region "${AWS_REGION}"; then

        echo "❌ Failed to create SageMaker model"
        echo "   Check that:"
        echo "   • The execution role ARN is valid"
        echo "   • The ECR image exists and is accessible"
        echo "   • The IAM role has ecr:GetDownloadUrlForLayer permission"
        exit 4
    fi

    echo "✅ SageMaker model created: ${MODEL_NAME_SM}"

    # Record model in manifest (non-blocking)
    # The ARN is assembled locally from region, account and name rather than read from the CLI output.
    MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
    ./do/manifest add \
        --type sagemaker-model \
        --id "${MODEL_ARN}" \
        --project "${PROJECT_NAME}" \
        --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
        2>/dev/null || true

    # Build production variant JSON (classic: includes ModelName, no execution-role-arn on endpoint config)
    # Single variant named AllTraffic with one initial instance; array/object closed further below.
    VARIANT_JSON="[{\"VariantName\":\"AllTraffic\",\"ModelName\":\"${MODEL_NAME_SM}\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"

    # Append InferenceAmiVersion if configured
    if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
        VARIANT_JSON="${VARIANT_JSON},\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
        echo "   AMI version: ${INFERENCE_AMI_VERSION}"
    fi

    VARIANT_JSON="${VARIANT_JSON}}]"

    # Build AsyncInferenceConfig JSON
    # OutputConfig (S3 path + success/error SNS topics) is always present; ClientConfig is optional.
    ASYNC_CONFIG="{\"OutputConfig\":{\"S3OutputPath\":\"${ASYNC_S3_OUTPUT_PATH}\",\"NotificationConfig\":{\"SuccessTopic\":\"${ASYNC_SNS_SUCCESS_TOPIC}\",\"ErrorTopic\":\"${ASYNC_SNS_ERROR_TOPIC}\"}}"

    # The concurrency limit is emitted unquoted, i.e. as a JSON number
    if [ -n "${ASYNC_MAX_CONCURRENT_INVOCATIONS:-}" ]; then
        ASYNC_CONFIG="${ASYNC_CONFIG},\"ClientConfig\":{\"MaxConcurrentInvocationsPerInstance\":${ASYNC_MAX_CONCURRENT_INVOCATIONS}}"
    fi

    ASYNC_CONFIG="${ASYNC_CONFIG}}"

    # Step 2: Create endpoint configuration with AsyncInferenceConfig (no --execution-role-arn)
    echo "⚙️  Creating async endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
    if ! aws sagemaker create-endpoint-config \
        --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
        --production-variants "${VARIANT_JSON}" \
        --async-inference-config "${ASYNC_CONFIG}" \
        --region "${AWS_REGION}"; then

        echo "❌ Failed to create async endpoint configuration"
        echo "   Check that:"
        echo "   • The S3 output path is accessible: ${ASYNC_S3_OUTPUT_PATH}"
        echo "   • The IAM role has s3:PutObject permission on the output path"
        echo "   • The instance type is valid: ${INSTANCE_TYPE}"
        echo "   • The instance type is available in region: ${AWS_REGION}"
        echo "   • You have sufficient service quota for the instance type"
        exit 4
    fi

    echo "✅ Async endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"

    # Record endpoint config in manifest (non-blocking)
    ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
    ./do/manifest add \
        --type sagemaker-endpoint-config \
        --id "${ENDPOINT_CONFIG_ARN}" \
        --project "${PROJECT_NAME}" \
        --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
        2>/dev/null || true

    # Step 3: Create endpoint
    # This only initiates creation; reaching InService is awaited by the wait step after this block.
    echo "🚀 Creating async endpoint: ${ENDPOINT_NAME}"
    if ! aws sagemaker create-endpoint \
        --endpoint-name "${ENDPOINT_NAME}" \
        --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
        --region "${AWS_REGION}"; then

        echo "❌ Failed to create async endpoint"
        echo "   Check that:"
        echo "   • Your IAM credentials have sagemaker:CreateEndpoint permission"
        echo "   • You have sufficient service quota in region: ${AWS_REGION}"
        exit 4
    fi

    echo "✅ Async endpoint creation initiated: ${ENDPOINT_NAME}"

    # Record endpoint in manifest (non-blocking)
    ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
    ./do/manifest add \
        --type sagemaker-endpoint \
        --id "${ENDPOINT_ARN}" \
        --project "${PROJECT_NAME}" \
        --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
        2>/dev/null || true
fi

# ============================================================
# Block until the endpoint is InService
# Applies both to a freshly created endpoint (SKIP_TO empty) and to one
# resumed from an earlier run (SKIP_TO=wait_endpoint).
# ============================================================
case "${SKIP_TO}" in
    "" | "wait_endpoint")
        echo "⏳ Waiting for async endpoint to reach InService status..."
        echo "   This may take several minutes..."
        echo "   If this times out, re-run ./do/deploy to resume."

        wait_endpoint "${ENDPOINT_NAME}"
        ;;
esac

# Final report for the async flow.
#
# MODEL_NAME_SM is assigned only when resources are created in this run. On the
# resume path (SKIP_TO=wait_endpoint) it is never set, and with `set -u` the bare
# expansion aborted the script right after a successful wait. Fall back to
# SAGEMAKER_MODEL_NAME (the name persisted at creation time — presumably loaded
# from do/config on later runs), then to "N/A". ENDPOINT_CONFIG_NAME gets the
# same defensive default for the same reason.
echo "✅ Async deployment complete!"
echo ""
echo "📋 Deployment Details:"
echo "   Endpoint: ${ENDPOINT_NAME}"
echo "   Endpoint Config: ${ENDPOINT_CONFIG_NAME:-N/A}"
echo "   Model: ${MODEL_NAME_SM:-${SAGEMAKER_MODEL_NAME:-N/A}}"
echo "   Region: ${AWS_REGION}"
echo "   Instance Type: ${INSTANCE_TYPE}"
echo "   Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
echo "   S3 Output: ${ASYNC_S3_OUTPUT_PATH}"
echo "   SNS Success: ${ASYNC_SNS_SUCCESS_TOPIC}"
echo "   SNS Error: ${ASYNC_SNS_ERROR_TOPIC}"
echo ""
echo "📋 What's next?"
echo "   • Test your async endpoint:   ./do/test"
echo "   • Check async output:         aws s3 ls ${ASYNC_S3_OUTPUT_PATH}"
<% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
echo "   • Benchmark performance:      ./do/benchmark"
<% } %>
echo "   • Register this deployment:   ./do/register"
echo "   • View logs:                  ./do/logs"
echo "   • Clean up when done:         ./do/clean endpoint"

<% } else if (deploymentTarget === 'hyperpod-eks') { %>
# ============================================================
# HyperPod EKS Deployment
# ============================================================

# Resolve the EKS cluster behind the HyperPod cluster; kubectl is configured against it later
echo "🔑 Configuring kubectl for HyperPod cluster..."
KUBECONFIG_PATH="${HOME}/.kube/hyperpod-${HYPERPOD_CLUSTER_NAME}"

# Step 1: Ask SageMaker for the ARN of the underlying EKS cluster.
# stderr is folded into the captured value so a failure message can be echoed back.
if ! EKS_CLUSTER_ARN=$(aws sagemaker describe-cluster \
    --cluster-name "${HYPERPOD_CLUSTER_NAME}" \
    --region "${AWS_REGION}" \
    --query "Orchestrator.Eks.ClusterArn" \
    --output text 2>&1); then
    echo "❌ Failed to describe HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
    echo ""
    echo "   Error details:"
    echo "   ${EKS_CLUSTER_ARN}"
    echo ""
    echo "   Check that:"
    echo "   • The cluster name is correct"
    echo "   • The cluster exists in region: ${AWS_REGION}"
    echo "   • Your IAM user/role has permission to access the cluster"
    echo ""
    echo "   Required IAM permissions:"
    echo "   • sagemaker:DescribeCluster"
    echo "   • eks:DescribeCluster"
    exit 4
fi

# Step 2: The EKS cluster name is whatever follows the last '/' of the ARN
EKS_CLUSTER_NAME="${EKS_CLUSTER_ARN##*/}"
echo "   HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
echo "   EKS cluster: ${EKS_CLUSTER_NAME}"

# Step 3: Write a dedicated kubeconfig for the EKS cluster
aws eks update-kubeconfig \
    --name "${EKS_CLUSTER_NAME}" \
    --region "${AWS_REGION}" \
    --kubeconfig "${KUBECONFIG_PATH}" 2>&1 || {
    echo "❌ Failed to configure kubectl for EKS cluster: ${EKS_CLUSTER_NAME}"
    echo ""
    echo "   Required IAM permissions:"
    echo "   • eks:DescribeCluster"
    echo "   • eks:AccessKubernetesApi"
    exit 4
}

export KUBECONFIG="${KUBECONFIG_PATH}"
echo "✅ Kubeconfig saved to: ${KUBECONFIG_PATH}"

# Make sure the API server is actually reachable before applying anything
echo "🔍 Verifying cluster connectivity..."
kubectl cluster-info > /dev/null 2>&1 || {
    echo "❌ Cannot connect to HyperPod cluster"
    echo ""
    echo "   Check that:"
    echo "   • The cluster is in 'InService' status"
    echo "   • Your network can reach the cluster API server"
    echo "   • Your IAM credentials are valid"
    exit 4
}
echo "✅ Connected to HyperPod cluster"

# Ensure the target namespace exists: render it client-side and apply it.
# A failure here only produces a warning; the deploy carries on.
echo "📁 Ensuring namespace exists: ${HYPERPOD_NAMESPACE}"
kubectl create namespace "${HYPERPOD_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>&1 \
    || echo "⚠️  Warning: Could not create/verify namespace"

# Apply Kubernetes manifests
echo "📄 Applying Kubernetes manifests from hyperpod/..."

# Substitute shell variables (e.g. ${AWS_ACCOUNT_ID}) in manifests before applying
export AWS_ACCOUNT_ID
export ECR_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}"

#######################################
# Render every hyperpod/*.yaml through envsubst and pipe it to `kubectl apply`.
# Files whose rendered text has no line starting with "kind:" are skipped.
# Globals:
#   HYPERPOD_NAMESPACE  (read)    namespace passed to kubectl apply
#   APPLY_OUTPUT        (written) kubectl output of each applied file, newline-separated
#   APPLY_EXIT_CODE     (written) status of the most recent failed apply, 0 if none failed
# Arguments: none
#######################################
_apply_hyperpod_manifests() {
    local manifest rendered file_output
    APPLY_OUTPUT=""
    APPLY_EXIT_CODE=0
    for manifest in hyperpod/*.yaml; do
        rendered=$(envsubst < "${manifest}")
        # Skip files that contain no Kubernetes objects (e.g. comment-only PVC stubs).
        # Here-strings replace `echo | grep -q`: with pipefail, grep -q exiting early
        # can make the pipeline report a SIGPIPE failure and wrongly skip a manifest.
        if grep -q '^kind:' <<<"${rendered}"; then
            file_output=$(kubectl apply -n "${HYPERPOD_NAMESPACE}" -f - <<<"${rendered}" 2>&1) || {
                APPLY_EXIT_CODE=$?
            }
            # Join with a real newline: a literal "\n" inside double quotes is never
            # expanded by the plain `echo` that later prints this text.
            APPLY_OUTPUT="${APPLY_OUTPUT}${file_output}"$'\n'
        fi
    done
}

_apply_hyperpod_manifests

# Any failed apply is fatal: show the collected kubectl output plus troubleshooting
# hints, then exit with the recorded kubectl status.
if (( APPLY_EXIT_CODE != 0 )); then
    printf '%s\n' \
        "" \
        "❌ Failed to apply Kubernetes manifests" \
        "" \
        "   Error details:"
    echo "   ${APPLY_OUTPUT}"
    printf '%s\n' \
        "" \
        "   Common issues:" \
        "   • Insufficient node capacity - check available GPU nodes" \
        "   • Resource requests exceed node capacity"
    echo "   • RBAC permissions - ensure you have permission to create resources in namespace '${HYPERPOD_NAMESPACE}'"
    echo "   • Invalid manifest syntax"
<% if (fsxVolumeHandle) { %>
    echo "   • PVC creation failure - verify the FSx CSI driver is installed on the cluster"
    echo "     kubectl get csidriver -o name | grep fsx"
<% } %>
    printf '%s\n' \
        "" \
        "   Debug commands:" \
        "   kubectl get nodes -o wide" \
        "   kubectl describe nodes"
    echo "   kubectl get events -n ${HYPERPOD_NAMESPACE}"
    exit "${APPLY_EXIT_CODE}"
fi

echo "✅ Kubernetes manifests applied"

# Record the k8s deployment and the k8s service in the manifest (non-blocking).
# Both entries share the same id and metadata; only the type and the name key
# (deploymentName / serviceName) differ.
for _k8s_kind in deployment service; do
    ./do/manifest add \
        --type "k8s-${_k8s_kind}" \
        --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
        --project "${PROJECT_NAME}" \
        --meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"${_k8s_kind}Name\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
        2>/dev/null || true
done

# Wait for deployment to be ready
# DEPLOY_TIMEOUT (seconds) may be overridden from the environment; default 1200.
DEPLOY_TIMEOUT=${DEPLOY_TIMEOUT:-1200}
echo "⏳ Waiting for deployment to be ready (timeout: ${DEPLOY_TIMEOUT}s)..."
echo "   This may take several minutes for GPU workloads..."
echo ""

# Poll pod status every 30s while rollout is in progress.
# Runs as a background subshell that is killed once the rollout finishes.
(
    while true; do
        sleep 30
        # `|| true`: errexit/pipefail are inherited by this subshell, so a transient
        # kubectl failure would otherwise silently terminate the poller.
        POD_STATUS=$(kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l "app=${PROJECT_NAME}" \
            --no-headers 2>/dev/null | head -5) || true
        if [ -n "${POD_STATUS}" ]; then
            echo "   📊 $(date +%H:%M:%S) Pod status:"
            echo "${POD_STATUS}" | while read -r line; do echo "      ${line}"; done
        fi
    done
) &
STATUS_PID=$!
# Double quotes are intentional: the PID is baked into the trap at definition time.
trap "kill ${STATUS_PID} 2>/dev/null; wait ${STATUS_PID} 2>/dev/null" EXIT

ROLLOUT_OUTPUT=$(kubectl rollout status "deployment/${PROJECT_NAME}" -n "${HYPERPOD_NAMESPACE}" --timeout="${DEPLOY_TIMEOUT}s" 2>&1) || {
    ROLLOUT_EXIT_CODE=$?
    # Best-effort stop of the poller; a failing kill must not abort (set -e)
    # before the diagnostics below are printed.
    kill "${STATUS_PID}" 2>/dev/null || true
    echo ""
    echo "❌ Deployment failed to become ready within timeout"
    echo ""
    echo "   Error details:"
    echo "   ${ROLLOUT_OUTPUT}"
    echo ""
    echo "   Current pod state:"
    # Informational only: keep going to the debug hints even if kubectl fails
    kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l "app=${PROJECT_NAME}" -o wide 2>/dev/null || true
    echo ""
    echo "   Debug commands:"
    echo "   kubectl describe pods -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
    echo "   kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME} --tail=100"
    echo ""
    echo "   Common issues:"
    echo "   • Image pull errors - check ECR permissions"
    echo "   • Resource scheduling - insufficient GPU nodes"
    echo "   • Container crash - check application logs"
<% if (fsxVolumeHandle) { %>
    echo "   • PVC binding errors - verify FSx CSI driver is installed on the cluster"
    echo "     kubectl get pvc -n ${HYPERPOD_NAMESPACE}"
    echo "     kubectl describe pvc -n ${HYPERPOD_NAMESPACE}"
    echo "     kubectl get csidriver -o name | grep fsx"
<% } %>
    exit "${ROLLOUT_EXIT_CODE}"
}

# Rollout succeeded: stop the poller.
# `wait` returns the status of the process it reaps — 143 for one killed by
# SIGTERM — so without `|| true` errexit aborted the script here, on the
# success path, before the completion summary was printed.
kill "${STATUS_PID}" 2>/dev/null || true
wait "${STATUS_PID}" 2>/dev/null || true

# Closing report for the HyperPod EKS flow: what was deployed, then follow-up commands.
printf '%s\n' "✅ HyperPod EKS deployment complete!" "" "📋 Deployment Details:"
printf '%s\n' "   Cluster: ${HYPERPOD_CLUSTER_NAME}"
printf '%s\n' "   Namespace: ${HYPERPOD_NAMESPACE}"
printf '%s\n' "   Deployment: ${PROJECT_NAME}"
printf '%s\n' "   Replicas: ${HYPERPOD_REPLICAS}"
printf '%s\n' "   Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
printf '%s\n' \
    "" \
    "📋 What's next?" \
    "   • Test your deployment:       ./do/test"
printf '%s\n' "   • Check pod status:           kubectl get pods -n ${HYPERPOD_NAMESPACE}"
printf '%s\n' "   • View pod logs:              kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
<% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
printf '%s\n' "   • Benchmark performance:      ./do/benchmark"
<% } %>
printf '%s\n' \
    "   • Register this deployment:   ./do/register" \
    "   • View logs:                  ./do/logs" \
    "   • Clean up when done:         ./do/clean hyperpod"

# Write kubeconfig path to config so other scripts can use it (idempotent)
#######################################
# Set `export NAME="VALUE"` in ${SCRIPT_DIR}/config, idempotently.
# An existing `export NAME=` line is rewritten in place; otherwise a blank line
# and the new export line are appended.
# Globals:
#   SCRIPT_DIR (read) directory containing the config file
# Arguments:
#   $1 - variable name
#   $2 - value to store (written between double quotes)
#######################################
_update_config_var() {
    local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
    local escaped_value
    if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
        # Escape the characters that are special in a sed replacement — backslash,
        # '&' (whole match) and the '|' delimiter — so values containing them are
        # stored literally instead of breaking or corrupting the substitution.
        escaped_value=$(printf '%s' "${var_value}" | sed -e 's/[\\&|]/\\&/g')
        sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${escaped_value}\"|" "${config_file}"
        rm -f "${config_file}.bak"
    else
        echo "" >> "${config_file}"
        echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
    fi
}

# Store the kubeconfig path as KUBECONFIG in ${SCRIPT_DIR}/config
_update_config_var "KUBECONFIG" "${KUBECONFIG_PATH}"

<% } else if (deploymentTarget === 'batch-transform') { %>
# ============================================================
# SageMaker Batch Transform Deployment
# Flow: create-model → create-transform-job → poll until completion
# ============================================================

# Load the shared helper libraries that live next to this script
. "${SCRIPT_DIR}/lib/secrets.sh"
. "${SCRIPT_DIR}/lib/wait.sh"

# Resolve container secrets (HF_TOKEN, NGC_API_KEY)
resolve_secrets

# The execution role is mandatory: stop with usage guidance when it is missing
if [[ -z "${ROLE_ARN:-}" ]]; then
    printf '%s\n' \
        "❌ Execution role ARN not provided" \
        "" \
        "Usage:" \
        "  export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE" \
        "  ./do/deploy" \
        "" \
        "Or set ROLE_ARN in do/config" \
        "" \
        "The execution role must have permissions for:" \
        "  • SageMaker model and transform job management" \
        "  • ECR image access"
    echo "  • S3 read access for input path: ${BATCH_INPUT_PATH}"
    echo "  • S3 write access for output path: ${BATCH_OUTPUT_PATH}"
    echo "  • CloudWatch Logs"
    exit 3
fi

echo "   Using execution role: ${ROLE_ARN}"

# Validate S3 input path: must be set and must be an s3:// URI
case "${BATCH_INPUT_PATH:-}" in
    "")
        echo "❌ S3 input path not provided"
        echo ""
        echo "Set BATCH_INPUT_PATH in do/config or provide via CLI:"
        echo "  export BATCH_INPUT_PATH=s3://my-bucket/input/"
        echo "  ./do/deploy"
        exit 3
        ;;
    s3://*)
        ;;
    *)
        echo "❌ S3 input path must start with s3://"
        echo "   Current value: ${BATCH_INPUT_PATH}"
        echo "   Example: s3://my-bucket/input/"
        exit 3
        ;;
esac

# Validate S3 output path: same two rules as the input path
case "${BATCH_OUTPUT_PATH:-}" in
    "")
        echo "❌ S3 output path not provided"
        echo ""
        echo "Set BATCH_OUTPUT_PATH in do/config or provide via CLI:"
        echo "  export BATCH_OUTPUT_PATH=s3://my-bucket/output/"
        echo "  ./do/deploy"
        exit 3
        ;;
    s3://*)
        ;;
    *)
        echo "❌ S3 output path must start with s3://"
        echo "   Current value: ${BATCH_OUTPUT_PATH}"
        echo "   Example: s3://my-bucket/output/"
        exit 3
        ;;
esac

# ============================================================
# Bootstrap S3 buckets for batch transform
# ============================================================

# Extract bucket names from S3 paths (text between "s3://" and the first "/")
BATCH_INPUT_BUCKET=$(echo "${BATCH_INPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
BATCH_OUTPUT_BUCKET=$(echo "${BATCH_OUTPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)

<% if (!batchInputPath) { %>
# Bootstrap default S3 input bucket (check-and-create).
# Only the exit status of head-bucket matters, so stdout AND stderr are discarded:
# recent AWS CLI releases print a JSON document (bucket region) on success, which
# would otherwise clutter the deploy log.
echo "🔍 Checking if S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
if ! aws s3api head-bucket --bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}" >/dev/null 2>&1; then
    echo "📦 Creating S3 input bucket: ${BATCH_INPUT_BUCKET}"
    # us-east-1 is created without a LocationConstraint; every other region passes one.
    BATCH_INPUT_CREATE_ARGS=(--bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}")
    if [ "${AWS_REGION}" != "us-east-1" ]; then
        BATCH_INPUT_CREATE_ARGS+=(--create-bucket-configuration "LocationConstraint=${AWS_REGION}")
    fi
    if ! aws s3api create-bucket "${BATCH_INPUT_CREATE_ARGS[@]}"; then
        echo "❌ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
        echo ""
        echo "   Check that:"
        echo "   • Your IAM credentials have s3:CreateBucket permission"
        echo "   • The bucket name is not already taken globally"
        exit 4
    fi
    echo "✅ S3 input bucket created: ${BATCH_INPUT_BUCKET}"
else
    echo "✅ S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
fi

# Upload sample input file if the input prefix is empty
EXISTING_OBJECTS=$(aws s3 ls "${BATCH_INPUT_PATH}" --region "${AWS_REGION}" 2>/dev/null | head -1 || true)
if [ -z "${EXISTING_OBJECTS}" ]; then
    # Normalise the trailing slash so a path given as "s3://bucket/input" yields
    # ".../input/sample.jsonl" rather than ".../inputsample.jsonl".
    SAMPLE_INPUT_URI="${BATCH_INPUT_PATH%/}/sample.jsonl"
    echo "📄 Uploading sample input file to ${BATCH_INPUT_PATH}"
    # The sample payload is chosen at template-render time; a single upload follows.
<% if (framework === 'transformers' && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
    SAMPLE_INPUT_PAYLOAD='{"model": "<%= modelName %>", "messages": [{"role": "user", "content": "What is machine learning?"}], "max_tokens": 50}'
<% } else if (framework === 'transformers') { %>
    SAMPLE_INPUT_PAYLOAD='{"inputs": "What is machine learning?", "parameters": {"max_new_tokens": 50}}'
<% } else if (framework === 'diffusors') { %>
    SAMPLE_INPUT_PAYLOAD='{"prompt": "A white cat", "n": 1, "size": "512x512"}'
<% } else { %>
    SAMPLE_INPUT_PAYLOAD='{"instances": [[1.0, 2.0, 3.0, 4.0]]}'
<% } %>
    printf '%s\n' "${SAMPLE_INPUT_PAYLOAD}" | aws s3 cp - "${SAMPLE_INPUT_URI}" --region "${AWS_REGION}"
    echo "✅ Sample input uploaded: ${SAMPLE_INPUT_URI}"
    echo "   ⚠️  Replace this with your actual input data before running production jobs"
fi
<% } else { %>
# Custom S3 input path provided — skip bucket creation
echo "✅ Using custom S3 input path: ${BATCH_INPUT_PATH}"
<% } %>

<% if (!batchOutputPath) { %>
# Bootstrap default S3 output bucket (check-and-create, may be same as input).
# When both paths share one bucket, the input bootstrap has already handled it.
if [ "${BATCH_OUTPUT_BUCKET}" != "${BATCH_INPUT_BUCKET}" ]; then
    echo "🔍 Checking if S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
    # Only the exit status of head-bucket matters, so stdout AND stderr are discarded:
    # recent AWS CLI releases print a JSON document (bucket region) on success.
    if ! aws s3api head-bucket --bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}" >/dev/null 2>&1; then
        echo "📦 Creating S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
        # us-east-1 is created without a LocationConstraint; every other region passes one.
        BATCH_OUTPUT_CREATE_ARGS=(--bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}")
        if [ "${AWS_REGION}" != "us-east-1" ]; then
            BATCH_OUTPUT_CREATE_ARGS+=(--create-bucket-configuration "LocationConstraint=${AWS_REGION}")
        fi
        if ! aws s3api create-bucket "${BATCH_OUTPUT_CREATE_ARGS[@]}"; then
            echo "❌ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
            exit 4
        fi
        echo "✅ S3 output bucket created: ${BATCH_OUTPUT_BUCKET}"
    else
        echo "✅ S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
    fi
else
    echo "✅ S3 output bucket same as input: ${BATCH_OUTPUT_BUCKET}"
fi
<% } else { %>
# Custom S3 output path provided — skip bucket creation
echo "✅ Using custom S3 output path: ${BATCH_OUTPUT_PATH}"
<% } %>

# ============================================================
# Guard against starting a job while the previous one is still running
# Skipped entirely with --force or when no previous job name is known.
# ============================================================
if [[ "${FORCE_NEW}" != true && -n "${TRANSFORM_JOB_NAME:-}" ]]; then
    echo "🔍 Checking previous transform job: ${TRANSFORM_JOB_NAME}"
    # A failed lookup (e.g. job not found) is mapped to an empty status
    PREV_JOB_STATUS=$(aws sagemaker describe-transform-job \
        --transform-job-name "${TRANSFORM_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query "TransformJobStatus" \
        --output text 2>/dev/null || echo "")

    if [[ "${PREV_JOB_STATUS}" == "InProgress" ]]; then
        echo "⚠️  Previous transform job is still running: ${TRANSFORM_JOB_NAME}"
        echo "   Wait for it to complete, or stop it with:"
        echo "   aws sagemaker stop-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
        echo ""
        echo "   Use --force to create a new job anyway."
        exit 4
    elif [[ "${PREV_JOB_STATUS}" == "Completed" ]]; then
        echo "✅ Previous transform job completed: ${TRANSFORM_JOB_NAME}"
        echo "   Creating a new job. Results from the previous job are in:"
        echo "   ${BATCH_OUTPUT_PATH}"
        echo ""
    fi
    # Any other status (Failed, Stopped, or not found): proceed with a new job
fi

# Derive one epoch-seconds suffix and reuse it for both resource names, so the
# model and the transform job created by this run carry matching identifiers.
TIMESTAMP="$(date +%s)"
_BATCH_NAME_PREFIX="${PROJECT_NAME}-batch"
TRANSFORM_JOB_NAME="${_BATCH_NAME_PREFIX}-job-${TIMESTAMP}"
MODEL_NAME_SM="${_BATCH_NAME_PREFIX}-model-${TIMESTAMP}"

# Hand both names to the config helper.
# NOTE(review): presumably this persists them so a later run can find the
# previous job — confirm against _update_config_var's definition.
_update_config_var "TRANSFORM_JOB_NAME" "${TRANSFORM_JOB_NAME}"
_update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"

# Step 1: Create SageMaker model
echo "📦 Creating SageMaker model: ${MODEL_NAME_SM}"

# Assemble the primary container spec as inline JSON. The Environment object
# is appended only when CONTAINER_ENV_JSON is non-empty.
_BATCH_ENV_FRAGMENT=""
if [ -n "${CONTAINER_ENV_JSON}" ]; then
    _BATCH_ENV_FRAGMENT=",\"Environment\":{${CONTAINER_ENV_JSON}}"
fi
BATCH_PRIMARY_CONTAINER="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\"${_BATCH_ENV_FRAGMENT}}"

if aws sagemaker create-model \
    --model-name "${MODEL_NAME_SM}" \
    --primary-container "${BATCH_PRIMARY_CONTAINER}" \
    --execution-role-arn "${ROLE_ARN}" \
    --region "${AWS_REGION}"; then
    echo "✅ SageMaker model created: ${MODEL_NAME_SM}"
else
    # Creation failed: print the troubleshooting checklist and stop.
    printf '%s\n' \
        "❌ Failed to create SageMaker model" \
        "   Check that:" \
        "   • The execution role ARN is valid" \
        "   • The ECR image exists and is accessible" \
        "   • The IAM role has ecr:GetDownloadUrlForLayer permission"
    exit 4
fi

# Best-effort bookkeeping: register the new model with ./do/manifest.
# Errors are silenced and ignored so a manifest problem never blocks deploy.
MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
printf -v _MODEL_MANIFEST_META '{"modelName":"%s","region":"%s"}' \
    "${MODEL_NAME_SM}" "${AWS_REGION}"
./do/manifest add \
    --type sagemaker-model \
    --id "${MODEL_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "${_MODEL_MANIFEST_META}" \
    2>/dev/null || true

# Step 2: Build transform job JSON
#
# The request body passed to `aws sagemaker create-transform-job
# --cli-input-json` is assembled as a string from the BATCH_* settings.
#
# The two join-related fragments are computed up front with a plain `if`
# rather than inline `$([ ... ] && echo ...)` substitutions: a substitution
# whose test is false exits non-zero, the assignment takes on that status,
# and with `set -e` active the script would stop silently right here whenever
# BATCH_JOIN_SOURCE is anything other than "Input".
_JOIN_OUTPUT_FIELDS=""
_JOIN_DATA_PROCESSING=""
if [ "${BATCH_JOIN_SOURCE:-None}" = "Input" ]; then
    # Joining results with the input also sets Accept and AssembleWith on the
    # output and adds a DataProcessing section.
    _JOIN_OUTPUT_FIELDS=",\"Accept\": \"application/json\", \"AssembleWith\": \"${BATCH_SPLIT_TYPE}\""
    _JOIN_DATA_PROCESSING=",\"DataProcessing\": { \"JoinSource\": \"Input\" }"
fi

# MaxConcurrentTransforms and MaxPayloadInMB fall back to 1 and 6 when unset.
TRANSFORM_JOB_JSON="{
    \"TransformJobName\": \"${TRANSFORM_JOB_NAME}\",
    \"ModelName\": \"${MODEL_NAME_SM}\",
    \"TransformInput\": {
        \"DataSource\": {
            \"S3DataSource\": {
                \"S3DataType\": \"S3Prefix\",
                \"S3Uri\": \"${BATCH_INPUT_PATH}\"
            }
        },
        \"ContentType\": \"application/json\",
        \"SplitType\": \"${BATCH_SPLIT_TYPE}\"
    },
    \"TransformOutput\": {
        \"S3OutputPath\": \"${BATCH_OUTPUT_PATH}\"
        ${_JOIN_OUTPUT_FIELDS}
    },
    \"TransformResources\": {
        \"InstanceType\": \"${INSTANCE_TYPE}\",
        \"InstanceCount\": ${BATCH_INSTANCE_COUNT}
    },
    \"MaxConcurrentTransforms\": ${BATCH_MAX_CONCURRENT_TRANSFORMS:-1},
    \"MaxPayloadInMB\": ${BATCH_MAX_PAYLOAD_IN_MB:-6},
    \"BatchStrategy\": \"${BATCH_STRATEGY}\"
    ${_JOIN_DATA_PROCESSING}
}"

# Step 3: Create transform job
# Submits the JSON request built above; on failure a checklist of the usual
# suspects is printed and the script exits with status 4.
echo "🚀 Creating transform job: ${TRANSFORM_JOB_NAME}"
if aws sagemaker create-transform-job \
    --cli-input-json "${TRANSFORM_JOB_JSON}" \
    --region "${AWS_REGION}"; then
    echo "✅ Transform job created: ${TRANSFORM_JOB_NAME}"
else
    printf '%s\n' \
        "❌ Failed to create transform job" \
        "   Check that:" \
        "   • The S3 input path exists and is accessible: ${BATCH_INPUT_PATH}" \
        "   • The S3 output path is writable: ${BATCH_OUTPUT_PATH}" \
        "   • The IAM role has s3:GetObject permission on the input path" \
        "   • The IAM role has s3:PutObject permission on the output path" \
        "   • The instance type is valid: ${INSTANCE_TYPE}" \
        "   • The instance type is available in region: ${AWS_REGION}" \
        "   • You have sufficient service quota for the instance type"
    exit 4
fi

# Best-effort bookkeeping: register the transform job with ./do/manifest.
# Errors are silenced and ignored so a manifest problem never blocks deploy.
TRANSFORM_JOB_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:transform-job/${TRANSFORM_JOB_NAME}"
printf -v _JOB_MANIFEST_META \
    '{"transformJobName":"%s","modelName":"%s","instanceType":"%s","region":"%s"}' \
    "${TRANSFORM_JOB_NAME}" "${MODEL_NAME_SM}" "${INSTANCE_TYPE}" "${AWS_REGION}"
./do/manifest add \
    --type sagemaker-transform-job \
    --id "${TRANSFORM_JOB_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "${_JOB_MANIFEST_META}" \
    2>/dev/null || true

# Step 4: Poll transform job status until completion or failure
# The job status is queried every 30 seconds. The loop ends on Completed;
# Failed, Stopped, or a describe error terminate the script with status 4.
printf '%s\n' \
    "⏳ Waiting for transform job to complete..." \
    "   This may take several minutes depending on dataset size..." \
    "   If this times out, check status with:" \
    "   aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}" \
    ""

while :; do
    # stderr is captured together with stdout so that, when the call fails,
    # JOB_STATUS holds the error text for the checks below.
    if ! JOB_STATUS=$(aws sagemaker describe-transform-job \
        --transform-job-name "${TRANSFORM_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query "TransformJobStatus" \
        --output text 2>&1); then
        # An error mentioning "expired" or "token" is reported as a
        # credentials problem rather than a job problem.
        if grep -qi "expired\|token" <<<"${JOB_STATUS}"; then
            printf '%s\n' \
                "" \
                "⚠️  Credentials expired, but the transform job is still running." \
                "   Refresh your credentials and check status with:" \
                "   aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION} --query TransformJobStatus"
            exit 4
        fi
        printf '%s\n' \
            "❌ Failed to describe transform job: ${TRANSFORM_JOB_NAME}" \
            "   Error: ${JOB_STATUS}"
        exit 4
    fi

    case "${JOB_STATUS}" in
        Completed)
            echo "✅ Transform job completed successfully!"
            break
            ;;
        Stopped)
            echo "⚠️  Transform job was stopped"
            exit 4
            ;;
        Failed)
            # Fetch the failure reason separately; fall back to "Unknown"
            # when that lookup itself fails.
            FAILURE_REASON=$(aws sagemaker describe-transform-job \
                --transform-job-name "${TRANSFORM_JOB_NAME}" \
                --region "${AWS_REGION}" \
                --query "FailureReason" \
                --output text 2>/dev/null || echo "Unknown")
            printf '%s\n' \
                "❌ Transform job failed" \
                "   Reason: ${FAILURE_REASON}" \
                "" \
                "   Check CloudWatch Logs for details:" \
                "   https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/TransformJobs" \
                "" \
                "   Verify that:" \
                "   • The S3 input path exists and contains data: ${BATCH_INPUT_PATH}" \
                "   • The input data format matches the container's expected format" \
                "   • The container's /ping and /invocations endpoints work correctly"
            exit 4
            ;;
        *)
            # InProgress and every other status: report it with a wall-clock
            # stamp, then wait before asking again.
            echo "   $(date +%H:%M:%S) Job status: ${JOB_STATUS}..."
            sleep 30
            ;;
    esac
done

# Summarize the finished run: job and model names, region, instance
# settings, image, S3 locations, and the split/batch strategy in use.
printf '%s\n' \
    "" \
    "📋 Deployment Details:" \
    "   Transform Job: ${TRANSFORM_JOB_NAME}" \
    "   Model: ${MODEL_NAME_SM}" \
    "   Region: ${AWS_REGION}" \
    "   Instance Type: ${INSTANCE_TYPE}" \
    "   Instance Count: ${BATCH_INSTANCE_COUNT}" \
    "   Image: ${ECR_REPOSITORY}:${IMAGE_TAG}" \
    "   S3 Input: ${BATCH_INPUT_PATH}" \
    "   S3 Output: ${BATCH_OUTPUT_PATH}" \
    "   Split Type: ${BATCH_SPLIT_TYPE}" \
    "   Strategy: ${BATCH_STRATEGY}" \
    ""

# Download results locally
#
# Syncs BATCH_OUTPUT_PATH into ${SCRIPT_DIR}/../batch-output, reports how many
# top-level entries that directory holds, and previews the first regular file.
# A failed sync only produces a warning; it does not abort the script.
LOCAL_OUTPUT_DIR="${SCRIPT_DIR}/../batch-output"
mkdir -p "${LOCAL_OUTPUT_DIR}"
echo "📥 Downloading results to ${LOCAL_OUTPUT_DIR}/"
if aws s3 sync "${BATCH_OUTPUT_PATH}" "${LOCAL_OUTPUT_DIR}/" --region "${AWS_REGION}"; then
    # Walk the directory with a glob instead of parsing `ls` output, so names
    # with unusual characters are handled and no pipeline can fail under
    # `set -o pipefail`.
    DOWNLOADED=0
    FIRST_FILE=""
    for _OUTPUT_ENTRY in "${LOCAL_OUTPUT_DIR}"/*; do
        # An unmatched glob stays literal; skipping it makes an empty
        # directory count as zero entries.
        [ -e "${_OUTPUT_ENTRY}" ] || continue
        DOWNLOADED=$((DOWNLOADED + 1))
        # Only a regular file can be previewed: running head/wc on a
        # sub-directory would fail and, with `set -e`, abort the deploy.
        if [ -z "${FIRST_FILE}" ] && [ -f "${_OUTPUT_ENTRY}" ]; then
            FIRST_FILE="${_OUTPUT_ENTRY##*/}"
        fi
    done
    echo "✅ Downloaded ${DOWNLOADED} file(s) to ${LOCAL_OUTPUT_DIR}/"
    echo ""

    # Display first output file preview
    if [ -n "${FIRST_FILE}" ]; then
        echo "📄 Sample output (${FIRST_FILE}):"
        head -5 "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}"
        # Deliberately not named LINES: bash maintains that variable itself
        # (terminal height) and may overwrite it after a command runs.
        OUTPUT_LINE_COUNT=$(wc -l < "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}" | tr -d ' ')
        if [ "${OUTPUT_LINE_COUNT}" -gt 5 ]; then
            echo "   ... (${OUTPUT_LINE_COUNT} total lines)"
        fi
    fi
else
    echo "⚠️  Could not download output files"
fi

# Closing hints: follow-up commands the user can run after a batch deploy.
# The "View results" hint lists the directory; `cat` on a directory only
# produces an "Is a directory" error.
echo ""
echo "📋 What's next?"
echo "   • View results:               ls batch-output/"
echo "   • Review results:             ./do/test"
echo "   • Register this deployment:   ./do/register"
echo "   • View logs:                  ./do/logs"
echo "   • Clean up when done:         ./do/clean"

<% } %>
