#!/usr/bin/env bash
# Start the in-box dockerd. Launched by the host via
# `docker exec -d --user root`. Idempotent — safe to call again on
# `agentbox start`. The storage driver is selected at runtime (see
# select_storage_driver below): the kernel-native `overlay2` when a probe
# proves it works on the data-root filesystem, otherwise `fuse-overlayfs`.
# The chosen driver is written to /etc/docker/daemon.json before launch.

set -euo pipefail

# Idempotency guard: an existing dockerd process means there is nothing to do.
pgrep -x dockerd >/dev/null && exit 0

# Directories used further down: the data root, the runtime directory that
# holds the pidfile and socket, and the directory for dockerd.log.
mkdir -p \
  /var/lib/docker \
  /var/run \
  /var/log/agentbox

# /var/run is part of the container's writable layer rather than a volume, so
# whatever the previous dockerd left there is still present after a
# `docker stop`/`start` cycle. dockerd will not start while
# `/var/run/docker.pid` names a PID that exists in /proc, and after a restart
# that number has most likely been handed to another process (probably the new
# `sleep infinity`). The pgrep guard above has already established that no
# real dockerd is alive, so dropping the pidfile and the stale socket is safe.
rm -f \
  /var/run/docker.pid \
  /var/run/docker.sock

# Cgroup v2 + unprivileged DinD on OrbStack/Docker Desktop: Docker's standard
# hardening bind-mounts /sys/fs/cgroup and /proc/sys read-only from the host
# into the outer container. dockerd needs to write to both:
#   * /sys/fs/cgroup  - mkdir /sys/fs/cgroup/docker, its own cgroup slice for
#                       child containers
#   * /proc/sys       - write /proc/sys/net/ipv6/conf/<veth>/disable_ipv6
#                       during default bridge setup
# Otherwise `docker run` fails with EROFS or "failed to disable IPv6".
# SYS_ADMIN plus the private cgroup namespace allow the RW remount, and the
# writes stay within the box's own namespaces (not the host). A failed remount
# is tolerated because some hosts already mount these read-write.
for ro_mount in /sys/fs/cgroup /proc/sys; do
  mount -o remount,rw "$ro_mount" 2>/dev/null || true
done

# --- Storage-driver selection -------------------------------------------------
# The inner dockerd's data root (/var/lib/docker, a Docker named volume) used
# to be pinned to fuse-overlayfs. fuse-overlayfs is broken on recent kernels
# (e.g. Docker Desktop's 6.x linuxkit kernel): inner `docker run` fails at
# execve() with "exec ...: invalid argument". The kernel-native overlay2
# driver works when the data-root filesystem can carry an overlay mount, which
# the ext4 named volume can. We pick overlay2 when a probe proves it works,
# else fall back to fuse-overlayfs.
#
# dockerd refuses to switch drivers once its data root is populated, so if the
# data root is already initialized under one driver we reuse that driver and
# skip the probe — a box created under one driver never switches.
# Data root of the inner dockerd. The overlay2 probe directory and the
# per-driver subdirectories inspected during driver selection live under it.
readonly DOCKER_DATA_ROOT=/var/lib/docker
# Daemon configuration file that receives the resolved storage driver.
readonly DAEMON_JSON=/etc/docker/daemon.json

#######################################
# Check whether the kernel-native overlay filesystem is usable on the
# filesystem backing the data root, by mounting a scratch overlay there and
# executing a binary through it.
# Globals:   DOCKER_DATA_ROOT (read)
# Arguments: none
# Returns:   0 when a binary could be executed from the scratch overlay,
#            1 otherwise (no overlay support, setup failure, mount failure,
#            or the staged binary did not run successfully).
#######################################
probe_overlay2() {
  # No overlay entry in /proc/filesystems means there is nothing to test.
  if ! grep -qw overlay /proc/filesystems 2>/dev/null; then
    return 1
  fi

  # The scratch directory is deliberately created inside the data root so the
  # test overlay sits on the SAME filesystem the real graph will use. Probing
  # under /tmp would exercise the container's overlayfs writable layer, which
  # is the wrong filesystem.
  local scratch
  scratch="$(mktemp -d "$DOCKER_DATA_ROOT/.overlay2-probe.XXXXXX" 2>/dev/null)" || return 1

  local lowerdir="$scratch/lower"
  local upperdir="$scratch/upper"
  local workdir="$scratch/work"
  local mountpoint="$scratch/merged"
  local rc=1

  # Build the overlay layout and stage a known-good executable in the lower
  # layer so that it shows up in the merged view.
  if mkdir -p "$lowerdir" "$upperdir" "$workdir" "$mountpoint" \
     && cp /bin/true "$lowerdir/probe-bin" 2>/dev/null; then
    chmod 0755 "$lowerdir/probe-bin" 2>/dev/null || true

    if mount -t overlay overlay \
         -o "lowerdir=$lowerdir,upperdir=$upperdir,workdir=$workdir" \
         "$mountpoint" 2>/dev/null; then
      # A successful mount alone proves little: the fuse-overlayfs failure
      # this guards against only appears at execve() from the merged
      # directory, so actually run the staged binary.
      if "$mountpoint/probe-bin" >/dev/null 2>&1; then
        rc=0
      fi
      # Regular unmount first, lazy unmount as a fallback; both best-effort.
      umount "$mountpoint" 2>/dev/null || umount -l "$mountpoint" 2>/dev/null || true
    fi
  fi

  # Single cleanup point for every outcome after mktemp succeeded.
  rm -rf "$scratch"
  return "$rc"
}

#######################################
# Decide which storage driver the inner dockerd should use.
# Globals:   DOCKER_DATA_ROOT (read)
# Arguments: none
# Outputs:   the driver name ("overlay2" or "fuse-overlayfs") on stdout
# Returns:   0
#######################################
select_storage_driver() {
  local candidate graph_dir

  # Step 1: a data root that already holds content for a driver keeps that
  # driver. dockerd cannot switch a populated data root, and this script runs
  # again on every `agentbox start`. overlay2 is checked first, so it wins if
  # both directories happen to be populated. Empty directories do not count.
  for candidate in overlay2 fuse-overlayfs; do
    graph_dir="$DOCKER_DATA_ROOT/$candidate"
    if [ -d "$graph_dir" ] && [ -n "$(ls -A "$graph_dir" 2>/dev/null)" ]; then
      echo "$candidate"
      return 0
    fi
  done

  # Step 2: fresh data root. Use overlay2 only if the probe against the
  # data-root filesystem succeeds; otherwise fall back to fuse-overlayfs.
  if probe_overlay2; then
    echo "overlay2"
  else
    echo "fuse-overlayfs"
  fi
}

# A previous run that was hard-killed mid-probe can leave its scratch
# directory behind. Removing it is cosmetic only: driver selection looks
# solely at the overlay2/ and fuse-overlayfs/ subdirectories, and dockerd
# ignores non-driver dirs.
rm -rf "$DOCKER_DATA_ROOT"/.overlay2-probe.* 2>/dev/null || true

STORAGE_DRIVER=$(select_storage_driver)

# Regenerate daemon.json with the resolved driver on every start.
# `iptables: true` is kept for inner bridge networking. Because an
# initialized data root always resolves to the driver it was created with,
# rewriting the file never switches drivers mid-life.
mkdir -p /etc/docker
printf '{ "storage-driver": "%s", "iptables": true }\n' "$STORAGE_DRIVER" \
  >"$DAEMON_JSON"

# Start dockerd.log afresh for this run with a marker line naming the driver;
# dockerd appends its own output after it.
printf 'agentbox-dockerd-start: storage-driver=%s\n' "$STORAGE_DRIVER" \
  >/var/log/agentbox/dockerd.log
# --- end storage-driver selection --------------------------------------------

# Launch dockerd detached from this script: nohup + & + disown let it survive
# the `docker exec -d` that started us returning. The job is disowned right
# away (not after the wait below) so that no exit path can skip it.
#
# dockerd is started without flags and reads /etc/docker/daemon.json on its
# own, which keeps the start path debuggable from inside the container: edit
# the file and relaunch dockerd by hand. Note that re-running THIS script
# regenerates daemon.json, so manual edits do not survive it.
nohup dockerd >>/var/log/agentbox/dockerd.log 2>&1 &
dockerd_pid=$!
disown -a

# Wait for the socket to become accept()-able. Bounded by ~30s of sleeping
# (300 polls, 0.1s apart) — first start has to initialize iptables chains and
# the storage graphdriver (fuse-overlayfs is noticeably slower to initialize
# than overlay2).
dockerd_ready=0
for ((attempt = 0; attempt < 300; attempt++)); do
  if [ -S /var/run/docker.sock ] \
     && docker -H unix:///var/run/docker.sock info >/dev/null 2>&1; then
    dockerd_ready=1
    break
  fi
  # Stop polling as soon as dockerd is gone (e.g. it rejected its config or
  # the storage driver); waiting out the full timeout would only hide that.
  if ! kill -0 "$dockerd_pid" 2>/dev/null; then
    break
  fi
  sleep 0.1
done

# Do not report success for a daemon that never came up. The diagnostic goes
# to stderr and to dockerd.log, because stderr is not visible to anyone when
# the script is launched with `docker exec -d`.
if [ "$dockerd_ready" != 1 ]; then
  if kill -0 "$dockerd_pid" 2>/dev/null; then
    failure="dockerd (pid $dockerd_pid) not ready after ~30s"
  else
    failure="dockerd exited before becoming ready"
  fi
  printf 'agentbox-dockerd-start: %s; see /var/log/agentbox/dockerd.log\n' \
    "$failure" >&2
  printf 'agentbox-dockerd-start: %s\n' "$failure" \
    >>/var/log/agentbox/dockerd.log
  exit 1
fi
