From 50202538e4e9e73b3611f99fa451ff9f4e1ca156 Mon Sep 17 00:00:00 2001
From: jknapp <jknapp85@gmail.com>
Date: Fri, 5 Jun 2026 19:15:25 -0700
Subject: [PATCH] cac-litespeed: supervise OLS in daemon mode so self-restarts
 don't kill PID 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cac-litespeed containers were dying at random intervals and staying 503 until
manually restarted. Root-caused on whp02 (alsacorp, 2026-06-06): the LiteSpeed
Cache / QUIC.cloud integration refreshes the QUIC.cloud IP allowlist on a
schedule and, when it changes, sends SIGUSR1 → "request a graceful server
restart". The entrypoint ran `openlitespeed -n & wait "$OLS_PID"`, so when the
OLD main PID exited after the zero-downtime handoff, `wait` returned, PID 1
(bash) exited, and the whole container went down. The exit was clean (code 0),
so even a restart policy wouldn't reliably catch it — HAProxy just served 503
until someone ran `docker start`.

Replace the `-n` foreground+wait model with a daemon-mode supervisor: start OLS
via `lswsctrl start` (its native model, where it owns the SIGUSR1 handoff and
keeps listeners bound across generations) and have PID 1 follow `lswsctrl
status`. A graceful self-restart is now invisible here (verified zero-downtime);
PID 1 only relaunches on a genuine crash (no live main), with a 5-in-60s
crash-loop cap that bails out to Docker's restart policy / the site monitor.
SIGTERM still drains and exits cleanly for docker stop / recreate.

Verified on a scratch php85 container: survives `lswsctrl restart`, survives a
raw SIGUSR1 to the main (the exact QUIC.cloud path that used to kill it),
relaunches after `kill -9` of the main, and stops cleanly in ~6s on docker stop.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/entrypoint-litespeed.sh | 97 +++++++++++++++++++++++++++++----
 1 file changed, 87 insertions(+), 10 deletions(-)

diff --git a/scripts/entrypoint-litespeed.sh b/scripts/entrypoint-litespeed.sh
index fd4c6a5..e9cc18f 100644
--- a/scripts/entrypoint-litespeed.sh
+++ b/scripts/entrypoint-litespeed.sh
@@ -154,15 +154,8 @@ service cron start >/dev/null 2>&1 || /usr/sbin/cron
 ## ---- LSCache plugin (background, non-fatal) ----
 ( /scripts/install-lscache-wp.sh "$user" >>/var/log/lscache-install.log 2>&1 || true ) &
 
-## ---- start OLS in foreground with crash-guard ----
-## openlitespeed -n = no-daemon + supervisor. Trap forwards SIGTERM cleanly.
-OLS_PID=""
-trap '[ -n "$OLS_PID" ] && kill -TERM "$OLS_PID" 2>/dev/null; wait "$OLS_PID" 2>/dev/null || true' TERM INT
-
-/usr/local/lsws/bin/openlitespeed -n &
-OLS_PID=$!
-
-## Stream OLS + customer logs to PID-1 stdout so `docker logs` works.
+## Stream OLS + customer logs to PID-1 stdout so `docker logs` works. Started
+## once, before the supervisor loop — it follows the files across OLS restarts.
 touch /usr/local/lsws/logs/error.log /usr/local/lsws/logs/access.log
 touch "/home/$user/logs/apache/error_log" "/home/$user/logs/apache/access_log"
 touch "/home/$user/logs/php-fpm/error.log"
@@ -175,4 +168,88 @@ tail -F /usr/local/lsws/logs/error.log \
         "/home/$user/logs/apache/access_log" \
         "/home/$user/logs/php-fpm/error.log" 2>/dev/null &
 
-wait "$OLS_PID"
+## ---- supervise OLS in DAEMON mode (NOT `openlitespeed -n` + wait) ----
+## OLS performs INTERNAL graceful self-restarts: the LiteSpeed Cache /
+## QUIC.cloud integration refreshes the QUIC.cloud IP allowlist on a schedule
+## and, when it changes, sends SIGUSR1 → "request a graceful server restart".
+## In `-n` foreground mode the OLD main PID exits after the zero-downtime
+## handoff; a bare `wait` on that PID lets bash (PID 1) exit and tears the whole
+## container down. Worse, that exit is *clean*, so `RestartPolicy` doesn't
+## reliably catch it — the container just stops and HAProxy serves 503 until
+## someone manually starts it. (Root-caused on whp02 alsacorp, 2026-06-06.)
+##
+## Daemon mode is OLS's native model: it owns the SIGUSR1 handoff, keeps the
+## listeners bound across generations, and rewrites lshttpd.pid to the new main.
+## PID 1 just FOLLOWS `lswsctrl status` — a graceful self-restart is invisible here
+## (zero downtime), and we only ever relaunch on a genuine crash (no live main).
+STOP_REQUESTED=0   ## becomes 1 once TERM/INT is trapped; the supervisor loop reads it
+term_handler() {   ## TERM/INT trap: flag the shutdown first, then ask OLS to stop
+  STOP_REQUESTED=1
+  /usr/local/lsws/bin/lswsctrl stop >/dev/null 2>&1 || true   ## best-effort: output and failure are ignored
+}
+trap term_handler TERM INT
+
+## Authoritative, path-independent liveness check: `lswsctrl status` prints
+## "litespeed is running with PID N." when up (and "...is not running" when
+## down). We match the running message specifically — a bare grep for "running"
+## would also match "not running". (This image keeps the pidfile under
+## /tmp/lshttpd, not logs/, so we never hard-code a pidfile path.)
+ols_running() { /usr/local/lsws/bin/lswsctrl status 2>/dev/null | grep -qi 'running with pid'; }   ## exit status 0 only when the running message is matched (case-insensitive)
+
+## Crash-loop cap: if OLS can't stay up, bail out so Docker's restart policy and
+## the site-health monitor escalate instead of us hot-looping forever.
+MAX_STARTS=5   ## bail out once this many crash detections fall inside the window
+WINDOW=60      ## window length in seconds (compared against `date +%s` deltas)
+starts=""      ## space-separated epoch timestamps of recent crash detections
+
+start_ols() {
+  /usr/local/lsws/bin/lswsctrl start >/dev/null 2>&1 || true
+  for _ in $(seq 1 20); do   ## wait up to 10s (20 polls x 0.5s) for the daemon to report running
+    ols_running && return 0
+    [ "${STOP_REQUESTED:-0}" -eq 0 ] || return 1   ## stop requested mid-start: quit polling so shutdown isn't held up
+    sleep 0.5
+  done
+  return 1
+}
+
+if ! start_ols; then
+  echo "entrypoint: OLS failed to start (not running after 10s)." >&2
+  exit 1
+fi
+echo "entrypoint: OLS started in daemon mode — $(/usr/local/lsws/bin/lswsctrl status 2>/dev/null || true)"
+
+while true; do
+  if ols_running; then
+    sleep 3 & wait "$!" || true   ## background + wait: bash runs the TERM/INT trap at once instead of after the sleep
+    continue
+  fi
+
+  ## Not running this instant. This is EITHER a clean shutdown OR the brief
+  ## handoff window of a graceful self-restart (status momentarily reports down
+  ## while the new main takes over). Grace, then re-check before judging.
+  sleep 2 & wait "$!" || true
+  if [ "$STOP_REQUESTED" -eq 0 ] && ols_running; then
+    continue
+  fi
+  if [ "$STOP_REQUESTED" -eq 1 ]; then
+    echo "entrypoint: SIGTERM received, OLS stopped — exiting."
+    exit 0
+  fi
+
+  ## Genuine crash: not running and no shutdown requested. Relaunch, capped.
+  now=$(date +%s)
+  starts="$starts $now"
+  pruned=""
+  for t in $starts; do
+    [ $((now - t)) -lt "$WINDOW" ] && pruned="$pruned $t"
+  done
+  starts="$pruned"
+  n=$(echo $starts | wc -w)
+  if [ "$n" -ge "$MAX_STARTS" ]; then
+    echo "entrypoint: OLS crash-looping ($n starts in ${WINDOW}s) — bailing out for Docker restart policy / monitor." >&2
+    exit 1
+  fi
+  [ "$STOP_REQUESTED" -eq 0 ] || continue   ## a TERM/INT may have landed while counting: never relaunch after it
+  echo "entrypoint: OLS not running — relaunching (attempt $n/$MAX_STARTS within ${WINDOW}s)." >&2
+  start_ols || true
+done