diff --git a/scripts/entrypoint-litespeed.sh b/scripts/entrypoint-litespeed.sh
index fd4c6a5..e9cc18f 100644
--- a/scripts/entrypoint-litespeed.sh
+++ b/scripts/entrypoint-litespeed.sh
@@ -154,15 +154,8 @@ service cron start >/dev/null 2>&1 || /usr/sbin/cron
 ## ---- LSCache plugin (background, non-fatal) ----
 ( /scripts/install-lscache-wp.sh "$user" >>/var/log/lscache-install.log 2>&1 || true ) &
 
-## ---- start OLS in foreground with crash-guard ----
-## openlitespeed -n = no-daemon + supervisor. Trap forwards SIGTERM cleanly.
-OLS_PID=""
-trap '[ -n "$OLS_PID" ] && kill -TERM "$OLS_PID" 2>/dev/null; wait "$OLS_PID" 2>/dev/null || true' TERM INT
-
-/usr/local/lsws/bin/openlitespeed -n &
-OLS_PID=$!
-
-## Stream OLS + customer logs to PID-1 stdout so `docker logs` works.
+## Stream OLS + customer logs to PID-1 stdout so `docker logs` works. Started
+## once, before the supervisor loop — it follows the files across OLS restarts.
 touch /usr/local/lsws/logs/error.log /usr/local/lsws/logs/access.log
 touch "/home/$user/logs/apache/error_log" "/home/$user/logs/apache/access_log"
 touch "/home/$user/logs/php-fpm/error.log"
@@ -175,4 +168,135 @@ tail -F /usr/local/lsws/logs/error.log \
         "/home/$user/logs/apache/access_log" \
         "/home/$user/logs/php-fpm/error.log" 2>/dev/null &
 
-wait "$OLS_PID"
+## ---- supervise OLS in DAEMON mode (NOT `openlitespeed -n` + wait) ----
+## OLS performs INTERNAL graceful self-restarts: the LiteSpeed Cache /
+## QUIC.cloud integration refreshes the QUIC.cloud IP allowlist on a schedule
+## and, when it changes, sends SIGUSR1 → "request a graceful server restart".
+## In `-n` foreground mode the OLD main PID exits after the zero-downtime
+## handoff; a bare `wait` on that PID lets bash (PID 1) exit and tears the whole
+## container down. Worse, that exit is *clean*, so `RestartPolicy` doesn't
+## reliably catch it — the container just stops and HAProxy serves 503 until
+## someone manually starts it. (Root-caused on whp02 alsacorp, 2026-06-06.)
+##
+## Daemon mode is OLS's native model: it owns the SIGUSR1 handoff, keeps the
+## listeners bound across generations, and rewrites lshttpd.pid to the new main.
+## PID 1 just polls `lswsctrl status` — a graceful self-restart is invisible
+## here (zero downtime), and we only relaunch on a genuine crash (no live main).
+
+## Set by term_handler; once 1 the supervisor never (re)launches OLS again.
+STOP_REQUESTED=0
+
+## TERM/INT handler: record the request, then ask OLS to stop. The supervisor
+## loop notices the flag on its next pass and exits 0.
+term_handler() {
+  STOP_REQUESTED=1
+  /usr/local/lsws/bin/lswsctrl stop >/dev/null 2>&1 || true
+}
+trap term_handler TERM INT
+
+## Signal-responsive sleep ($1 = seconds). Bash defers a trap until the current
+## foreground command finishes, so a plain `sleep 3` delays SIGTERM handling by
+## up to 3s of the `docker stop` grace period; `wait` on a background sleep
+## returns as soon as the signal lands. `|| true`: an interrupted `wait`
+## returns >128, which must not trip `set -e`.
+nap() {
+  sleep "$1" &
+  wait "$!" || true
+}
+
+## Authoritative, path-independent liveness check: `lswsctrl status` prints
+## "litespeed is running with PID N." when up (and "...is not running" when
+## down). We match the running message specifically — a bare grep for "running"
+## would also match "not running". (This image keeps the pidfile under
+## /tmp/lshttpd, not logs/, so we never hard-code a pidfile path.)
+ols_running() {
+  /usr/local/lsws/bin/lswsctrl status 2>/dev/null | grep -qi 'running with pid'
+}
+
+## Crash-loop cap: if OLS can't stay up, bail out so Docker's restart policy and
+## the site-health monitor escalate instead of us hot-looping forever.
+## `starts` holds the epoch seconds of every launch (the initial one included);
+## at most MAX_STARTS launches are allowed within any WINDOW seconds.
+MAX_STARTS=5
+WINDOW=60
+starts=""
+
+## Launch the daemon and poll (every 0.5s, up to 10s) until it reports running.
+## Returns 0 once running; 1 on timeout or when a shutdown has been requested.
+start_ols() {
+  [ "$STOP_REQUESTED" -eq 0 ] || return 1
+  /usr/local/lsws/bin/lswsctrl start >/dev/null 2>&1 || true
+  start_polls=0
+  while [ "$start_polls" -lt 20 ]; do
+    ## A TERM that lands mid-start has already run `lswsctrl stop`; give up
+    ## at once rather than spending the grace period waiting for "running".
+    [ "$STOP_REQUESTED" -eq 0 ] || return 1
+    if ols_running; then
+      return 0
+    fi
+    nap 0.5
+    start_polls=$((start_polls + 1))
+  done
+  return 1
+}
+
+## Final stop + clean exit. The stop is re-issued (errors ignored) because the
+## signal may have arrived while a launch was in flight, in which case
+## term_handler's stop ran before the daemon was up.
+finish_shutdown() {
+  /usr/local/lsws/bin/lswsctrl stop >/dev/null 2>&1 || true
+  echo "entrypoint: SIGTERM received, OLS stopped — exiting."
+  exit 0
+}
+
+if ! start_ols; then
+  if [ "$STOP_REQUESTED" -eq 1 ]; then
+    finish_shutdown
+  fi
+  echo "entrypoint: OLS failed to start (not running after 10s)." >&2
+  exit 1
+fi
+starts=$(date +%s)
+echo "entrypoint: OLS started in daemon mode — $(/usr/local/lsws/bin/lswsctrl status 2>/dev/null || true)"
+
+while true; do
+  ## Checked first on every pass, so a stop request is honoured whether or not
+  ## OLS is (still, or again) reported as running.
+  if [ "$STOP_REQUESTED" -eq 1 ]; then
+    finish_shutdown
+  fi
+
+  if ols_running; then
+    nap 3
+    continue
+  fi
+
+  ## Not running this instant. This is EITHER a clean shutdown OR the brief
+  ## handoff window of a graceful self-restart (status momentarily reports down
+  ## while the new main takes over). Grace, then re-check before judging.
+  nap 2
+  if [ "$STOP_REQUESTED" -eq 1 ] || ols_running; then
+    continue
+  fi
+
+  ## Genuine crash: not running and no shutdown requested. Keep only the
+  ## launches still inside the window and count them.
+  now=$(date +%s)
+  recent=""
+  n=0
+  for t in $starts; do
+    if [ $((now - t)) -lt "$WINDOW" ]; then
+      recent="$recent $t"
+      n=$((n + 1))
+    fi
+  done
+  ## Decide BEFORE announcing a relaunch, so the log never claims a relaunch
+  ## that does not happen.
+  if [ "$n" -ge "$MAX_STARTS" ]; then
+    echo "entrypoint: OLS crash-looping ($n starts in ${WINDOW}s) — bailing out for Docker restart policy / monitor." >&2
+    exit 1
+  fi
+  starts="$recent $now"
+  echo "entrypoint: OLS not running — relaunching (start $((n + 1))/$MAX_STARTS within ${WINDOW}s)." >&2
+  start_ols || true
+done