From 89c74c10cf18acaa6dfb104e278e8d12e941090e Mon Sep 17 00:00:00 2001
From: Josh Knapp
Date: Wed, 1 Jul 2026 09:07:11 -0700
Subject: [PATCH] fix(supervisor): restart haproxy in-place if it dies while container lives

haproxy runs as a background child of PID 1 (gunicorn) with nothing
watching it after init. If the haproxy master dies mid-life (observed
2026-07-01 on whp01: SIGABRT -> exit 134, reaped by gunicorn and logged as
"Worker (pid:22) exited"), the container stays "up", Docker's --restart
never fires, and haproxy is down until the external host watchdog
full-restarts the whole container minutes later (dropping every
connection).

Add an in-container supervisor loop in start-up.sh (Phase 1.5) that runs
scripts/ensure_haproxy.py every HAPROXY_SUPERVISOR_INTERVAL (default 15s).
ensure_haproxy.py calls the existing, idempotent start_haproxy() only when
haproxy isn't running (psutil guard), reviving it in place within one
interval with no container restart. Same entrypoint-supervision pattern
shipped for cac-litespeed.

Validated locally: killing haproxy -> revived with new PIDs in ~one
interval, container stayed healthy, no spurious restarts while healthy.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
Review amendments (this section is not part of the commit message):
* start-up.sh: fall back to 15 when HAPROXY_SUPERVISOR_INTERVAL is not a
  whole number >= 1, so a bad override cannot turn the loop into a spin.
* ensure_haproxy.py: main() gained a docstring and a return annotation.
* The post-image blob ids on the "index" lines predate these amendments.

 VERSION                   |  2 +-
 scripts/ensure_haproxy.py | 58 +++++++++++++++++++++++++++++++++++++++
 scripts/start-up.sh       | 23 ++++++++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 scripts/ensure_haproxy.py

diff --git a/VERSION b/VERSION
index 72c663c..0f03152 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2026.06.3
+2026.07.1
diff --git a/scripts/ensure_haproxy.py b/scripts/ensure_haproxy.py
new file mode 100644
index 0000000..a91755e
--- /dev/null
+++ b/scripts/ensure_haproxy.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""Idempotent haproxy liveness check — driven by the in-container supervisor loop.
+
+Why this exists
+---------------
+haproxy runs as a *background child of PID 1* (gunicorn) — it is started once at
+container init (scripts/init.py -> do_initial_setup -> start_haproxy) and then
+left running. Nothing supervises it after that. If the haproxy master process
+dies mid-life (SIGABRT -> exit 134, segfault, or an OOM of the haproxy master),
+the container stays "up" because gunicorn is still PID 1, so Docker's
+`--restart` policy never fires. haproxy then stays down until the *external*
+host watchdog (haproxy-watchdog.sh) notices port 80 is dead for ~3 minutes and
+does a full `docker restart` — which drops every in-flight connection.
+
+This script closes that gap: called on a short interval by the supervisor loop
+in start-up.sh, it re-launches haproxy *in place* within one interval.
+
+Safety
+------
+start_haproxy() is guarded by `is_process_running('haproxy')` (psutil-based, so
+it works in this container which has no `ps`), so calling this while haproxy is
+healthy is a cheap no-op. It only ever acts when haproxy is genuinely gone.
+"""
+import sys
+
+sys.path.insert(0, '/haproxy')
+import haproxy_manager  # noqa: E402 (sys.path manipulation must come first)
+
+
+def main() -> int:
+    """Start haproxy in place if no haproxy process is running.
+
+    Returns:
+        0 if haproxy is running, either already or after the restart
+        attempt; 1 if it is still not running after start_haproxy().
+    """
+    if haproxy_manager.is_process_running('haproxy'):
+        return 0
+
+    haproxy_manager.logger.warning(
+        "[haproxy-supervisor] haproxy process not found — attempting in-place restart"
+    )
+    # start_haproxy() validates the config (and regenerates it if invalid)
+    # before launching, and swallows its own errors, so it will not raise here.
+    haproxy_manager.start_haproxy()
+
+    if haproxy_manager.is_process_running('haproxy'):
+        haproxy_manager.logger.info("[haproxy-supervisor] haproxy restarted in place")
+        return 0
+
+    haproxy_manager.logger.error(
+        "[haproxy-supervisor] haproxy restart FAILED — still not running after start_haproxy()"
+    )
+    return 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/scripts/start-up.sh b/scripts/start-up.sh
index d0d6584..c022a4c 100755
--- a/scripts/start-up.sh
+++ b/scripts/start-up.sh
@@ -27,6 +27,29 @@ cron &
 # Phase 1: container init
 python /haproxy/scripts/init.py
 
+# Phase 1.5: in-container haproxy supervisor.
+# haproxy runs as a background child of PID 1 (gunicorn) with NOTHING watching
+# it after init. If the haproxy master dies mid-life (e.g. SIGABRT -> exit 134,
+# segfault), the container stays "up" (gunicorn is PID 1), Docker's --restart
+# policy never fires, and haproxy is down until the external host watchdog
+# full-restarts the whole container minutes later (dropping every connection).
+# This loop revives haproxy in place within one interval. ensure_haproxy.py is
+# idempotent — a cheap no-op whenever haproxy is already running.
+HAPROXY_SUPERVISOR_INTERVAL="${HAPROXY_SUPERVISOR_INTERVAL:-15}"
+# Reject anything that is not a whole number of seconds >= 1. A non-numeric
+# value makes `sleep` fail immediately and 0 makes it return immediately;
+# either way the loop below would spin, spawning python back-to-back.
+if ! [ "${HAPROXY_SUPERVISOR_INTERVAL}" -ge 1 ] 2>/dev/null; then
+  echo "[haproxy-supervisor] invalid HAPROXY_SUPERVISOR_INTERVAL='${HAPROXY_SUPERVISOR_INTERVAL}'; using 15" >&2
+  HAPROXY_SUPERVISOR_INTERVAL=15
+fi
+(
+  while true; do
+    sleep "${HAPROXY_SUPERVISOR_INTERVAL}"
+    python /haproxy/scripts/ensure_haproxy.py 2>&1 || true
+  done
+) &
+
 # Phase 2: WSGI servers
 # Tunable via env: HAPROXY_MGR_API_WORKERS (default 1), HAPROXY_MGR_API_TIMEOUT
 # (default 120 — API can do slow ACME calls), HAPROXY_MGR_MAX_REQUESTS (default