diff --git a/haproxy_manager.py b/haproxy_manager.py
index c754428..0a40db7 100644
--- a/haproxy_manager.py
+++ b/haproxy_manager.py
@@ -20,6 +20,50 @@ import fcntl
 
 app = Flask(__name__)
 
+# Default page server (port 8080) — served to HAProxy clients whose request hit
+# an unconfigured domain OR whose IP is blocked. Defined at module level so
+# gunicorn can import it from start-up.sh; previously this was created inside
+# the __main__ block, which prevented out-of-process WSGI servers from reaching
+# it. Routes accept ALL HTTP methods because HAProxy proxies the original
+# request verb unchanged — a POST to a blocked domain would otherwise 405,
+# which is just log noise.
+# root_path is pinned to this file's directory: 'haproxy_default' is not an
+# importable module, so Flask would otherwise fall back to os.getcwd() and
+# template lookup would depend on where gunicorn happened to be launched.
+default_app = Flask(
+    'haproxy_default',
+    root_path=os.path.dirname(os.path.abspath(__file__)),
+)
+default_app.template_folder = 'templates'
+
+_ANY_METHOD = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'HEAD', 'OPTIONS']
+
+
+@default_app.route('/', methods=_ANY_METHOD)
+def default_page():
+    """Serve the default page for unmatched domains."""
+    return render_template(
+        'default_page.html',
+        page_title=os.environ.get('HAPROXY_DEFAULT_PAGE_TITLE', 'Site Not Configured'),
+        main_message=os.environ.get(
+            'HAPROXY_DEFAULT_MAIN_MESSAGE',
+            'This domain has not been configured yet. Please contact your '
+            'system administrator to set up this website.'
+        ),
+        secondary_message=os.environ.get(
+            'HAPROXY_DEFAULT_SECONDARY_MESSAGE',
+            'If you believe this is an error, please check the domain name '
+            'and try again.'
+        ),
+    )
+
+
+@default_app.route('/blocked-ip', methods=_ANY_METHOD)
+def blocked_ip_page():
+    """Serve the blocked IP page for blocked clients (HTTP 403)."""
+    return render_template('blocked_ip_page.html'), 403
+
+
 # Configuration
 DB_FILE = '/etc/haproxy/haproxy_config.db'
 TEMPLATE_DIR = Path('templates')
@@ -2077,7 +2121,13 @@ def start_haproxy():
         log_operation('start_haproxy', False, error_msg)
         logger.warning("Container will continue without HAProxy running")
 
-if __name__ == '__main__':
+def do_initial_setup():
+    """One-time container-startup setup: DB schema, certbot account, fresh
+    self-signed cert, config generation, and HAProxy launch. Idempotent;
+    safe to re-run, but in prod it should run exactly once per container
+    instance (via scripts/init.py before gunicorn workers spawn) so that
+    start_haproxy() doesn't race with itself across forks.
+    """
     init_db()
     # Clear any stale certbot locks left from a previous container instance
     # that didn't shut down cleanly. Safe — only removes locks that no live
@@ -2089,7 +2139,7 @@ if __name__ == '__main__':
         logger.warning(f"certbot lock(s) actively held at startup: {_stale['held']}")
     certbot_register()
     generate_self_signed_cert(SSL_CERTS_DIR)
-    
+
     # Always regenerate config before starting HAProxy to ensure compatibility
     try:
         generate_config()
@@ -2097,40 +2147,23 @@ if __name__ == '__main__':
     except Exception as e:
         logger.error(f"Failed to generate initial configuration: {e}")
         # Continue anyway, HAProxy will fail to start but the service will be available
-    
+
     start_haproxy()
     certbot_register()
-    
-    # Run Flask app on port 8000 for API and port 8080 for default page
+
+
+if __name__ == '__main__':
+    # Direct-invocation path: `python haproxy_manager.py`. Used for local dev
+    # and as a fallback. In the container this runs only when scripts/start-up.sh
+    # is bypassed; production uses gunicorn after scripts/init.py.
+    do_initial_setup()
+
+    # Run both Flask apps on the werkzeug dev server. Acceptable for local
+    # development but NOT production — gunicorn is the prod server, invoked
+    # from scripts/start-up.sh.
     from threading import Thread
-    
-    def run_default_page_server():
-        """Run a separate Flask app on port 8080 for the default page"""
-        from flask import Flask, render_template
-        default_app = Flask(__name__)
-        default_app.template_folder = 'templates'
-        
-        @default_app.route('/')
-        def default_page():
-            """Serve the default page for unmatched domains"""
-            admin_email = os.environ.get('HAPROXY_ADMIN_EMAIL', 'admin@example.com')
-            
-            return render_template('default_page.html',
-                page_title=os.environ.get('HAPROXY_DEFAULT_PAGE_TITLE', 'Site Not Configured'),
-                main_message=os.environ.get('HAPROXY_DEFAULT_MAIN_MESSAGE', 'This domain has not been configured yet. Please contact your system administrator to set up this website.'),
-                secondary_message=os.environ.get('HAPROXY_DEFAULT_SECONDARY_MESSAGE', 'If you believe this is an error, please check the domain name and try again.')
-            )
-        
-        @default_app.route('/blocked-ip')
-        def blocked_ip_page():
-            """Serve the blocked IP page for blocked clients"""
-            return render_template('blocked_ip_page.html'), 403
-        
-        default_app.run(host='0.0.0.0', port=8080)
-    
-    # Start the default page server in a separate thread
-    default_server_thread = Thread(target=run_default_page_server, daemon=True)
-    default_server_thread.start()
-    
-    # Run the main API server
+    Thread(
+        target=lambda: default_app.run(host='0.0.0.0', port=8080),
+        daemon=True,
+    ).start()
     app.run(host='0.0.0.0', port=8000)
diff --git a/requirements.txt b/requirements.txt
index 5473bef..157355f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,7 @@
 Flask==2.3.3
 Jinja2==3.1.2
-psutil
\ No newline at end of file
+psutil
+# Production WSGI server. Replaces Flask's built-in werkzeug dev server, which
+# is development-only and leaks workers over long uptimes (root cause of the
+# 2026-05 haproxy-manager "healthy but stalled" incidents).
+gunicorn==23.0.0
diff --git a/scripts/init.py b/scripts/init.py
new file mode 100755
index 0000000..37da8c9
--- /dev/null
+++ b/scripts/init.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+"""Container init: DB schema, certbot account, config generation, HAProxy start.
+
+Runs once per container start, BEFORE gunicorn workers spawn. Keeping init out
+of the WSGI app's module-load path avoids fork-time races (multiple workers
+attempting to start_haproxy() simultaneously, certbot lock contention, etc.).
+"""
+import sys
+
+sys.path.insert(0, '/haproxy')
+import haproxy_manager  # noqa: E402 (sys.path manipulation must come first)
+
+haproxy_manager.do_initial_setup()
diff --git a/scripts/start-up.sh b/scripts/start-up.sh
old mode 100644
new mode 100755
index b6f0456..d0d6584
--- a/scripts/start-up.sh
+++ b/scripts/start-up.sh
@@ -1,6 +1,20 @@
 #!/usr/bin/env bash
+# Container entrypoint. Two-phase startup:
+#   1. One-shot init (init.py): DB schema, certbot register, config gen, start HAProxy.
+#      Runs synchronously and to completion so haproxy is up before the API binds.
+#   2. WSGI serving via gunicorn (replacing the Flask dev server). Two gunicorn
+#      instances:
+#        - port 8080 -> default_app (default page + blocked-ip page; HAProxy
+#          proxies unmatched / blocked traffic here)
+#        - port 8000 -> app (management API)
+#
+# Why gunicorn:
+#   Flask's built-in werkzeug "development server" is not built for production
+#   and leaks workers under sustained load. It carried haproxy-manager for a long
+#   time but stalled out around 24-48h uptime ("healthy" health-check, but every
+#   request queued behind a stuck worker). Gunicorn with --max-requests cycles
+#   workers periodically, which prevents the slow-leak failure mode entirely.
 
-# Exit on error
 set -eo pipefail
 
 # Ensure trusted IP whitelist files exist (volume-mounted /etc/haproxy may shadow image defaults)
@@ -9,4 +23,39 @@ mkdir -p /etc/haproxy
 [ -f /etc/haproxy/trusted_ips.map ] || : > /etc/haproxy/trusted_ips.map
 
 cron &
-python /haproxy/haproxy_manager.py
+
+# Phase 1: container init
+python /haproxy/scripts/init.py
+
+# Phase 2: WSGI servers
+# Tunable via env: HAPROXY_MGR_API_WORKERS (default 1), HAPROXY_MGR_API_TIMEOUT
+# (default 120 — API can do slow ACME calls), HAPROXY_MGR_MAX_REQUESTS (default
+# 1000 — worker recycle frequency).
+API_WORKERS="${HAPROXY_MGR_API_WORKERS:-1}"
+API_TIMEOUT="${HAPROXY_MGR_API_TIMEOUT:-120}"
+MAX_REQ="${HAPROXY_MGR_MAX_REQUESTS:-1000}"
+MAX_REQ_JITTER="${HAPROXY_MGR_MAX_REQUESTS_JITTER:-100}"
+
+# Default page server on :8080. Stays in the background.
+# --threads 4 lets one worker handle bursts of blocked-IP/default-page hits
+# without forking. --max-requests recycles the worker to bound memory drift.
+gunicorn \
+    --bind 0.0.0.0:8080 \
+    --workers 1 --threads 4 --worker-class gthread \
+    --max-requests "${MAX_REQ}" --max-requests-jitter "${MAX_REQ_JITTER}" \
+    --timeout 30 \
+    --access-logfile - --error-logfile - --log-level info \
+    --pythonpath /haproxy \
+    'haproxy_manager:default_app' &
+
+# Main API server on :8000 in the foreground. exec so signals propagate
+# correctly and the container exits if the API dies (docker --restart picks it
+# up). Longer --timeout because cert issuance hits ACME and can take a while.
+exec gunicorn \
+    --bind 0.0.0.0:8000 \
+    --workers "${API_WORKERS}" --threads 4 --worker-class gthread \
+    --max-requests "${MAX_REQ}" --max-requests-jitter "${MAX_REQ_JITTER}" \
+    --timeout "${API_TIMEOUT}" \
+    --access-logfile - --error-logfile - --log-level info \
+    --pythonpath /haproxy \
+    'haproxy_manager:app'