diff --git a/haproxy_manager.py b/haproxy_manager.py
index c754428..0a40db7 100644
--- a/haproxy_manager.py
+++ b/haproxy_manager.py
@@ -20,6 +20,44 @@ import fcntl
 
 app = Flask(__name__)
 
+# Default page server (port 8080) for HAProxy clients that hit an unconfigured
+# domain or whose IP is blocked. Module-level so gunicorn can import it from
+# start-up.sh (it used to live in the __main__ block, unreachable by WSGI
+# servers). Routes accept all common HTTP methods: HAProxy forwards the original
+# verb, so a POST to a blocked domain would otherwise 405 — pure log noise.
+# root_path is pinned to the main app's because 'haproxy_default' is not an
+# importable module; Flask would otherwise resolve templates/ against the CWD.
+default_app = Flask('haproxy_default', root_path=app.root_path,
+                    template_folder='templates')
+
+_ANY_METHOD = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'HEAD', 'OPTIONS']
+
+
+@default_app.route('/', methods=_ANY_METHOD)
+def default_page():
+    """Render default_page.html; each text field is overridable via env var."""
+    env = os.environ
+    title = env.get('HAPROXY_DEFAULT_PAGE_TITLE', 'Site Not Configured')
+    primary = env.get(
+        'HAPROXY_DEFAULT_MAIN_MESSAGE',
+        'This domain has not been configured yet. Please contact your '
+        'system administrator to set up this website.',
+    )
+    secondary = env.get(
+        'HAPROXY_DEFAULT_SECONDARY_MESSAGE',
+        'If you believe this is an error, please check the domain name '
+        'and try again.',
+    )
+    return render_template('default_page.html', page_title=title,
+                           main_message=primary, secondary_message=secondary)
+
+
+@default_app.route('/blocked-ip', methods=_ANY_METHOD)
+def blocked_ip_page():
+    """Render blocked_ip_page.html with HTTP 403 for any verb in _ANY_METHOD."""
+    return render_template('blocked_ip_page.html'), 403
+
+
 # Configuration
 DB_FILE = '/etc/haproxy/haproxy_config.db'
 TEMPLATE_DIR = Path('templates')
@@ -2077,7 +2115,13 @@ def start_haproxy():
             log_operation('start_haproxy', False, error_msg)
             logger.warning("Container will continue without HAProxy running")
 
-if __name__ == '__main__':
+def do_initial_setup():
+    """One-time container-startup setup: DB schema, certbot account, fresh
+    self-signed cert, config generation, and HAProxy launch. Idempotent;
+    safe to re-run, but in prod it should run exactly once per container
+    instance (via scripts/init.py before gunicorn workers spawn) so that
+    start_haproxy() doesn't race with itself across forks.
+    """
     init_db()
     # Clear any stale certbot locks left from a previous container instance
     # that didn't shut down cleanly. Safe — only removes locks that no live
@@ -2089,7 +2133,7 @@ if __name__ == '__main__':
         logger.warning(f"certbot lock(s) actively held at startup: {_stale['held']}")
     certbot_register()
     generate_self_signed_cert(SSL_CERTS_DIR)
-    
+
     # Always regenerate config before starting HAProxy to ensure compatibility
     try:
         generate_config()
@@ -2097,40 +2141,23 @@ if __name__ == '__main__':
     except Exception as e:
         logger.error(f"Failed to generate initial configuration: {e}")
         # Continue anyway, HAProxy will fail to start but the service will be available
-    
+
     start_haproxy()
     certbot_register()
-    
-    # Run Flask app on port 8000 for API and port 8080 for default page
+
+
+if __name__ == '__main__':
+    # Direct-invocation path: `python haproxy_manager.py`. Used for local dev
+    # and as a fallback. In the container this runs only when scripts/start-up.sh
+    # is bypassed; production uses gunicorn after scripts/init.py.
+    do_initial_setup()
+
+    # Run both Flask apps on the werkzeug dev server. Acceptable for local
+    # development but NOT production — gunicorn is the prod server, invoked
+    # from scripts/start-up.sh.
     from threading import Thread
-    
-    def run_default_page_server():
-        """Run a separate Flask app on port 8080 for the default page"""
-        from flask import Flask, render_template
-        default_app = Flask(__name__)
-        default_app.template_folder = 'templates'
-        
-        @default_app.route('/')
-        def default_page():
-            """Serve the default page for unmatched domains"""
-            admin_email = os.environ.get('HAPROXY_ADMIN_EMAIL', 'admin@example.com')
-            
-            return render_template('default_page.html',
-                page_title=os.environ.get('HAPROXY_DEFAULT_PAGE_TITLE', 'Site Not Configured'),
-                main_message=os.environ.get('HAPROXY_DEFAULT_MAIN_MESSAGE', 'This domain has not been configured yet. Please contact your system administrator to set up this website.'),
-                secondary_message=os.environ.get('HAPROXY_DEFAULT_SECONDARY_MESSAGE', 'If you believe this is an error, please check the domain name and try again.')
-            )
-        
-        @default_app.route('/blocked-ip')
-        def blocked_ip_page():
-            """Serve the blocked IP page for blocked clients"""
-            return render_template('blocked_ip_page.html'), 403
-        
-        default_app.run(host='0.0.0.0', port=8080)
-    
-    # Start the default page server in a separate thread
-    default_server_thread = Thread(target=run_default_page_server, daemon=True)
-    default_server_thread.start()
-    
-    # Run the main API server
+    Thread(
+        target=default_app.run, kwargs={'host': '0.0.0.0', 'port': 8080},
+        daemon=True,
+    ).start()
     app.run(host='0.0.0.0', port=8000)
diff --git a/requirements.txt b/requirements.txt
index 5473bef..157355f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,7 @@
 Flask==2.3.3
 Jinja2==3.1.2
-psutil
\ No newline at end of file
+psutil
+# Production WSGI server. Replaces Flask's built-in werkzeug dev server, which
+# is not designed for production and stalls over long uptimes (root cause of
+# the 2026-05 haproxy-manager "healthy but stalled" incidents).
+gunicorn==23.0.0
diff --git a/scripts/init.py b/scripts/init.py
new file mode 100755
index 0000000..37da8c9
--- /dev/null
+++ b/scripts/init.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+"""Container init: DB schema, certbot account, config generation, HAProxy start.
+
+Runs once per container start, BEFORE gunicorn workers spawn. Keeping init out
+of the WSGI app's module-load path avoids fork-time races (multiple workers
+attempting to start_haproxy() simultaneously, certbot lock contention, etc.).
+"""
+import sys
+
+sys.path.insert(0, '/haproxy')  # in-container directory holding haproxy_manager.py
+import haproxy_manager  # noqa: E402  (sys.path manipulation must come first)
+
+haproxy_manager.do_initial_setup()
diff --git a/scripts/start-up.sh b/scripts/start-up.sh
old mode 100644
new mode 100755
index b6f0456..d0d6584
--- a/scripts/start-up.sh
+++ b/scripts/start-up.sh
@@ -1,6 +1,20 @@
 #!/usr/bin/env bash
+# Container entrypoint. Two-phase startup:
+#   1. One-shot init (init.py): DB schema, certbot register, config gen, start HAProxy.
+#      Runs synchronously so the HAProxy launch is attempted before the API binds.
+#   2. WSGI serving via gunicorn (replacing the Flask dev server). Two gunicorn
+#      instances:
+#        - port 8080 -> default_app  (default page + blocked-ip page; HAProxy
+#          proxies unmatched / blocked traffic here)
+#        - port 8000 -> app          (management API)
+#
+# Why gunicorn:
+#   Flask's built-in werkzeug "development server" is not designed for
+#   production and degrades under sustained load. It carried haproxy-manager for
+#   a long time but stalled out around 24-48h uptime ("healthy" health-check, but
+#   every request queued behind a stuck worker). Gunicorn with --max-requests
+#   cycles workers periodically, which bounds that slow-leak failure mode.
 
-# Exit on error
 set -eo pipefail
 
 # Ensure trusted IP whitelist files exist (volume-mounted /etc/haproxy may shadow image defaults)
@@ -9,4 +23,39 @@ mkdir -p /etc/haproxy
 [ -f /etc/haproxy/trusted_ips.map ]  || : > /etc/haproxy/trusted_ips.map
 
 cron &
-python /haproxy/haproxy_manager.py
+
+# Phase 1: container init
+python /haproxy/scripts/init.py
+
+# Phase 2: WSGI servers
+# Tunable via env: HAPROXY_MGR_API_WORKERS (default 1), HAPROXY_MGR_API_TIMEOUT
+# (default 120 — API can do slow ACME calls), HAPROXY_MGR_MAX_REQUESTS (default
+# 1000 — worker recycle frequency), HAPROXY_MGR_MAX_REQUESTS_JITTER (default 100).
+API_WORKERS="${HAPROXY_MGR_API_WORKERS:-1}"
+API_TIMEOUT="${HAPROXY_MGR_API_TIMEOUT:-120}"
+MAX_REQ="${HAPROXY_MGR_MAX_REQUESTS:-1000}"
+MAX_REQ_JITTER="${HAPROXY_MGR_MAX_REQUESTS_JITTER:-100}"
+
+# Default page server on :8080, backgrounded and unsupervised (nothing restarts
+# it after the exec below). --threads 4 absorbs bursts of blocked-IP/default-page
+# hits without forking; --max-requests recycles the worker to bound memory drift.
+gunicorn \
+    --bind 0.0.0.0:8080 \
+    --workers 1 --threads 4 --worker-class gthread \
+    --max-requests "${MAX_REQ}" --max-requests-jitter "${MAX_REQ_JITTER}" \
+    --timeout 30 \
+    --access-logfile - --error-logfile - --log-level info \
+    --pythonpath /haproxy \
+    'haproxy_manager:default_app' &
+
+# Main API server on :8000 in the foreground. exec so signals propagate
+# correctly and the container exits if the API dies (docker --restart picks it
+# up). Longer --timeout because cert issuance hits ACME and can take a while.
+exec gunicorn \
+    --bind 0.0.0.0:8000 \
+    --workers "${API_WORKERS}" --threads 4 --worker-class gthread \
+    --max-requests "${MAX_REQ}" --max-requests-jitter "${MAX_REQ_JITTER}" \
+    --timeout "${API_TIMEOUT}" \
+    --access-logfile - --error-logfile - --log-level info \
+    --pythonpath /haproxy \
+    'haproxy_manager:app'