From 8a86beac7367509a0db2ebf79c452f9a96ddc5f2 Mon Sep 17 00:00:00 2001 From: Josh Knapp Date: Sat, 9 May 2026 12:09:19 -0700 Subject: [PATCH] feat: clear stale certbot lock files before each ACME run + at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit certbot uses fasteners (fcntl-based locking) to serialize concurrent invocations. The kernel auto-releases fcntl locks when the holding process exits, but the .certbot.lock FILES persist on disk — and we've seen real cases where subsequent runs report "Another instance of Certbot is already running" even when no certbot process is alive. Observed during the 2026-05-09 bundling rollout when a hung worker held a lock across container-internal Python crashes. When SSL is blocked on a customer site, this is high-impact: the certbot lock can sit stale until somebody manually deletes it. clear_stale_certbot_locks(): - probes each known lock path with fcntl.LOCK_NB - if the lock is unheld → file is stale → delete it - if the lock IS held → leave it alone (real certbot is running) Wired in: - container startup (init block) - /api/ssl single-domain handler - /api/ssl/bundle handler - /api/certificates/renew handler Safe to call repeatedly; never deletes a lock a real process holds, so can never trigger concurrent certbot runs. Co-Authored-By: Claude Opus 4.7 (1M context) --- haproxy_manager.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/haproxy_manager.py b/haproxy_manager.py index 6ff63a9..c754428 100644 --- a/haproxy_manager.py +++ b/haproxy_manager.py @@ -16,6 +16,7 @@ import tempfile import threading import time import re +import fcntl app = Flask(__name__) @@ -137,6 +138,69 @@ def validate_ip_address(ip_string): except ValueError: return False +# Certbot uses fasteners (fcntl-based) to serialize concurrent invocations. 
+# When a previous certbot run is SIGKILLed mid-execution (container restart,
+# OOM, manual kill), the kernel releases the fcntl lock automatically — but
+# the LOCK FILE on disk persists. Subsequent runs sometimes report
+# "Another instance of Certbot is already running" anyway, blocking SSL
+# issuance until someone manually clears the files.
+#
+# Our hung-process scenario (observed 2026-05-09 during the bundling rollout):
+# a certbot from a previous attempt had died, yet its lock files remained.
+# Once the process eventually exited, the locks were physically removable but
+# the symptoms persisted across multiple subsequent attempts.
+#
+# This helper probes each known lock path with a non-blocking fcntl.lockf —
+# the same POSIX record lock fasteners takes (flock() would NOT conflict
+# with it on Linux). If the probe succeeds the file is stale — delete it.
+# If it fails, a real certbot is running and we leave the file alone.
+CERTBOT_LOCK_PATHS = (
+    '/etc/letsencrypt/.certbot.lock',
+    '/var/lib/letsencrypt/.certbot.lock',
+    '/var/log/letsencrypt/.certbot.lock',
+)
+
+def clear_stale_certbot_locks():
+    """Remove stale certbot lock files. Safe to call before any ACME run.
+    Returns {'cleared': [paths...], 'held': [paths...]} for logging.
+    """
+    cleared, held = [], []
+    for path in CERTBOT_LOCK_PATHS:
+        if not os.path.exists(path):
+            continue
+        try:
+            fd = os.open(path, os.O_RDWR)
+        except FileNotFoundError:
+            continue
+        except Exception as e:
+            held.append(f'{path} (open: {e})')
+            continue
+        try:
+            fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except (BlockingIOError, PermissionError):
+            # A real process holds it; do not touch.
+            os.close(fd)
+            held.append(path)
+            continue
+        try:
+            # We still hold the lock: unlink FIRST so a racing certbot
+            # can never acquire this doomed inode — it will create and
+            # lock a fresh file instead. Releasing before unlinking
+            # would let two processes each "hold" a lock on the same
+            # path (one on the old inode, one on a new file).
+            os.remove(path)
+            cleared.append(path)
+        except FileNotFoundError:
+            cleared.append(path)
+        except Exception as e:
+            held.append(f'{path} (unlink: {e})')
+        try:
+            # Closing the fd releases our lockf lock.
+            os.close(fd)
+        except Exception:
+            pass
+    return {'cleared': cleared, 'held': held}
+
 def find_certbot_live_dir(base_domain):
     """Find the most recent certbot live directory for a domain.
     Certbot creates -NNNN suffixed dirs for repeated requests."""
@@ -403,6 +467,9 @@ def request_ssl():
         return jsonify({'status': 'error', 'message': 'Domain is required'}), 400
 
     try:
+        # Defensive: clear any stale lock left by a SIGKILLed prior run.
+        clear_stale_certbot_locks()
+
         # Request Let's Encrypt certificate
         result = subprocess.run([
             'certbot', 'certonly', '-n', '--standalone',
@@ -610,6 +677,9 @@ def request_ssl_bundle():
         cmd.extend(['-d', n])
 
     try:
+        # Defensive: clear any stale lock left by a SIGKILLed prior run.
+        clear_stale_certbot_locks()
+
         result = subprocess.run(cmd, capture_output=True, text=True)
         if result.returncode != 0:
             stderr_excerpt = (result.stderr or '').strip()[:800]
@@ -688,6 +758,9 @@ def request_ssl_bundle():
 def renew_certificates():
     """Renew all certificates and reload HAProxy"""
     try:
+        # Defensive: clear any stale lock left by a SIGKILLed prior run.
+        clear_stale_certbot_locks()
+
         # Run certbot renew
         result = subprocess.run([
             'certbot', 'renew', '--quiet'
@@ -2006,6 +2079,14 @@ def start_haproxy():
 
 
 if __name__ == '__main__':
     init_db()
+    # Clear any stale certbot locks left from a previous container instance
+    # that didn't shut down cleanly. Safe — only removes locks that no live
+    # process holds (verified via fcntl probe).
+    _stale = clear_stale_certbot_locks()
+    if _stale['cleared']:
+        logger.info(f"Cleared stale certbot lock(s) at startup: {_stale['cleared']}")
+    if _stale['held']:
+        logger.warning(f"certbot lock(s) actively held at startup: {_stale['held']}")
     certbot_register()
     generate_self_signed_cert(SSL_CERTS_DIR)