feat: clear stale certbot lock files before each ACME run + at startup
All checks were successful
HAProxy Manager Build and Push / Build-and-Push (push) Successful in 53s
All checks were successful
HAProxy Manager Build and Push / Build-and-Push (push) Successful in 53s
certbot uses fasteners (fcntl-based locking) to serialize concurrent invocations. The kernel auto-releases fcntl locks when the holding process exits, but the .certbot.lock FILES persist on disk — and we've seen real cases where subsequent runs report "Another instance of Certbot is already running" even when no certbot process is alive. Observed during the 2026-05-09 bundling rollout when a hung worker held a lock across container-internal Python crashes. When SSL is blocked on a customer site, this is high-impact: the certbot lock can sit stale until somebody manually deletes it. clear_stale_certbot_locks(): - probes each known lock path with fcntl.LOCK_NB - if the lock is unheld → file is stale → delete it - if the lock IS held → leave it alone (real certbot is running) Wired in: - container startup (init block) - /api/ssl single-domain handler - /api/ssl/bundle handler - /api/certificates/renew handler Safe to call repeatedly; never deletes a lock a real process holds, so can never trigger concurrent certbot runs. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ import tempfile
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
import fcntl
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
@@ -137,6 +138,69 @@ def validate_ip_address(ip_string):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Certbot uses fasteners (fcntl-based) to serialize concurrent invocations.
|
||||||
|
# When a previous certbot run is SIGKILLed mid-execution (container restart,
|
||||||
|
# OOM, manual kill), the kernel releases the fcntl lock automatically — but
|
||||||
|
# the LOCK FILE on disk persists. Subsequent runs sometimes report
|
||||||
|
# "Another instance of Certbot is already running" anyway, blocking SSL
|
||||||
|
# issuance until someone manually clears the files.
|
||||||
|
#
|
||||||
|
# Our hung-process scenario (observed 2026-05-09 during the bundling rollout):
|
||||||
|
# certbot from a previous attempt sat in defunct state holding the lock fd.
|
||||||
|
# Once the process eventually exited, the locks were physically removable but
|
||||||
|
# the symptoms persisted across multiple subsequent attempts.
|
||||||
|
#
|
||||||
|
# This helper probes each known lock path with fcntl.LOCK_NB. If we get the
|
||||||
|
# lock, no real process holds it and the file is stale — we delete it. If we
|
||||||
|
# DON'T get the lock, a real certbot is running and we leave it alone (so we
|
||||||
|
# never accidentally trigger concurrent certbot runs).
|
||||||
|
CERTBOT_LOCK_PATHS = (
|
||||||
|
'/etc/letsencrypt/.certbot.lock',
|
||||||
|
'/var/lib/letsencrypt/.certbot.lock',
|
||||||
|
'/var/log/letsencrypt/.certbot.lock',
|
||||||
|
)
|
||||||
|
|
||||||
|
def clear_stale_certbot_locks():
|
||||||
|
"""Remove stale certbot lock files. Safe to call before any ACME run.
|
||||||
|
Returns {'cleared': [paths...], 'held': [paths...]} for logging.
|
||||||
|
"""
|
||||||
|
cleared, held = [], []
|
||||||
|
for path in CERTBOT_LOCK_PATHS:
|
||||||
|
if not os.path.exists(path):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
fd = os.open(path, os.O_RDWR)
|
||||||
|
except FileNotFoundError:
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
held.append(f'{path} (open: {e})')
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||||
|
except BlockingIOError:
|
||||||
|
# A real process holds it; do not touch.
|
||||||
|
os.close(fd)
|
||||||
|
held.append(path)
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
# We hold the lock now. Release before unlinking so the lock
|
||||||
|
# state is clean if someone races us.
|
||||||
|
fcntl.flock(fd, fcntl.LOCK_UN)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
os.close(fd)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
os.remove(path)
|
||||||
|
cleared.append(path)
|
||||||
|
except FileNotFoundError:
|
||||||
|
cleared.append(path)
|
||||||
|
except Exception as e:
|
||||||
|
held.append(f'{path} (unlink: {e})')
|
||||||
|
return {'cleared': cleared, 'held': held}
|
||||||
|
|
||||||
def find_certbot_live_dir(base_domain):
|
def find_certbot_live_dir(base_domain):
|
||||||
"""Find the most recent certbot live directory for a domain.
|
"""Find the most recent certbot live directory for a domain.
|
||||||
Certbot creates -NNNN suffixed dirs for repeated requests."""
|
Certbot creates -NNNN suffixed dirs for repeated requests."""
|
||||||
@@ -403,6 +467,9 @@ def request_ssl():
|
|||||||
return jsonify({'status': 'error', 'message': 'Domain is required'}), 400
|
return jsonify({'status': 'error', 'message': 'Domain is required'}), 400
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Defensive: clear any stale lock left by a SIGKILLed prior run.
|
||||||
|
clear_stale_certbot_locks()
|
||||||
|
|
||||||
# Request Let's Encrypt certificate
|
# Request Let's Encrypt certificate
|
||||||
result = subprocess.run([
|
result = subprocess.run([
|
||||||
'certbot', 'certonly', '-n', '--standalone',
|
'certbot', 'certonly', '-n', '--standalone',
|
||||||
@@ -610,6 +677,9 @@ def request_ssl_bundle():
|
|||||||
cmd.extend(['-d', n])
|
cmd.extend(['-d', n])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Defensive: clear any stale lock left by a SIGKILLed prior run.
|
||||||
|
clear_stale_certbot_locks()
|
||||||
|
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
stderr_excerpt = (result.stderr or '').strip()[:800]
|
stderr_excerpt = (result.stderr or '').strip()[:800]
|
||||||
@@ -688,6 +758,9 @@ def request_ssl_bundle():
|
|||||||
def renew_certificates():
|
def renew_certificates():
|
||||||
"""Renew all certificates and reload HAProxy"""
|
"""Renew all certificates and reload HAProxy"""
|
||||||
try:
|
try:
|
||||||
|
# Defensive: clear any stale lock left by a SIGKILLed prior run.
|
||||||
|
clear_stale_certbot_locks()
|
||||||
|
|
||||||
# Run certbot renew
|
# Run certbot renew
|
||||||
result = subprocess.run([
|
result = subprocess.run([
|
||||||
'certbot', 'renew', '--quiet'
|
'certbot', 'renew', '--quiet'
|
||||||
@@ -2006,6 +2079,14 @@ def start_haproxy():
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
init_db()
|
init_db()
|
||||||
|
# Clear any stale certbot locks left from a previous container instance
|
||||||
|
# that didn't shut down cleanly. Safe — only removes locks that no live
|
||||||
|
# process holds (verified via fcntl probe).
|
||||||
|
_stale = clear_stale_certbot_locks()
|
||||||
|
if _stale['cleared']:
|
||||||
|
logger.info(f"Cleared stale certbot lock(s) at startup: {_stale['cleared']}")
|
||||||
|
if _stale['held']:
|
||||||
|
logger.warning(f"certbot lock(s) actively held at startup: {_stale['held']}")
|
||||||
certbot_register()
|
certbot_register()
|
||||||
generate_self_signed_cert(SSL_CERTS_DIR)
|
generate_self_signed_cert(SSL_CERTS_DIR)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user