From db78a36935e73c1d2401c23432a1b73d6f34cf5e Mon Sep 17 00:00:00 2001
From: "Claude (bootstrap)"
Date: Sun, 31 May 2026 11:13:57 -0700
Subject: [PATCH] sanitize-dont-refuse: strip dangerous symlinks via tar --exclude
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Shifts the sandbox's symlink handling from "refuse the whole tarball" to
"drop the dangerous entries from extraction and record them as quarantine
actions". This is what sandbox mode is supposed to do — make malicious
cpmoves safe to import rather than gate-keeping them.

Three coordinated changes:

1. scan-symlinks.php — exit 0 even when DANGEROUS findings exist. The
   JSON report is the source of truth; the caller decides what to do
   with it. Usage/IO errors still exit 2. STDERR still names each
   finding (now "STRIP X -> Y" instead of "refusing tarball") so the
   streamed [container] log on the panel side surfaces them.

2. extract.sh — reads the scan-symlinks report, builds a
   newline-delimited exclude list of DANGEROUS archive_paths, and passes
   it to `tar --exclude-from=`. The stripped entries never reach the
   filesystem; tar skips them silently. Also writes a small JSON sidecar
   at $EXTRACT_DIR/.cpanel-importer-stripped-symlinks.json describing
   each strip-action so the merge step can surface them in report.json
   without re-parsing scan-symlinks output.

3. entrypoint.sh write_report — reads the sidecar, prepends each
   stripped_dangerous_symlink action to the actions[] list, bumps
   files_quarantined by the strip-count, and rewrites
   summary_for_panel.alert_message to call them out distinctly:
   "N dangerous symlink(s) stripped during extract; M files quarantined;
   K cleaned in place. Customer site may have been compromised at the
   source — recommend review."

Result on darkside: instead of the import failing on the ALFA
alfasymlink/root entry, that entry is silently skipped during extract,
recorded as `stripped_dangerous_symlink path=...
target=/ reason=absolute target is root /`, and the rest of the tarball
extracts normally. Subsequent ClamAV scan + DB sanitization run to
completion; panel sees a verdict-completed import with the stripped
symlinks visible in the Sanitization Sandbox panel on the results page.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 scripts/entrypoint.sh         | 54 ++++++++++++++++------
 scripts/extract.sh            | 87 ++++++++++++++++++++++++++++-------
 scripts/lib/scan-symlinks.php | 22 +++++----
 3 files changed, 125 insertions(+), 38 deletions(-)

diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index d3125a3..42d98f8 100755
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -159,15 +159,18 @@ rsync -a --no-owner --no-group --no-perms --chmod=Du=rwx,Dg=rx,Do=,Fu=rw,Fg=r,Fo
 STAGE="write_report"
 log "stage: write_report"
 DURATION=$(( $(date -u +%s) - START_TS ))
+STRIPPED_SYMLINKS_FILE="$EXTRACT_DIR/.cpanel-importer-stripped-symlinks.json"
 php -r '
-$importId  = $argv[1];
-$duration  = (int) $argv[2];
-$filesPath = $argv[3];
-$dbsPath   = $argv[4];
-$outPath   = $argv[5];
+$importId     = $argv[1];
+$duration     = (int) $argv[2];
+$filesPath    = $argv[3];
+$dbsPath      = $argv[4];
+$strippedPath = $argv[5];
+$outPath      = $argv[6];
 
-$files = is_file($filesPath) ? json_decode(file_get_contents($filesPath), true) : null;
-$dbs   = is_file($dbsPath)   ? json_decode(file_get_contents($dbsPath),   true) : null;
+$files    = is_file($filesPath)    ? json_decode(file_get_contents($filesPath),    true) : null;
+$dbs      = is_file($dbsPath)      ? json_decode(file_get_contents($dbsPath),      true) : null;
+$stripped = is_file($strippedPath) ? json_decode(file_get_contents($strippedPath), true) : null;
 
 $filesScanned     = $files["files_scanned"]     ?? 0;
 $filesClean       = $files["files_clean"]       ?? 0;
@@ -176,6 +179,18 @@ $filesQuarantined = $files["files_quarantined"] ?? 0;
 $actions          = $files["actions"]           ?? [];
 $databases        = $dbs["databases"]           ?? [];
 
+// Prepend the stripped-symlinks actions from extract.sh so the operator
+// sees them at the top of the actions[] table on the results page. Bumps
+// files_quarantined because a strip is morally equivalent to a quarantine:
+// the entry is in the archive but absent from the cleaned tree. A sidecar
+// whose "actions" is missing or not a list counts as nothing stripped.
+$strippedActions = is_array($stripped["actions"] ?? null) ? array_values($stripped["actions"]) : [];
+$strippedCount   = count($strippedActions);
+if ($strippedCount > 0) {
+    $actions = array_merge($strippedActions, $actions);
+    $filesQuarantined += $strippedCount;
+}
+
 $dbRefused = 0;
 foreach ($databases as $db) {
     if (($db["imported_into_new_server"] ?? true) === false) $dbRefused++;
@@ -184,13 +199,24 @@ foreach ($databases as $db) {
 $severity = "info";
 $alert    = false;
 $msg      = "Sanitization clean: no malware signatures detected.";
-if ($filesQuarantined > 0 || $dbRefused > 0) {
+if ($filesQuarantined > 0 || $dbRefused > 0 || $strippedCount > 0) {
     $alert = true;
-    $severity = ($filesQuarantined > 50 || $dbRefused > 0) ? "warning" : "info";
-    $msg = sprintf(
-        "%d files quarantined + %d cleaned in place; %d database(s) refused as compromised. Customer site may have been compromised at the source — recommend review.",
-        $filesQuarantined, $filesCleaned, $dbRefused
-    );
+    $severity = ($filesQuarantined > 50 || $dbRefused > 0 || $strippedCount > 0) ? "warning" : "info";
+    $parts = [];
+    if ($strippedCount > 0) {
+        $parts[] = sprintf("%d dangerous symlink(s) stripped during extract", $strippedCount);
+    }
+    if ($filesQuarantined - $strippedCount > 0) {
+        $parts[] = sprintf("%d files quarantined", $filesQuarantined - $strippedCount);
+    }
+    if ($filesCleaned > 0) {
+        $parts[] = sprintf("%d cleaned in place", $filesCleaned);
+    }
+    if ($dbRefused > 0) {
+        $parts[] = sprintf("%d database(s) refused as compromised", $dbRefused);
+    }
+    $msg = implode("; ", $parts)
+        . ". Customer site may have been compromised at the source — recommend review.";
 }
 
 $report = [
@@ -212,7 +238,7 @@ $report = [
 
 file_put_contents($outPath, json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . "\n");
 fprintf(STDERR, "report written: %s\n", $outPath);
-' "$IMPORT_ID" "$DURATION" /tmp/scan-files-report.json /tmp/scan-dbs-report.json "$SANITIZED_DIR/report.json" \
+' "$IMPORT_ID" "$DURATION" /tmp/scan-files-report.json /tmp/scan-dbs-report.json "$STRIPPED_SYMLINKS_FILE" "$SANITIZED_DIR/report.json" \
     || die "report merge failed"
 
 log "done — exited cleanly after ${DURATION}s"
diff --git a/scripts/extract.sh b/scripts/extract.sh
index a3ba98c..07cd824 100755
--- a/scripts/extract.sh
+++ b/scripts/extract.sh
@@ -1,19 +1,25 @@
 #!/usr/bin/env bash
 #
-# extract.sh — pre-extract symlink scan + cpmove untar.
+# extract.sh — symlink scan + sanitized cpmove untar.
 #
-# Usage: extract.sh <tarball> <dest> <username>
+# Usage: extract.sh <tarball> <dest> <username> [<actions_out>]
 #
-# Calls scripts/lib/scan-symlinks.php first; if it reports any DANGEROUS
-# findings we abort BEFORE tar runs (per spec §0 step 2). On clean,
-# extracts with the same hardening flags CpanelBackupImporter::extractBackup
-# uses on the panel today (see web-files/libs/CpanelBackupImporter.php).
+# Calls scripts/lib/scan-symlinks.php first, then untars the cpmove with
+# every DANGEROUS-classified symlink entry stripped via tar --exclude.
+# The stripped-symlinks list is written as JSON to <actions_out> (default
+# $DEST/.cpanel-importer-stripped-symlinks.json) so the merge step in
+# entrypoint.sh can fold the stripped entries into report.json's actions[].
+#
+# Sandbox-mode posture: never refuse. ALFA-class root symlinks and other
+# DANGEROUS entries are silently excluded from extraction; the panel sees
+# them as quarantine actions on the results page instead of an import abort.
set -euo pipefail -TARBALL="${1:?usage: extract.sh }" -DEST="${2:?usage: extract.sh }" -USERNAME="${3:?usage: extract.sh }" +TARBALL="${1:?usage: extract.sh []}" +DEST="${2:?usage: extract.sh []}" +USERNAME="${3:?usage: extract.sh []}" +ACTIONS_OUT="${4:-${DEST}/.cpanel-importer-stripped-symlinks.json}" ts() { date -u +'%Y-%m-%dT%H:%M:%SZ'; } log() { printf '[%s] extract: %s\n' "$(ts)" "$*"; } @@ -29,15 +35,56 @@ if ! php /scripts/lib/scan-symlinks.php \ --tarball "$TARBALL" \ --username "$USERNAME" \ --report "$SYMLINK_REPORT"; then - log "scan-symlinks.php exited non-zero" + log "scan-symlinks.php exited with usage/IO error; aborting (this is not a sanitize-able state)" cat "$SYMLINK_REPORT" >&2 || true - log "ABORT: tarball contains dangerous symlinks; aborting" - # Propagate the report on stdout so entrypoint.sh can include it - # in the failure record. exit 3 fi -log "symlink scan clean (no DANGEROUS findings)" +# --- compute exclude list from dangerous findings ------------------------- + +# Build a newline-delimited list of archive_path strings for tar --exclude- +# from. Also build a JSON actions[] array so entrypoint.sh's merge step can +# fold the strip-actions into report.json without re-parsing scan-symlinks. 
+EXCLUDES_FILE=$(mktemp -p /tmp tar-excludes.XXXXXX) +DANGEROUS_COUNT=$(python3 - "$SYMLINK_REPORT" "$EXCLUDES_FILE" "$ACTIONS_OUT" <<'PY' +import json, sys +src, excl_path, actions_path = sys.argv[1], sys.argv[2], sys.argv[3] +try: + with open(src) as fh: + r = json.load(fh) +except Exception as e: + sys.stderr.write(f"failed to parse scan-symlinks report: {e}\n") + print(0) + sys.exit(0) + +dangerous = [f for f in r.get('findings', []) if f.get('type') == 'DANGEROUS'] +with open(excl_path, 'w') as eh: + for f in dangerous: + p = f.get('archive_path', '') + if p: + eh.write(p + '\n') + +actions = [ + { + 'action': 'stripped_dangerous_symlink', + 'path': f.get('archive_path', ''), + 'target': f.get('target', ''), + 'reason': f.get('reason', ''), + } + for f in dangerous +] +with open(actions_path, 'w') as ah: + json.dump({'actions': actions, 'count': len(actions)}, ah, indent=2) +print(len(dangerous)) +PY +) + +if [[ "$DANGEROUS_COUNT" -gt 0 ]]; then + log "stripping $DANGEROUS_COUNT dangerous symlink(s) via tar --exclude-from" + while IFS= read -r path; do + log " STRIP: $path" + done < "$EXCLUDES_FILE" +fi # --- extract -------------------------------------------------------------- @@ -56,9 +103,17 @@ log "extracting with hardened tar flags into $DEST" # uid/perm bits so the cpmove can't drop setuid binaries at us. # --no-overwrite-dir: refuse to clobber existing directory metadata, # closing one historical tar-symlink-escape vector. +# --exclude-from=$EXCLUDES_FILE: strip every DANGEROUS-classified +# symlink (target = /, /etc, /root, /boot, /proc, /sys, /dev). +# Empty file = no-op exclude. tar's --exclude pattern matching +# uses fnmatch but our archive_path entries don't contain glob +# metacharacters (they came verbatim from `tar -tvf`), so the +# match is effectively a literal-path skip. # --absolute-names is NOT used — leading / in a member name is stripped. 
cd "$DEST" -tar --no-same-owner --no-same-permissions --no-overwrite-dir $TAR_FLAGS "$TARBALL" +tar --no-same-owner --no-same-permissions --no-overwrite-dir \ + --exclude-from="$EXCLUDES_FILE" \ + $TAR_FLAGS "$TARBALL" -log "extracted OK ($(find "$DEST" -type f | wc -l) files)" +log "extracted OK ($(find "$DEST" -type f | wc -l) files; $DANGEROUS_COUNT symlinks stripped)" exit 0 diff --git a/scripts/lib/scan-symlinks.php b/scripts/lib/scan-symlinks.php index 3d36f68..43da433 100644 --- a/scripts/lib/scan-symlinks.php +++ b/scripts/lib/scan-symlinks.php @@ -9,9 +9,13 @@ * gate without dragging in the rest of the importer. * * Exit codes: - * 0 — clean (no DANGEROUS findings) - * 1 — one or more DANGEROUS findings; tarball MUST NOT be extracted - * 2 — usage / I/O error + * 0 — scan completed successfully (with or without DANGEROUS findings). + * Findings are recorded in --report; extract.sh inspects the report + * to decide which entries to --exclude from `tar -xzf`. Sandbox-mode + * posture is "sanitize, don't refuse" — the container drops the + * dangerous symlinks from extraction and records the actions in + * report.json instead of aborting the whole import. + * 2 — usage / I/O error (couldn't read tarball, couldn't write report). * * Always writes a JSON report to --report describing every absolute-target * symlink seen and the classification verdict. @@ -181,15 +185,17 @@ $report = [ @file_put_contents($reportPath, json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . "\n"); +// Sandbox-mode posture: never refuse. Log every DANGEROUS finding to +// stderr so the panel sees them in the streamed [container] log, and let +// extract.sh inspect --report to decide which entries to exclude from +// the tar untar. Caller treats exit 0 as "scan completed; consult report". 
if ($dangerousCount > 0) { - fwrite(STDERR, "scan-symlinks: $dangerousCount DANGEROUS finding(s); refusing tarball\n"); + fwrite(STDERR, "scan-symlinks: $dangerousCount DANGEROUS finding(s) will be stripped during extract\n"); foreach ($findings as $f) { if ($f['type'] === 'DANGEROUS') { - fwrite(STDERR, sprintf(" %s -> %s (%s)\n", $f['archive_path'], $f['target'], $f['reason'])); + fwrite(STDERR, sprintf(" STRIP %s -> %s (%s)\n", $f['archive_path'], $f['target'], $f['reason'])); } } - exit(1); } - -fwrite(STDERR, "scan-symlinks: clean (uncertain=$uncertainCount, dangerous=0)\n"); +fwrite(STDERR, "scan-symlinks: scan complete (uncertain=$uncertainCount, dangerous=$dangerousCount)\n"); exit(0);