Initial bootstrap: cpanel-importer sanitization sandbox
Skeleton for the cpanel-importer Docker container — a one-shot sandbox the WHP panel invokes BEFORE extracting a customer cpmove tarball. See cpanel-import-container-spec.md (in /workspace/) for the full design. What this ships in v1.0: - Dockerfile: almalinux:10-minimal + PHP 8.4 (Remi) + ClamAV 1.4 + SaneSecurity Foxhole.PHP rules + tar/mariadb-client/rsync. Runs as UID 999 (whp-import) via the panel-side --user 999:999 flag. - scripts/entrypoint.sh: validates env, runs (optional) freshclam, drives extract -> scan-files -> scan-dbs -> rsync -> report.json. - scripts/extract.sh + scripts/lib/scan-symlinks.php: pre-extract symlink scan ported standalone from web-files/libs/CpanelBackupImporter.php (the existing 2026-05-29 whp02 destruction-vector fix). Aborts with exit 3 before tar runs if any DANGEROUS symlink is found. - scripts/scan-files.php: ClamAV walk + classify-and-action. v1.0 ships with an empty cleaner registry — every hit is QUARANTINE_ONLY. Cleaner hooks are stubbed for v1.1. - scripts/scan-dbs.php: regex MyISAM -> InnoDB rewrite (always applied), WordPress identification, and ONE WP content scan check (siteurl_external_domain). v1.1 will grow the check set. - scripts/lib/safety-net.php: container-narrow open_basedir allow-list, much tighter than the panel-side one. - .gitea/workflows/build-push.yaml: builds + smoke-tests + PHP-syntax-checks + bash-syntax-checks before pushing to repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer. - tests/build-fixtures.sh: builds cpmove-clean.tar.gz (benign WP dump) and cpmove-alfa.tar.gz (the ALFA-shell symlink-to-/etc vector) for local end-to-end testing. - README.md / CONTRIBUTING.md: docker-run invocation, bind-mount catalog, report.json schema, how to add a cleaner pattern or a WP scan signature. 
Local acceptance test results: - clean fixture -> status=completed, 3 MyISAM->InnoDB, no flags, exit 0 - ALFA fixture -> exit 1, status=failed, failed_stage=extract, "tarball contains dangerous symlinks; aborting" on stderr - compromised-siteurl fixture -> imported_into_new_server=false, .flagged file written, summary_for_panel.show_alert=true Image size: 197 MB compressed (gzipped docker save), ~397 MB unique layers extracted. Well under the spec's 600 MB compressed / 1.2 GB extracted budget. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
219
scripts/entrypoint.sh
Executable file
219
scripts/entrypoint.sh
Executable file
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# entrypoint.sh — main controller for the cpanel-importer sandbox.
|
||||
#
|
||||
# Inputs (env, set by the panel's docker run):
|
||||
# IMPORT_ID unique id for this run; used in quarantine + report paths
|
||||
# IMPORT_USERNAME cPanel/WHP username the cpmove belongs to
|
||||
# IMPORT_BACKUP_FILE absolute path inside the container, typically
|
||||
# /host/backup/cpmove-<user>.tar.gz
|
||||
# CLAMAV_REFRESH "true" to run freshclam at start (default: true)
|
||||
#
|
||||
# Flow (spec §0):
|
||||
# 1. validate env
|
||||
# 2. (optional) refresh ClamAV signatures
|
||||
# 3. extract → /tmp/extract/
|
||||
# 4. file scan → /tmp/scan-files-report.json
|
||||
# 5. DB sanitize → /tmp/sanitized/mysql/, /tmp/scan-dbs-report.json
|
||||
# 6. rsync /tmp/sanitized/ → /host/sanitized/<importid>/
|
||||
# 7. write /host/sanitized/<importid>/report.json (merged)
|
||||
#
|
||||
# On failure at any stage we still write a partial report.json with
|
||||
# status="failed" + the stage that broke, then exit non-zero.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# --- logging ---------------------------------------------------------------
|
||||
|
||||
# ts — print the current UTC time as an ISO-8601 timestamp (second precision).
ts() {
  date -u +'%Y-%m-%dT%H:%M:%SZ'
}

# log MESSAGE... — print one timestamped progress line on stdout.
log() {
  printf '[%s] %s\n' "$(ts)" "$*"
}

# die MESSAGE... — report a fatal error and terminate the run with exit 1.
#
# The FATAL line goes to stderr so it is not lost when stdout and stderr are
# captured separately. The current $STAGE (or "unknown" if it has not been
# set yet, which keeps `set -u` from masking the real error) is handed to
# write_failure_report so a partial report.json can be persisted. A failing
# report writer must never change the exit status, hence the `|| true`.
die() {
  log "FATAL: $*" >&2
  write_failure_report "${STAGE:-unknown}" "$*" || true
  exit 1
}
|
||||
|
||||
# Buffered partial state. The final report.json is written by the merge
|
||||
# step (see write_final_report); if we crash before then, write_failure_report
|
||||
# emits whatever partial pieces exist.
|
||||
STAGE="init"
|
||||
START_TS="$(date -u +%s)"
|
||||
|
||||
# _json_string VALUE — print VALUE as a JSON string literal, quotes included.
#
# Encoding is delegated to PHP's json_encode. JSON_INVALID_UTF8_SUBSTITUTE
# keeps json_encode from returning false (which would print nothing) on
# malformed UTF-8. If PHP is unavailable or the output is not a quoted
# string, the placeholder "(unencodable)" is emitted instead, so the caller
# always receives a syntactically valid JSON value.
_json_string() {
  local raw="$1"
  local encoded=""
  encoded="$(printf '%s' "$raw" \
    | php -r 'echo json_encode(stream_get_contents(STDIN), JSON_INVALID_UTF8_SUBSTITUTE);' 2>/dev/null)" \
    || encoded=""
  if [[ "$encoded" != \"*\" ]]; then
    encoded='"(unencodable)"'
  fi
  printf '%s' "$encoded"
}

# write_failure_report STAGE MESSAGE — persist a partial report.json with
# status="failed" under /host/sanitized/<import id>/.
#
# Globals:   IMPORT_ID (read, optional), START_TS (read, optional)
# Arguments: $1 - name of the stage that failed
#            $2 - human-readable error message
# Returns:   always 0. mkdir and the write can both fail (mount RO, missing
#            /host/sanitized, etc.); each failure is logged to stderr and
#            never aborts the script.
write_failure_report() {
  local stage="$1"
  local msg="$2"
  local import_id="${IMPORT_ID:-unknown}"

  # The id is used as a directory name. Never let a malformed id (one that
  # contains a slash or is a dot entry) steer the report outside
  # /host/sanitized; the raw id is still recorded inside the JSON.
  local dir_name="$import_id"
  if [[ "$dir_name" == */* || "$dir_name" == "." || "$dir_name" == ".." ]]; then
    dir_name="unknown"
  fi
  local out_dir="/host/sanitized/${dir_name}"

  local now elapsed
  now="$(date -u +%s)"
  elapsed=$(( now - ${START_TS:-now} ))

  if ! mkdir -p -- "$out_dir" 2>/dev/null; then
    log "WARN: failure-report mkdir failed for $out_dir; report will not be persisted" >&2
    return 0
  fi

  # Every interpolated string is JSON-encoded; none is pasted in raw.
  local id_json stage_json msg_json
  id_json="$(_json_string "$import_id")"
  stage_json="$(_json_string "$stage")"
  msg_json="$(_json_string "$msg")"

  if ! {
    printf '{\n  "import_id": %s,\n  "status": "failed",\n  "failed_stage": %s,\n  "error": %s,\n  "scan_duration_seconds": %d,\n  "files": null,\n  "databases": null\n}\n' \
      "$id_json" "$stage_json" "$msg_json" "$elapsed" > "$out_dir/report.json"
  } 2>/dev/null; then
    log "WARN: failure-report write failed for $out_dir/report.json" >&2
  fi
  return 0
}
|
||||
|
||||
# --- env validation --------------------------------------------------------
|
||||
|
||||
STAGE="validate_env"
log "cpanel-importer starting (container UID=$(id -u) GID=$(id -g))"

# Required inputs. These are checked explicitly and routed through die
# (rather than via ${VAR:?...}, which exits the shell immediately) so that a
# missing variable still produces the failure report.json.
for required_var in IMPORT_ID IMPORT_USERNAME IMPORT_BACKUP_FILE; do
  if [[ -z "${!required_var:-}" ]]; then
    die "$required_var env var is required"
  fi
done
unset required_var

# IMPORT_ID becomes a directory name under /host/quarantine and
# /host/sanitized, so it must be a single path component. It is blanked
# before dying so the failure report lands under ".../unknown" instead of a
# path shaped by the rejected value.
if [[ "$IMPORT_ID" == */* || "$IMPORT_ID" == "." || "$IMPORT_ID" == ".." ]]; then
  rejected_import_id="$IMPORT_ID"
  IMPORT_ID=""
  die "IMPORT_ID must be a single path component (got: $rejected_import_id)"
fi

CLAMAV_REFRESH="${CLAMAV_REFRESH:-true}"

log "import_id=$IMPORT_ID username=$IMPORT_USERNAME backup=$IMPORT_BACKUP_FILE"

if [[ ! -f "$IMPORT_BACKUP_FILE" ]]; then
  die "backup file does not exist or is not a regular file: $IMPORT_BACKUP_FILE"
fi

# Make sure the output dirs exist (they're bind mounts, so we trust the
# host to have created them, but mkdir -p is harmless).
QUARANTINE_DIR="/host/quarantine/$IMPORT_ID"
SANITIZED_DIR="/host/sanitized/$IMPORT_ID"
mkdir -p -- "$QUARANTINE_DIR" "$SANITIZED_DIR" \
  || die "cannot create quarantine/sanitized output dirs (are the bind mounts RW?)"

# Container-internal scratch space (mounted as tmpfs by the panel). A failure
# here goes through die as well, so it is reported instead of tripping set -e.
EXTRACT_DIR="/tmp/extract"
WORK_DIR="/tmp/sanitized"
mkdir -p -- "$EXTRACT_DIR" "$WORK_DIR/mysql" \
  || die "cannot create scratch dirs under /tmp (is the tmpfs mounted and writable?)"
|
||||
|
||||
# --- refresh ClamAV signatures --------------------------------------------
|
||||
|
||||
# --- stage: freshclam -------------------------------------------------------
# Signature refresh is best-effort: with no outbound network the run carries
# on using the signature DB baked in at image build time.
STAGE="freshclam"
case "$CLAMAV_REFRESH" in
  true)
    log "refreshing ClamAV signatures (freshclam)"
    freshclam --no-warnings >/tmp/freshclam.log 2>&1 || {
      log "WARN: freshclam failed; proceeding with build-time signature DB"
      tail -20 /tmp/freshclam.log || true
    }
    ;;
  *)
    log "CLAMAV_REFRESH=false; skipping freshclam"
    ;;
esac

# --- stage: extract ---------------------------------------------------------
# extract.sh runs the pre-extract symlink gate and then untars the cpmove.
STAGE="extract"
log "stage: extract"
/scripts/extract.sh "$IMPORT_BACKUP_FILE" "$EXTRACT_DIR" "$IMPORT_USERNAME" \
  || die "extract.sh failed; see stderr above"

# --- stage: scan_files ------------------------------------------------------
# ClamAV scan plus auto-clean/quarantine over the extracted tree.
STAGE="scan_files"
log "stage: scan_files"
scan_files_args=(
  --extract "$EXTRACT_DIR"
  --quarantine "$QUARANTINE_DIR"
  --report /tmp/scan-files-report.json
  --import-id "$IMPORT_ID"
)
if ! php /scripts/scan-files.php "${scan_files_args[@]}"; then
  die "scan-files.php failed; see stderr above"
fi

# --- stage: scan_dbs --------------------------------------------------------
# DB engine swap plus the WordPress content scan.
STAGE="scan_dbs"
log "stage: scan_dbs"
scan_dbs_args=(
  --extract "$EXTRACT_DIR"
  --out "$WORK_DIR/mysql"
  --final-prefix "$SANITIZED_DIR/mysql"
  --report /tmp/scan-dbs-report.json
  --import-id "$IMPORT_ID"
  --username "$IMPORT_USERNAME"
)
if ! php /scripts/scan-dbs.php "${scan_dbs_args[@]}"; then
  die "scan-dbs.php failed; see stderr above"
fi

# --- stage: rsync_out -------------------------------------------------------
# Publish the cleaned tree. Files that scan-files.php quarantined were moved
# out of the extract dir by the scanner, so what is left is the cleaned tree
# by construction. Ownership and archive permission bits are not carried
# over; a fixed mode is applied instead.
STAGE="rsync_out"
log "stage: rsync_out"
rsync_opts=(
  -a --no-owner --no-group --no-perms
  "--chmod=Du=rwx,Dg=rx,Do=,Fu=rw,Fg=r,Fo="
)
if ! rsync "${rsync_opts[@]}" "$EXTRACT_DIR"/ "$SANITIZED_DIR/extracted/"; then
  die "rsync to sanitized dir failed"
fi

# The cleaned .sql files are published next to the extracted tree.
if ! rsync "${rsync_opts[@]}" "$WORK_DIR/mysql"/ "$SANITIZED_DIR/mysql/"; then
  die "rsync of cleaned .sql files failed"
fi
|
||||
|
||||
# --- merge per-stage reports into the final report.json -------------------
|
||||
|
||||
STAGE="write_report"
log "stage: write_report"
DURATION=$(( $(date -u +%s) - START_TS ))

# Merge the two per-stage reports into the final report.json.
#
# The inline PHP exits non-zero (which triggers die below) when a stage
# report is missing or undecodable, when the merged report cannot be
# encoded, or when the output file cannot be written. A "completed" verdict
# is never fabricated from absent scan data.
#
# NOTE: the PHP below sits inside a single-quoted shell string, so it must
# not contain a single-quote character anywhere, comments included.
php -r '
[$importId, $duration, $filesPath, $dbsPath, $outPath] = array_slice($argv, 1, 5);
$duration = (int) $duration;

// Load one per-stage report; anything other than a JSON object/array is fatal.
$load = function (string $label, string $path): array {
    $raw  = is_file($path) ? file_get_contents($path) : false;
    $data = ($raw === false) ? null : json_decode($raw, true);
    if (!is_array($data)) {
        fprintf(STDERR, "report merge: %s report missing or not valid JSON: %s\n", $label, $path);
        exit(1);
    }
    return $data;
};

$files = $load("scan-files", $filesPath);
$dbs   = $load("scan-dbs", $dbsPath);

$filesScanned     = $files["files_scanned"] ?? 0;
$filesClean       = $files["files_clean"] ?? 0;
$filesCleaned     = $files["files_cleaned"] ?? 0;
$filesQuarantined = $files["files_quarantined"] ?? 0;
$actions          = $files["actions"] ?? [];
$databases        = is_array($dbs["databases"] ?? null) ? $dbs["databases"] : [];

// A database counts as refused only when the scanner explicitly said so.
$dbRefused = 0;
foreach ($databases as $db) {
    if (is_array($db) && ($db["imported_into_new_server"] ?? true) === false) {
        $dbRefused++;
    }
}

$severity = "info";
$alert    = false;
$msg      = "Sanitization clean: no malware signatures detected.";
if ($filesQuarantined > 0 || $dbRefused > 0) {
    $alert    = true;
    $severity = ($filesQuarantined > 50 || $dbRefused > 0) ? "warning" : "info";
    $msg = sprintf(
        "%d files quarantined + %d cleaned in place; %d database(s) refused as compromised. Customer site may have been compromised at the source — recommend review.",
        $filesQuarantined, $filesCleaned, $dbRefused
    );
}

$report = [
    "import_id"             => $importId,
    "status"                => "completed",
    "scan_duration_seconds" => $duration,
    "files_scanned"         => $filesScanned,
    "files_clean"           => $filesClean,
    "files_cleaned"         => $filesCleaned,
    "files_quarantined"     => $filesQuarantined,
    "actions"               => $actions,
    "databases"             => $databases,
    "summary_for_panel"     => [
        "show_alert"     => $alert,
        "alert_severity" => $severity,
        "alert_message"  => $msg,
    ],
];

$json = json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
if ($json === false) {
    fprintf(STDERR, "report merge: json_encode failed: %s\n", json_last_error_msg());
    exit(1);
}
if (file_put_contents($outPath, $json . "\n") === false) {
    fprintf(STDERR, "report merge: cannot write %s\n", $outPath);
    exit(1);
}
fprintf(STDERR, "report written: %s\n", $outPath);
' "$IMPORT_ID" "$DURATION" /tmp/scan-files-report.json /tmp/scan-dbs-report.json "$SANITIZED_DIR/report.json" \
  || die "report merge failed"

log "done — exited cleanly after ${DURATION}s"
exit 0
|
||||
64
scripts/extract.sh
Executable file
64
scripts/extract.sh
Executable file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# extract.sh — pre-extract symlink scan + cpmove untar.
|
||||
#
|
||||
# Usage: extract.sh <tarball> <dest> <username>
|
||||
#
|
||||
# Calls scripts/lib/scan-symlinks.php first; if it reports any DANGEROUS
|
||||
# findings we abort BEFORE tar runs (per spec §0 step 2). On clean,
|
||||
# extracts with the same hardening flags CpanelBackupImporter::extractBackup
|
||||
# uses on the panel today (see web-files/libs/CpanelBackupImporter.php).
|
||||
|
||||
set -euo pipefail

TARBALL="${1:?usage: extract.sh <tarball> <dest> <username>}"
DEST="${2:?usage: extract.sh <tarball> <dest> <username>}"
USERNAME="${3:?usage: extract.sh <tarball> <dest> <username>}"

# ts — current UTC time as an ISO-8601 timestamp.
ts() { date -u +'%Y-%m-%dT%H:%M:%SZ'; }
# log MESSAGE... — timestamped progress line on stdout, tagged "extract:".
log() { printf '[%s] extract: %s\n' "$(ts)" "$*"; }

# Exit codes of this script:
#   0 - extracted OK
#   2 - tarball missing, or the symlink scan itself could not be completed
#   3 - symlink scan reported DANGEROUS findings; tar was never run
[[ -f "$TARBALL" ]] || { log "tarball not found: $TARBALL"; exit 2; }
mkdir -p -- "$DEST"

# --- pre-extract symlink scan ---------------------------------------------

log "scanning tarball for dangerous symlinks (cpmove vector check)"
SYMLINK_REPORT="$(mktemp -p /tmp scan-symlinks.XXXXXX.json)"
# The report is scratch data; remove it on every exit path.
trap 'rm -f -- "$SYMLINK_REPORT"' EXIT

# Capture the scanner status without tripping set -e. scan-symlinks.php
# documents 1 as "DANGEROUS findings" and 2 as "usage / I/O error"; the two
# must not be reported as the same thing.
scan_rc=0
php /scripts/lib/scan-symlinks.php \
  --tarball "$TARBALL" \
  --username "$USERNAME" \
  --report "$SYMLINK_REPORT" || scan_rc=$?

if (( scan_rc != 0 )); then
  log "scan-symlinks.php exited non-zero (status $scan_rc)"
  # Echo the report on stderr so it shows up next to the failure in the
  # container output.
  cat -- "$SYMLINK_REPORT" >&2 || true
  if (( scan_rc == 1 )); then
    log "ABORT: tarball contains dangerous symlinks; aborting"
    exit 3
  fi
  # Any other status means the gate did not run to completion. Fail closed:
  # an unscanned tarball is never extracted.
  log "ABORT: symlink scan could not be completed; refusing to extract"
  exit 2
fi

log "symlink scan clean (no DANGEROUS findings)"

# --- extract --------------------------------------------------------------

log "extracting with hardened tar flags into $DEST"
# Hardening flags (mirrored from CpanelBackupImporter::extractBackup):
#   --no-same-owner / --no-same-permissions: drop archive-recorded
#       uid/perm bits so the cpmove can't drop setuid binaries at us.
#   --no-overwrite-dir: refuse to clobber existing directory metadata,
#       closing one historical tar-symlink-escape vector.
#   --absolute-names is NOT used — leading / in a member name is stripped.
#
# No compression flag is passed: GNU tar detects gzip/bzip2/xz itself when
# reading an archive file. The symlink scan lists the archive the same way
# (plain `tar -tvf`), so scanner and extractor interpret the tarball
# identically even when its file name suffix is misleading.
cd -- "$DEST"
tar --no-same-owner --no-same-permissions --no-overwrite-dir -xf "$TARBALL"

log "extracted OK ($(find "$DEST" -type f | wc -l) files)"
exit 0
|
||||
46
scripts/lib/safety-net.php
Normal file
46
scripts/lib/safety-net.php
Normal file
@@ -0,0 +1,46 @@
|
||||
<?php
|
||||
/**
|
||||
* safety-net.php — container-narrow open_basedir allow-list.
|
||||
*
|
||||
* The sibling at /workspace/whp/web-files/includes/safety-net.php is the
|
||||
* panel's allow-list — it includes /docker, /root/whp, /etc/whp, etc.,
|
||||
* because the panel legitimately reads from those.
|
||||
*
|
||||
* Inside this container, the worker has a much smaller set of paths it
|
||||
* needs. Anything outside this list is blocked at the PHP filesystem-
|
||||
* function level (PHP enforces open_basedir in unlink/scandir/fopen/
|
||||
* RecursiveDirectoryIterator/etc. AFTER symlink resolution, so a planted
|
||||
* symlink-to-/proc cannot escape the allow-list).
|
||||
*
|
||||
* HISTORY — the same destruction-bug class that motivated the panel-side
|
||||
* safety-net (whp02 /usr/bin + /etc wipe, 2026-05-28/29) is the reason
|
||||
* this exists. In the container the host /etc /usr /root are not bind-
|
||||
* mounted, but open_basedir gives belt-and-suspenders enforcement
|
||||
* against any extracted-archive symlink walker we add later.
|
||||
*/
|
||||
|
||||
if (function_exists('ini_set')) {
    // Container-internal paths only. Everything not listed is off limits,
    // in particular:
    //   - /usr, /root, and all of /etc and /var except the ClamAV entries below
    //   - /docker — there is no /docker in this image
    //   - /home   — there is no /home in this image
    $allowed = implode(PATH_SEPARATOR, [
        '/host',               // /host/backup (RO), /host/quarantine, /host/sanitized
        '/tmp',                // tmpfs scratch space
        '/opt/whp',            // WORKDIR + per-run state
        '/scripts',            // our own code
        '/var/lib/clamav',     // ClamAV signature DB
        '/var/log/clamav',     // freshclam log
        '/etc/freshclam.conf', // single file, read-only
        '/proc/self',          // pid/cgroup introspection
    ]);

    // Only install the allow-list when php.ini has not already set one.
    if ((string) ini_get('open_basedir') === '') {
        if (@ini_set('open_basedir', $allowed) === false) {
            // A safety net that silently failed to deploy is worse than none:
            // say so loudly, but do not abort the worker over it.
            error_log('safety-net: WARNING could not apply open_basedir allow-list; filesystem safety net is NOT active');
        }
    }

    // NOTE(review): realpath_cache_size and realpath_cache_ttl are
    // INI_SYSTEM directives, so the ini_set() calls that used to live here
    // could not take effect at runtime and have been removed. If realpath
    // cache tuning is wanted, set it in the image's php.ini — and confirm
    // whether PHP keeps the realpath cache active at all once open_basedir
    // is in force.
}
|
||||
161
scripts/lib/scan-symlinks.php
Normal file
161
scripts/lib/scan-symlinks.php
Normal file
@@ -0,0 +1,161 @@
|
||||
<?php
|
||||
/**
|
||||
* scan-symlinks.php — standalone port of
|
||||
* CpanelBackupImporter::scanTarballForDangerousSymlinks().
|
||||
*
|
||||
* This is the same classification logic that ships in the WHP panel today
|
||||
* (web-files/libs/CpanelBackupImporter.php, ~line 2438). Lifted into a
|
||||
* standalone CLI so the container can run it as an independent pre-extract
|
||||
* gate without dragging in the rest of the importer.
|
||||
*
|
||||
* Exit codes:
|
||||
* 0 — clean (no DANGEROUS findings)
|
||||
* 1 — one or more DANGEROUS findings; tarball MUST NOT be extracted
|
||||
* 2 — usage / I/O error
|
||||
*
|
||||
* Always writes a JSON report to --report describing every absolute-target
|
||||
* symlink seen and the classification verdict.
|
||||
*
|
||||
* SECURITY NOTE — this differs from the panel implementation in ONE way:
|
||||
* The panel uses file_exists($target) on the *host* to decide whether a
|
||||
* target under a dangerous prefix is BENIGN_DANGLING vs DANGEROUS. We
|
||||
* are running INSIDE the container so /etc and /usr DO exist (they're
|
||||
* the container's own), but `--read-only --tmpfs /tmp` plus the worker
|
||||
* running as UID 999 means even DANGEROUS targets cannot reach the host.
|
||||
*
|
||||
* We treat any absolute-target symlink under a dangerous prefix as
|
||||
* DANGEROUS regardless of `file_exists()` — this is a stricter check
|
||||
* than the panel's, because in the container we *can* safely refuse to
|
||||
* even try the extract on a clearly malicious tarball.
|
||||
*/
|
||||
|
||||
require __DIR__ . '/safety-net.php';

/**
 * Lexically normalize an absolute path: collapse repeated slashes, drop "."
 * segments and resolve ".." segments (never climbing above "/").
 *
 * This is string-only — no filesystem access — so it cannot follow symlinks
 * in intermediate components. It exists so that targets such as
 * "/home/<user>/../../etc/cron.d" or "//etc/passwd" are classified by where
 * they actually point instead of by their leading characters.
 *
 * @param string $path absolute path as recorded in the archive
 * @return string normalized absolute path, no trailing slash (root is "/")
 */
function normalize_absolute_path(string $path): string {
    $stack = [];
    foreach (explode('/', $path) as $segment) {
        if ($segment === '' || $segment === '.') {
            continue;
        }
        if ($segment === '..') {
            array_pop($stack);
            continue;
        }
        $stack[] = $segment;
    }
    return '/' . implode('/', $stack);
}

$opts = getopt('', ['tarball:', 'username:', 'report:']);
if (!isset($opts['tarball']) || !isset($opts['report'])) {
    fwrite(STDERR, "usage: scan-symlinks.php --tarball <path> --report <out.json> [--username <u>]\n");
    exit(2);
}
$tarPath    = $opts['tarball'];
$reportPath = $opts['report'];
$username   = $opts['username'] ?? '';

if (!is_file($tarPath) || !is_readable($tarPath)) {
    fwrite(STDERR, "scan-symlinks: not a readable file: $tarPath\n");
    exit(2);
}

// Same prefix list as the panel.
$dangerousPrefixes = [
    '/etc', '/usr', '/bin', '/sbin', '/lib', '/lib64',
    '/boot', '/root',
    '/var/lib', '/var/log', '/var/cache', '/var/spool',
];

$findings = [];
$cpanelUsername = null;

$cmd = 'tar -tvf ' . escapeshellarg($tarPath) . ' 2>/dev/null';
$fh = @popen($cmd, 'r');
if (!$fh) {
    fwrite(STDERR, "scan-symlinks: failed to spawn tar -tvf on $tarPath\n");
    exit(2);
}

while (($line = fgets($fh)) !== false) {
    // Only symlink members ("l" type flag) with a " -> target" part matter.
    if ($line === '' || $line[0] !== 'l') continue;
    $arrow = strpos($line, ' -> ');
    if ($arrow === false) continue;
    $left  = substr($line, 0, $arrow);
    $right = rtrim(substr($line, $arrow + 4), "\r\n");
    // Listing columns: perms, owner/group, size, date, time, member name.
    $parts = preg_split('/\s+/', $left, 6);
    if (count($parts) < 6) continue;
    $archivePath = $parts[5];
    $target      = $right;

    // Relative targets are out of scope for this gate.
    if ($target === '' || $target[0] !== '/') continue;

    // Classify on the normalized path; report the raw one.
    $normalized   = normalize_absolute_path($target);
    $hasTraversal = preg_match('#(^|/)\.\.(/|$)#', $target) === 1;

    if ($cpanelUsername === null) {
        if (preg_match('#^cpmove-([^/]+)/#', $archivePath, $m)) {
            $cpanelUsername = $m[1];
        }
    }

    // (1) user-internal — accept symlinks pointing into the customer's
    // own /home/<user>/ tree. The panel rewrites these on extract.
    // NOTE: $cpanelUsername comes from the archive itself and is therefore
    // attacker-influenced; matching against the normalized target means a
    // crafted name such as "cpmove-.." can no longer whitelist "/home/../...".
    $userInternal = false;
    $usernames = [];
    if ($cpanelUsername !== null && $cpanelUsername !== '') $usernames[] = $cpanelUsername;
    if ($username !== '') $usernames[] = $username;
    foreach ($usernames as $u) {
        $home = '/home/' . $u;
        if ($normalized === $home || strpos($normalized, $home . '/') === 0) {
            $userInternal = true;
            break;
        }
        if (preg_match('#^/home\d+/' . preg_quote($u, '#') . '(/|$)#', $normalized)) {
            $userInternal = true;
            break;
        }
    }

    $type   = null;
    $reason = '';
    if ($userInternal) {
        if (!$hasTraversal) continue;
        // Lands inside the user tree lexically, but ".." past an
        // intermediate symlink could resolve elsewhere. Not blocking, but
        // operators should see it.
        $type   = 'UNCERTAIN';
        $reason = 'target contains .. segments; lexically inside user tree but cannot be verified';
    } elseif ($normalized === '/') {
        // (2) exact root.
        $type   = 'DANGEROUS';
        $reason = 'absolute target is root /';
    } else {
        // (3) — in container, every dangerous-prefix target is treated
        // as DANGEROUS without a file_exists() check (see security note
        // at top of file).
        foreach ($dangerousPrefixes as $p) {
            if ($normalized === $p || strpos($normalized, $p . '/') === 0) {
                $type   = 'DANGEROUS';
                $reason = "absolute target resolves under system path $p";
                break;
            }
        }
        if ($type === null) {
            // Target is absolute, not user-internal, not under a known
            // dangerous prefix. Operators want to know about these.
            $type   = 'UNCERTAIN';
            $reason = 'absolute target outside user tree and not on dangerous-prefix list';
        }
    }
    if ($normalized !== $target) {
        $reason .= " (target normalizes to $normalized)";
    }

    $findings[] = [
        'type'         => $type,
        'archive_path' => $archivePath,
        'target'       => $target,
        'reason'       => $reason,
    ];
}
// Non-zero means tar could not list the whole archive, so the findings may
// be incomplete.
$tarStatus = pclose($fh);

$dangerousCount = count(array_filter($findings, fn($f) => $f['type'] === 'DANGEROUS'));
$uncertainCount = count(array_filter($findings, fn($f) => $f['type'] === 'UNCERTAIN'));

$report = [
    'tarball'         => $tarPath,
    'total_findings'  => count($findings),
    'dangerous_count' => $dangerousCount,
    'uncertain_count' => $uncertainCount,
    'findings'        => $findings,
];

if (@file_put_contents($reportPath, json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . "\n") === false) {
    fwrite(STDERR, "scan-symlinks: WARN could not write report to $reportPath\n");
}

if ($dangerousCount > 0) {
    fwrite(STDERR, "scan-symlinks: $dangerousCount DANGEROUS finding(s); refusing tarball\n");
    foreach ($findings as $f) {
        if ($f['type'] === 'DANGEROUS') {
            fwrite(STDERR, sprintf("  %s -> %s (%s)\n", $f['archive_path'], $f['target'], $f['reason']));
        }
    }
    exit(1);
}

// Fail closed: a listing that did not complete cannot vouch for the archive.
if ($tarStatus !== 0) {
    fwrite(STDERR, "scan-symlinks: tar -tvf exited with status $tarStatus on $tarPath; listing incomplete, cannot declare clean\n");
    exit(2);
}

fwrite(STDERR, "scan-symlinks: clean (uncertain=$uncertainCount, dangerous=0)\n");
exit(0);
|
||||
399
scripts/scan-dbs.php
Executable file
399
scripts/scan-dbs.php
Executable file
@@ -0,0 +1,399 @@
|
||||
<?php
|
||||
/**
|
||||
* scan-dbs.php — SQL dump engine swap + WordPress content scan.
|
||||
*
|
||||
* v1.0 scope:
|
||||
* - Walk every cpmove-USER/mysql/DBNAME.sql under the extract dir.
|
||||
* - ALWAYS: regex-rewrite ENGINE=MyISAM -> ENGINE=InnoDB.
|
||||
* - WordPress identification: presence of wp_options/wp_posts/wp_users
|
||||
* CREATE TABLEs (or prefix-variants where prefix != "wp_").
|
||||
* - WP content scan: ONE check — siteurl_external_domain — comparing
|
||||
* wp_options.siteurl / wp_options.home against the cpanel userdata's
|
||||
* main_domain + addon-domain list.
|
||||
* - If any high-confidence flag fires, the .sql file is written with
|
||||
* a .flagged suffix and imported_into_new_server=false.
|
||||
* - Otherwise the rewritten .sql lands in /tmp/sanitized/mysql/.
|
||||
*
|
||||
* v1.1 will grow the WP scan check set (post_content script-injection,
|
||||
* user_pass leaked-hash, Wordfence regex). See CONTRIBUTING.md for how
|
||||
* to add a check.
|
||||
*
|
||||
* Usage:
|
||||
* scan-dbs.php --extract DIR --out DIR --report OUT.json
|
||||
* --import-id ID --username USER
|
||||
*
|
||||
* Exit codes:
|
||||
* 0 on success (regardless of flags); 1 fatal; 2 usage.
|
||||
*
|
||||
* NOTE: docblock above must not contain the literal sequence "* /"
|
||||
* (without the space) anywhere — PHP closes the C-style comment at
|
||||
* that token and parses the rest as code. This bit us once on
|
||||
* the cpmove-USER /mysql glob path.
|
||||
*/
|
||||
|
||||
require __DIR__ . '/lib/safety-net.php';

const SCANNER_VERSION = '1.0.0';

$opts = getopt('', ['extract:', 'out:', 'report:', 'import-id:', 'username:', 'final-prefix:']);
foreach (['extract', 'out', 'report', 'import-id', 'username'] as $k) {
    if (!isset($opts[$k])) {
        fwrite(STDERR, "usage: scan-dbs.php --extract DIR --out DIR --report OUT.json --import-id ID --username USER [--final-prefix PATH]\n");
        exit(2);
    }
}
$extractDir = rtrim($opts['extract'], '/');
$outDir     = rtrim($opts['out'], '/');
$reportPath = $opts['report'];
$importId   = $opts['import-id'];
$username   = $opts['username'];
// --final-prefix is the path .sql files will live at AFTER the rsync to
// /host/sanitized/<importid>/mysql/. We record that path in the report
// so the panel doesn't have to translate /tmp/... paths.
$finalPrefix = isset($opts['final-prefix']) ? rtrim($opts['final-prefix'], '/') : $outDir;

// The output dir usually exists already; what matters is that it is there
// afterwards, otherwise every write below would fail.
@mkdir($outDir, 0750, true);
if (!is_dir($outDir)) {
    fwrite(STDERR, "scan-dbs: FATAL cannot create output dir $outDir\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: starting (extract=$extractDir, out=$outDir)\n");

// -- find all cpmove-*/mysql/*.sql dumps -----------------------------------

$sqlFiles = [];
foreach (glob($extractDir . '/cpmove-*/mysql/*.sql') ?: [] as $f) {
    if (is_file($f)) $sqlFiles[] = $f;
}
// Some cpmove layouts use cpmove-<user>/mysql/<db>.create + <db>.sql;
// glob above already covers <db>.sql which is what we care about.

if (empty($sqlFiles)) {
    fwrite(STDERR, "scan-dbs: no .sql dumps found under $extractDir/cpmove-*/mysql/\n");
}

// -- discover the user's allowed-domain list from the cpmove userdata -----

$allowedDomains = collect_allowed_domains($extractDir, $username);
fwrite(STDERR, "scan-dbs: allowed domains for siteurl check: "
    . (empty($allowedDomains) ? '(none discovered)' : implode(', ', $allowedDomains))
    . "\n");

$databases = [];

foreach ($sqlFiles as $sqlPath) {
    $dbName = basename($sqlPath, '.sql');
    fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");

    $sizeBytes = filesize($sqlPath) ?: 0;
    $sql = file_get_contents($sqlPath);
    if ($sql === false) {
        fwrite(STDERR, "scan-dbs: WARN failed to read $sqlPath; skipping\n");
        continue;
    }

    // --- ENGINE SWAP (always applied) -------------------------------------

    [$rewritten, $engineCounts] = engine_swap($sql);

    // --- WordPress identification + content scan -------------------------

    $isWp  = is_wordpress_dump($rewritten);
    $flags = [];
    if ($isWp) {
        $flags = wp_content_scan($rewritten, $allowedDomains);
    }

    // Any high-severity flag refuses the database; it is still written out,
    // but under a .flagged name.
    $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
    $refused = (bool) count($highConfidence);

    $outName   = $dbName . '.sql' . ($refused ? '.flagged' : '');
    $outPath   = $outDir . '/' . $outName;
    $finalPath = $finalPrefix . '/' . $outName;
    // The report below advertises $finalPath to the panel, so a failed write
    // must be fatal rather than leave the report pointing at a missing file.
    if (file_put_contents($outPath, $rewritten) === false) {
        fwrite(STDERR, "scan-dbs: FATAL failed to write $outPath\n");
        exit(1);
    }

    $databases[] = [
        'dbname'     => $dbName,
        'size_bytes' => $sizeBytes,
        'engine_changes' => [
            'myisam_to_innodb'           => $engineCounts['myisam_to_innodb'],
            'row_format_dynamic_applied' => 0, // v1.1
            'fulltext_indexes_dropped'   => 0, // v1.1
        ],
        'wp_content_scan' => [
            'is_wordpress' => $isWp,
            'flags'        => $flags,
        ],
        'imported_into_new_server' => !$refused,
        'sanitized_sql_path'       => $refused ? null : $finalPath,
        'flagged_sql_path'         => $refused ? $finalPath : null,
    ];
}

$report = [
    'scanner_version' => SCANNER_VERSION,
    'import_id'       => $importId,
    'databases'       => $databases,
];
$reportJson = json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
if ($reportJson === false) {
    fwrite(STDERR, "scan-dbs: FATAL cannot encode report: " . json_last_error_msg() . "\n");
    exit(1);
}
if (file_put_contents($reportPath, $reportJson . "\n") === false) {
    fwrite(STDERR, "scan-dbs: FATAL failed to write report $reportPath\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: done — " . count($databases) . " database(s) processed\n");
exit(0);
|
||||
|
||||
// ---- helpers --------------------------------------------------------------
|
||||
|
||||
/**
 * Rewrite every ENGINE=MyISAM token in the dump to ENGINE=InnoDB.
 *
 * The match is case-insensitive, tolerates whitespace around "=", and is
 * bounded by word boundaries so longer identifiers (e.g. "MyISAMx" or
 * "XENGINE") are left alone.
 *
 * CAVEAT: this is a plain text substitution over the whole dump. Word
 * boundaries do NOT tell table options apart from row data, so a text
 * column whose value contains the literal "ENGINE=MyISAM" is rewritten
 * too (unlikely, but possible).
 *
 * If PCRE fails (preg_replace_callback returns null) the SQL is returned
 * unmodified and the count is reported as 0, so the report never claims
 * rewrites that are not present in the returned SQL.
 *
 * @param string $sql full SQL dump text
 * @return array [string $newSql, array{myisam_to_innodb:int} $counts]
 */
function engine_swap(string $sql): array {
    $count = 0;
    $rewritten = preg_replace_callback(
        '/\bENGINE\s*=\s*MyISAM\b/i',
        function () use (&$count) { $count++; return 'ENGINE=InnoDB'; },
        $sql
    );
    if ($rewritten === null) {
        fwrite(STDERR, "scan-dbs: WARN engine swap failed (" . preg_last_error_msg() . "); SQL left unmodified\n");
        return [$sql, ['myisam_to_innodb' => 0]];
    }
    return [$rewritten, ['myisam_to_innodb' => $count]];
}
|
||||
|
||||
/**
 * Identify a WordPress dump by its core-table CREATE statements.
 *
 * Any table prefix is accepted: the dump qualifies when it creates tables
 * whose names end in "options", "posts" and "users", and it also mentions
 * at least one of the quoted option names 'siteurl', 'home', 'template' or
 * 'stylesheet' (a cheap sentinel that drops a few false positives where an
 * app shares table names with WP).
 *
 * Both "CREATE TABLE name (" and "CREATE TABLE IF NOT EXISTS name (" are
 * recognised, with arbitrary whitespace between the keywords. Without the
 * IF NOT EXISTS form such a dump would not be identified as WordPress and
 * the content scan would be skipped for it.
 *
 * @param string $sql full SQL dump text
 * @return bool true when the dump looks like a WordPress database
 */
function is_wordpress_dump(string $sql): bool {
    // True when the dump creates a table whose name ends in $suffix.
    $createsTable = static fn(string $suffix): bool => (bool) preg_match(
        '/CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?[`"]?\w*' . $suffix . '[`"]?\s*\(/i',
        $sql
    );

    $optionsSentinel = (bool) preg_match("/'siteurl'|'home'|'template'|'stylesheet'/", $sql);

    return $createsTable('options')
        && $createsTable('posts')
        && $createsTable('users')
        && $optionsSentinel;
}
|
||||
|
||||
/**
 * WordPress content scan over one SQL dump.
 *
 * v1.0 implements a single check, siteurl_external_domain: the "siteurl"
 * and "home" option values (as returned by extract_wp_options) are parsed
 * as URLs, and a high-severity flag is raised for each one whose host is
 * not accepted by domain_in_allowlist. Values without a parseable host,
 * "localhost", and IP literals are skipped — the panel deals with those on
 * its rewrite-wp-config pass.
 *
 * v1.1 is expected to add: post_content script-injection signature,
 * theme/stylesheet known-malware patterns, user_pass leaked-hash check,
 * Wordfence regex.
 *
 * @param string $sql            full SQL dump text
 * @param array  $allowedDomains domains the customer legitimately owns
 * @return array list of flag dicts (severity, code, details); empty = clean
 */
function wp_content_scan(string $sql, array $allowedDomains): array {
    // cPanel dumps mix single-row and chunked multi-row INSERTs, so option
    // extraction is delegated to the forgiving parser.
    $options = extract_wp_options($sql);

    $raised = [];
    foreach (['siteurl', 'home'] as $key) {
        if (!isset($options[$key])) {
            continue;
        }
        $url      = $options[$key];
        $hostname = parse_url($url, PHP_URL_HOST);

        $hostUnusable = ($hostname === null || $hostname === false || $hostname === '');
        if ($hostUnusable) {
            continue;
        }

        // Not an external domain: loopback name or a literal IP address.
        if ($hostname === 'localhost' || filter_var($hostname, FILTER_VALIDATE_IP)) {
            continue;
        }

        if (domain_in_allowlist($hostname, $allowedDomains)) {
            continue;
        }

        $allowListText = empty($allowedDomains)
            ? 'NONE; could not discover from cpmove userdata'
            : implode(', ', $allowedDomains);

        $raised[] = [
            'severity' => 'high',
            'code'     => 'siteurl_external_domain',
            'details'  => sprintf(
                "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
                $key,
                json_encode($url),
                $hostname,
                $allowListText
            ),
        ];
    }

    return $raised;
}
|
||||
|
||||
/**
 * Build an option_name => option_value map from every INSERT that targets
 * an `*options` table, e.g. ['siteurl' => '...', 'home' => '...', ...].
 *
 * This is deliberately best-effort: rows are split with a regex, so
 * multi-row INSERTs with unusual quoting can slip past it. In that case
 * no values are reported and the content scan comes back clean. That is
 * accepted because the panel rewrites siteurl on its own pass, and a
 * malicious siteurl that survives is visible in the rendered URL where
 * an admin can spot it post-import.
 *
 * When the same option_name appears more than once, the last row wins.
 *
 * @param string $sql Full text of the SQL dump.
 * @return array<string, string>
 */
function extract_wp_options(string $sql): array {
    // INSERT INTO `..options` [(col, col, ...)] VALUES (rows...);
    // The optional column list can itself contain "value" / "V"
    // (`option_value`), so it is consumed as a parenthesised group
    // rather than by scanning for the next V.
    $insertPattern = '/INSERT\s+INTO\s+[`"]?\w*options[`"]?\s*(?:\([^)]*\)\s*)?VALUES\s*(.+?);\s*$/ims';

    $matched = preg_match_all($insertPattern, $sql, $stmts);
    if (!$matched) {
        return [];
    }

    $options = [];
    foreach ($stmts[1] as $valuesClause) {
        // Drop the opening `(` of the first tuple and the closing `)` of
        // the last one, then cut the remainder on the `),(` row boundary.
        $inner = preg_replace('/^\(/', '', trim($valuesClause));
        $inner = preg_replace('/\)$/', '', $inner);

        foreach (preg_split('/\)\s*,\s*\(/', $inner) as $tuple) {
            $cells = parse_sql_row($tuple);

            // wp_options columns: option_id, option_name, option_value, autoload
            if (count($cells) < 3) {
                continue;
            }
            $optionName = $cells[1];
            $optionValue = $cells[2];
            if (!is_string($optionName) || !is_string($optionValue) || $optionName === '') {
                continue;
            }
            $options[$optionName] = $optionValue;
        }
    }

    return $options;
}
|
||||
|
||||
/**
 * Parse one row of a MySQL INSERT VALUES tuple — comma-separated,
 * strings single-quoted with backslash escapes.
 *
 * Quoted cells are returned as decoded strings, the bareword NULL
 * (any case) as PHP null, and every other bareword/number as its
 * trimmed text. Both `\'` and the doubled `''` decode to a single quote.
 *
 * Escape decoding follows MySQL string-literal rules: \0 \n \r \t \b \Z
 * map to their control characters, \% and \_ keep their backslash, and
 * any other escaped character stands for itself.
 *
 * Not bulletproof (no comment handling, no DOUBLE-QUOTE strings) but
 * good enough for cpmove dumps, which mysqldump produces in a
 * predictable format.
 *
 * @param string $row Tuple body without the surrounding parentheses.
 * @return array<int, string|null>
 */
function parse_sql_row(string $row): array {
    $cells = [];
    $i = 0;
    $n = strlen($row);

    while ($i < $n) {
        // Skip leading whitespace + commas.
        while ($i < $n && (ctype_space($row[$i]) || $row[$i] === ',')) {
            $i++;
        }
        if ($i >= $n) {
            break;
        }

        if ($row[$i] === "'") {
            // Quoted string.
            $i++;
            $buf = '';
            while ($i < $n) {
                $cc = $row[$i];

                if ($cc === '\\' && $i + 1 < $n) {
                    $next = $row[$i + 1];
                    $buf .= match ($next) {
                        'n' => "\n",
                        't' => "\t",
                        'r' => "\r",
                        '0' => "\0",
                        'b' => "\x08",   // backspace
                        'Z' => "\x1A",   // Ctrl-Z
                        // Outside pattern matching MySQL keeps the backslash
                        // on these two sequences.
                        '%' => '\\%',
                        '_' => '\\_',
                        default => $next,
                    };
                    $i += 2;
                    continue;
                }

                if ($cc === "'") {
                    // MySQL `''` -> literal '
                    if ($i + 1 < $n && $row[$i + 1] === "'") {
                        $buf .= "'";
                        $i += 2;
                        continue;
                    }
                    // Closing quote.
                    $i++;
                    break;
                }

                $buf .= $cc;
                $i++;
            }
            $cells[] = $buf;
        } else {
            // Bareword / number / NULL — read until next comma.
            $start = $i;
            while ($i < $n && $row[$i] !== ',') {
                $i++;
            }
            $tok = trim(substr($row, $start, $i - $start));
            $cells[] = (strcasecmp($tok, 'NULL') === 0) ? null : $tok;
        }
    }

    return $cells;
}
|
||||
|
||||
/**
 * Discover the user's allowed-domain set by reading the cpmove
 * userdata. cPanel writes:
 *   cpmove-<user>/userdata/<domain>  — per-domain config
 *   cpmove-<user>/userdata/main      — the main domain
 *   cpmove-<user>/addons             — addon-domain list
 *   cpmove-<user>/sds                — subdomain list
 *
 * Only userdata/ entries, userdata/main and addons are read here; the
 * sds file is not consulted by this function.
 *
 * Best-effort. If we can't find any, the siteurl check still runs but
 * will flag everything as external — surface up to admin.
 *
 * @param string $extractDir Directory the cpmove tarball was extracted into.
 * @param string $username   Currently unused; every cpmove-* directory is
 *                           scanned. Kept so existing callers keep working.
 * @return string[] Lower-cased, de-duplicated domain names.
 */
function collect_allowed_domains(string $extractDir, string $username): array {
    $domains = [];

    // glob() returns false on error; coerce to [] so foreach never
    // receives a non-array.
    foreach (glob($extractDir . '/cpmove-*/userdata') ?: [] as $userdataDir) {
        if (!is_dir($userdataDir)) {
            continue;
        }

        foreach (scandir($userdataDir) ?: [] as $entry) {
            if ($entry === '.' || $entry === '..' || $entry === 'main') {
                continue;
            }
            // userdata/<domain> is a file or dir keyed by the domain.
            if (preg_match('/^[a-z0-9._-]+\.[a-z]{2,}$/i', $entry)) {
                $domains[] = strtolower($entry);
            }
        }

        // userdata/main is a YAML-ish file with main_domain: <d>
        $mainFile = $userdataDir . '/main';
        if (is_file($mainFile)) {
            $content = file_get_contents($mainFile);
            if ($content !== false && preg_match('/^main_domain:\s*(\S+)/m', $content, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    foreach (glob($extractDir . '/cpmove-*/addons') ?: [] as $addonsFile) {
        if (!is_file($addonsFile)) {
            continue;
        }
        $content = file_get_contents($addonsFile);
        if ($content === false) {
            continue;
        }
        // cPanel writes "addon.tld=parent.tld" lines; only the left-hand
        // side is captured. preg_split() returns false on error.
        foreach (preg_split('/\R/', $content) ?: [] as $line) {
            if (preg_match('/^([a-z0-9._-]+\.[a-z]{2,})/i', $line, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    return array_values(array_unique($domains));
}
|
||||
|
||||
/**
 * Decide whether $host is covered by the allow-list. A host is covered
 * when it equals an allowed domain or is a subdomain of one; the
 * comparison is case-insensitive.
 *
 *   allowed=['example.com'], host='www.example.com'  -> true
 *   allowed=['example.com'], host='malicious.tld'    -> false
 *   allowed=[],              host=anything           -> false (refuse-all)
 *
 * @param string   $host    Host name to test.
 * @param string[] $allowed Allowed domains.
 */
function domain_in_allowlist(string $host, array $allowed): bool {
    // An empty allow-list refuses everything.
    if (empty($allowed)) {
        return false;
    }

    $needle = strtolower($host);

    foreach ($allowed as $candidate) {
        $domain = strtolower($candidate);
        $isExact = ($needle === $domain);
        // The leading dot keeps 'notexample.com' from matching 'example.com'.
        $isSubdomain = str_ends_with($needle, '.' . $domain);
        if ($isExact || $isSubdomain) {
            return true;
        }
    }

    return false;
}
|
||||
216
scripts/scan-files.php
Executable file
216
scripts/scan-files.php
Executable file
@@ -0,0 +1,216 @@
|
||||
<?php
/**
 * scan-files.php — ClamAV scan + classify-and-action orchestrator.
 *
 * v1.0: quarantine-on-every-hit. No auto-cleaners enabled. The cleaner
 * registry (KNOWN_REMOVABLE / REMOVABLE_WITH_BACKUP) is stubbed below
 * for v1.1 expansion; see CONTRIBUTING.md for how to wire one in.
 *
 * Usage:
 *   scan-files.php --extract <dir> --quarantine <dir> --report <out.json> --import-id <id>
 *
 * Exit codes:
 *   0 — scan completed (regardless of how many hits)
 *   1 — fatal scanner error (clamscan binary missing, signature DB unreadable)
 *   2 — usage error
 *
 * Report shape: matches spec §3, e.g.:
 *   {
 *     "files_scanned": N,
 *     "files_clean": N,
 *     "files_cleaned": 0,           // always 0 in v1.0 — no cleaners yet
 *     "files_quarantined": N,
 *     "actions": [ { path, signature, action, cleaner, backup } ]
 *   }
 */

require __DIR__ . '/lib/safety-net.php';

const SCANNER_VERSION = '1.0.0';

$opts = getopt('', ['extract:', 'quarantine:', 'report:', 'import-id:']);
foreach (['extract', 'quarantine', 'report', 'import-id'] as $k) {
    // getopt() returns an array for an option given more than once; the
    // string handling below would then throw a TypeError, so a repeated
    // option is treated as a usage error as well.
    if (!isset($opts[$k]) || !is_string($opts[$k])) {
        fwrite(STDERR, "usage: scan-files.php --extract <dir> --quarantine <dir> --report <out.json> --import-id <id>\n");
        exit(2);
    }
}

// Trailing slashes are stripped so relative paths can later be derived
// by cutting the extract-dir prefix.
$extractDir    = rtrim($opts['extract'], '/');
$quarantineDir = rtrim($opts['quarantine'], '/');
$reportPath    = $opts['report'];
$importId      = $opts['import-id'];

if (!is_dir($extractDir)) {
    fwrite(STDERR, "scan-files: extract dir does not exist: $extractDir\n");
    exit(2);
}

// Best-effort: the directory may already exist, so the result is ignored.
@mkdir($quarantineDir, 0750, true);

fwrite(STDERR, "scan-files: starting (extract=$extractDir, quarantine=$quarantineDir)\n");
|
||||
|
||||
// -- v1.0 cleaner registry (intentionally empty) ----------------------------
//
// Each entry maps a ClamAV signature substring -> classification +
// cleaner callable. v1.0 ships empty so EVERY hit is classified as
// QUARANTINE_ONLY. See CONTRIBUTING.md "Adding an auto-cleaner pattern"
// for how to add a tested entry.
//
// Shape (v1.1+):
//   $cleaners = [
//     'php-eval-base64-prefix' => [
//        'class'   => 'KNOWN_REMOVABLE',
//        'match'   => fn(string $sig): bool => str_contains($sig, 'PHP.Trojan.EvalB64'),
//        'clean'   => fn(string $path): bool => /* rewrite file in place; return ok */,
//     ],
//   ];
$cleaners = [];

// -- run clamscan recursively over the extract dir --------------------------

// We use --infected so the output is only hits, and --recursive so we
// walk subdirectories. We deliberately do NOT use --remove (we never want
// clamscan unlinking files — we control quarantine).
//
// Output format per line on a hit:
//   /tmp/extract/foo/bar.php: Some.Signature.Name FOUND
$cmd = sprintf(
    'clamscan --infected --recursive --no-summary --stdout %s 2>/dev/null',
    escapeshellarg($extractDir)
);

$fh = popen($cmd, 'r');
if (!$fh) {
    fwrite(STDERR, "scan-files: failed to spawn clamscan\n");
    exit(1);
}

$hits = [];
while (($line = fgets($fh)) !== false) {
    $line = rtrim($line, "\r\n");
    if ($line === '' || !str_ends_with($line, ' FOUND')) {
        continue;
    }
    // Strip trailing ' FOUND' (6 bytes).
    $body = substr($line, 0, -6);
    // The LAST ': ' separates path from signature, so a path that itself
    // contains ': ' is still split correctly.
    $colon = strrpos($body, ': ');
    if ($colon === false) {
        continue;
    }
    $path = substr($body, 0, $colon);
    $sig  = substr($body, $colon + 2);
    // Defensive: shouldn't happen with our invocation. The separator is
    // part of the prefix so a sibling such as "<extract>-other/x" cannot
    // pass as being inside the extract dir.
    if (!str_starts_with($path, $extractDir . '/')) {
        continue;
    }
    $hits[] = ['path' => $path, 'signature' => $sig];
}

// popen() goes through the shell, so it succeeds even when clamscan is
// missing; only the exit status reveals that. clamscan exits 0 (nothing
// found) or 1 (virus found) on a completed scan and 2 on errors; the
// shell reports 127 for a missing binary. Anything other than 0/1 means
// the hit list cannot be trusted, so fail closed with the documented
// "fatal scanner error" exit code instead of reporting a clean scan.
$clamStatus = pclose($fh);
if ($clamStatus !== 0 && $clamStatus !== 1) {
    fwrite(STDERR, "scan-files: clamscan did not complete cleanly (exit status $clamStatus)\n");
    exit(1);
}
|
||||
|
||||
// files_scanned for the report: clamscan runs with its summary suppressed,
// so walk the extract tree ourselves and count the regular files.
$treeWalker = new RecursiveIteratorIterator(
    new RecursiveDirectoryIterator($extractDir, FilesystemIterator::SKIP_DOTS)
);
$filesScanned = 0;
foreach ($treeWalker as $node) {
    /** @var SplFileInfo $node */
    if (!$node->isFile()) {
        continue;
    }
    $filesScanned++;
}
|
||||
|
||||
// -- classify + action each hit --------------------------------------------

/**
 * Move an infected file out of the extract tree into quarantine.
 *
 * Tries rename() first and falls back to copy() + unlink() because a
 * rename across mount boundaries can fail with EXDEV.
 *
 * @param string $src Path of the infected file inside the extract dir.
 * @param string $dst Destination path inside the quarantine dir.
 * @return bool True only when the file no longer exists at $src, i.e. it
 *              can no longer be picked up from the extract dir.
 */
function quarantine_move(string $src, string $dst): bool {
    @mkdir(dirname($dst), 0750, true);
    if (@rename($src, $dst)) {
        return true;
    }
    if (!@copy($src, $dst)) {
        return false;
    }
    // A copy without a successful unlink leaves the infected original in
    // place, so that must not be reported as quarantined.
    return @unlink($src);
}

$actions = [];
$cleaned = 0;
$quarantined = 0;

foreach ($hits as $h) {
    $path = $h['path'];
    $sig  = $h['signature'];

    // v1.0 — every hit is QUARANTINE_ONLY because the cleaner registry
    // is empty. v1.1 entries are matched against the signature here.
    $classification = 'QUARANTINE_ONLY';
    $cleanerName = null;   // reset per hit so no stale name leaks across iterations
    foreach ($cleaners as $name => $entry) {
        if (($entry['match'])($sig)) {
            $classification = $entry['class'];
            $cleanerName = $name;
            break;
        }
    }

    $relPath = ltrim(substr($path, strlen($extractDir)), '/');
    $qPath   = $quarantineDir . '/' . $relPath;

    // Cleaner recorded on a quarantine action: stays null unless a cleaner
    // ran and refused the file.
    $quarantineCleaner = null;

    // v1.1+ paths: back the file up, then let the cleaner rewrite it.
    if ($classification === 'KNOWN_REMOVABLE' || $classification === 'REMOVABLE_WITH_BACKUP') {
        @mkdir(dirname($qPath), 0750, true);
        $backup = $qPath . '.original';
        if (!@copy($path, $backup)) {
            fwrite(STDERR, "scan-files: backup before clean failed: $path; quarantining instead\n");
        } elseif (($cleaners[$cleanerName]['clean'])($path)) {
            $cleaned++;
            $actions[] = [
                'path'      => $relPath,
                'signature' => $sig,
                'action'    => 'cleaned',
                'cleaner'   => $cleanerName,
                'backup'    => $backup,
            ];
            continue;
        } else {
            // Cleaner refused; fall back to quarantine.
            $quarantineCleaner = $cleanerName;
        }
    }

    // Everything that was not cleaned is quarantined — QUARANTINE_ONLY
    // hits, failed cleans, and any classification this script does not
    // recognise (previously such hits were skipped without any action).
    // Moving the file removes it from the extract dir so the rsync to
    // /host/sanitized/ does not include it.
    if (!quarantine_move($path, $qPath)) {
        // NOTE(review): the infected file is still in the extract dir at
        // this point; only this warning records that.
        fwrite(STDERR, "scan-files: WARN failed to quarantine $path -> $qPath\n");
        continue;
    }

    $quarantined++;
    $actions[] = [
        'path'      => $relPath,
        'signature' => $sig,
        'action'    => 'quarantined',
        'cleaner'   => $quarantineCleaner,
        'backup'    => $qPath,
    ];
}
|
||||
|
||||
$report = [
    'scanner_version'   => SCANNER_VERSION,
    'import_id'         => $importId,
    'files_scanned'     => $filesScanned,
    // Every hit counts as not-clean, whether or not its quarantine succeeded.
    'files_clean'       => max(0, $filesScanned - count($hits)),
    'files_cleaned'     => $cleaned,
    'files_quarantined' => $quarantined,
    'actions'           => $actions,
];

// Paths come from the scanned tree and are not guaranteed to be valid
// UTF-8. Without JSON_INVALID_UTF8_SUBSTITUTE json_encode() returns false
// for such input and the report would be written as a bare newline.
$reportJson = json_encode(
    $report,
    JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
);
if ($reportJson === false) {
    fwrite(STDERR, 'scan-files: failed to encode report: ' . json_last_error_msg() . "\n");
    exit(1);
}

// A missing report must not be reported as success.
if (@file_put_contents($reportPath, $reportJson . "\n") === false) {
    fwrite(STDERR, "scan-files: failed to write report: $reportPath\n");
    exit(1);
}

fwrite(STDERR, sprintf(
    "scan-files: done — scanned=%d clean=%d cleaned=%d quarantined=%d\n",
    $filesScanned, $report['files_clean'], $cleaned, $quarantined
));
exit(0);
|
||||
Reference in New Issue
Block a user