Initial bootstrap: cpanel-importer sanitization sandbox
Skeleton for the cpanel-importer Docker container — a one-shot sandbox the WHP panel invokes BEFORE extracting a customer cpmove tarball. See cpanel-import-container-spec.md (in /workspace/) for the full design. What this ships in v1.0: - Dockerfile: almalinux:10-minimal + PHP 8.4 (Remi) + ClamAV 1.4 + SaneSecurity Foxhole.PHP rules + tar/mariadb-client/rsync. Runs as UID 999 (whp-import) via the panel-side --user 999:999 flag. - scripts/entrypoint.sh: validates env, runs (optional) freshclam, drives extract -> scan-files -> scan-dbs -> rsync -> report.json. - scripts/extract.sh + scripts/lib/scan-symlinks.php: pre-extract symlink scan ported standalone from web-files/libs/CpanelBackupImporter.php (the existing 2026-05-29 whp02 destruction-vector fix). Aborts with exit 3 before tar runs if any DANGEROUS symlink is found. - scripts/scan-files.php: ClamAV walk + classify-and-action. v1.0 ships with an empty cleaner registry — every hit is QUARANTINE_ONLY. Cleaner hooks are stubbed for v1.1. - scripts/scan-dbs.php: regex MyISAM -> InnoDB rewrite (always applied), WordPress identification, and ONE WP content scan check (siteurl_external_domain). v1.1 will grow the check set. - scripts/lib/safety-net.php: container-narrow open_basedir allow-list, much tighter than the panel-side one. - .gitea/workflows/build-push.yaml: builds + smoke-tests + PHP-syntax-checks + bash-syntax-checks before pushing to repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer. - tests/build-fixtures.sh: builds cpmove-clean.tar.gz (benign WP dump) and cpmove-alfa.tar.gz (the ALFA-shell symlink-to-/etc vector) for local end-to-end testing. - README.md / CONTRIBUTING.md: docker-run invocation, bind-mount catalog, report.json schema, how to add a cleaner pattern or a WP scan signature. 
Local acceptance test results: - clean fixture -> status=completed, 3 MyISAM->InnoDB, no flags, exit 0 - ALFA fixture -> exit 1, status=failed, failed_stage=extract, "tarball contains dangerous symlinks; aborting" on stderr - compromised-siteurl fixture -> imported_into_new_server=false, .flagged file written, summary_for_panel.show_alert=true Image size: 197 MB compressed (gzipped docker save), ~397 MB unique layers extracted. Well under the spec's 600 MB compressed / 1.2 GB extracted budget. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
399
scripts/scan-dbs.php
Executable file
399
scripts/scan-dbs.php
Executable file
@@ -0,0 +1,399 @@
|
||||
<?php
|
||||
/**
|
||||
* scan-dbs.php — SQL dump engine swap + WordPress content scan.
|
||||
*
|
||||
* v1.0 scope:
|
||||
* - Walk every cpmove-USER/mysql/DBNAME.sql under the extract dir.
|
||||
* - ALWAYS: regex-rewrite ENGINE=MyISAM -> ENGINE=InnoDB.
|
||||
* - WordPress identification: presence of wp_options/wp_posts/wp_users
|
||||
* CREATE TABLEs (or prefix-variants where prefix != "wp_").
|
||||
* - WP content scan: ONE check — siteurl_external_domain — comparing
|
||||
* wp_options.siteurl / wp_options.home against the cpanel userdata's
|
||||
* main_domain + addon-domain list.
|
||||
* - If any high-confidence flag fires, the .sql file is written with
|
||||
* a .flagged suffix and imported_into_new_server=false.
|
||||
* - Otherwise the rewritten .sql lands in /tmp/sanitized/mysql/.
|
||||
*
|
||||
* v1.1 will grow the WP scan check set (post_content script-injection,
|
||||
* user_pass leaked-hash, Wordfence regex). See CONTRIBUTING.md for how
|
||||
* to add a check.
|
||||
*
|
||||
* Usage:
|
||||
* scan-dbs.php --extract DIR --out DIR --report OUT.json
|
||||
* --import-id ID --username USER
|
||||
*
|
||||
* Exit codes:
|
||||
* 0 on success (regardless of flags); 1 fatal; 2 usage.
|
||||
*
|
||||
* NOTE: docblock above must not contain the literal sequence "* /"
|
||||
* (without the space) anywhere — PHP closes the C-style comment at
|
||||
* that token and parses the rest as code. This bit us once on
|
||||
* the cpmove-USER /mysql glob path.
|
||||
*/
|
||||
|
||||
require __DIR__ . '/lib/safety-net.php';
|
||||
|
||||
const SCANNER_VERSION = '1.0.0';

// -- parse + validate CLI arguments ------------------------------------------

$usage = "usage: scan-dbs.php --extract DIR --out DIR --report OUT.json --import-id ID --username USER [--final-prefix PATH]\n";

$opts = getopt('', ['extract:', 'out:', 'report:', 'import-id:', 'username:', 'final-prefix:']);
foreach (['extract', 'out', 'report', 'import-id', 'username'] as $k) {
    // getopt() yields an array when a flag is repeated (and false when it
    // cannot parse at all); both are usage errors, not something to rtrim().
    if (!isset($opts[$k]) || !is_string($opts[$k])) {
        fwrite(STDERR, $usage);
        exit(2);
    }
}
if (isset($opts['final-prefix']) && !is_string($opts['final-prefix'])) {
    fwrite(STDERR, $usage);
    exit(2);
}

$extractDir = rtrim($opts['extract'], '/');
$outDir     = rtrim($opts['out'], '/');
$reportPath = $opts['report'];
$importId   = $opts['import-id'];
$username   = $opts['username'];

// --final-prefix is the path .sql files will live at AFTER the rsync to
// /host/sanitized/<importid>/mysql/. We record that path in the report
// so the panel doesn't have to translate /tmp/... paths.
$finalPrefix = isset($opts['final-prefix']) ? rtrim($opts['final-prefix'], '/') : $outDir;

// Without the output directory nothing below can succeed, so this is fatal
// (exit 1) rather than silently suppressed. The trailing is_dir() covers a
// concurrent creator winning the race.
if (!is_dir($outDir) && !mkdir($outDir, 0750, true) && !is_dir($outDir)) {
    fwrite(STDERR, "scan-dbs: FATAL cannot create output directory $outDir\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: starting (extract=$extractDir, out=$outDir)\n");

// -- find all cpmove-*/mysql/*.sql dumps -----------------------------------

$sqlFiles = [];
foreach (glob($extractDir . '/cpmove-*/mysql/*.sql') ?: [] as $f) {
    if (is_file($f)) {
        $sqlFiles[] = $f;
    }
}
// Some cpmove layouts use cpmove-<user>/mysql/<db>.create + <db>.sql;
// glob above already covers <db>.sql which is what we care about.

if (empty($sqlFiles)) {
    fwrite(STDERR, "scan-dbs: no .sql dumps found under $extractDir/cpmove-*/mysql/\n");
}

// -- discover the user's allowed-domain list from the cpmove userdata -----

$allowedDomains = collect_allowed_domains($extractDir, $username);
fwrite(STDERR, "scan-dbs: allowed domains for siteurl check: "
    . (empty($allowedDomains) ? '(none discovered)' : implode(', ', $allowedDomains))
    . "\n");

$databases = [];

foreach ($sqlFiles as $sqlPath) {
    $dbName = basename($sqlPath, '.sql');
    fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");

    $sizeBytes = filesize($sqlPath) ?: 0;
    $sql = file_get_contents($sqlPath);
    if ($sql === false) {
        // Deliberate best-effort: an unreadable dump is skipped, not fatal.
        fwrite(STDERR, "scan-dbs: WARN failed to read $sqlPath; skipping\n");
        continue;
    }

    // --- ENGINE SWAP (always applied) -------------------------------------

    [$rewritten, $engineCounts] = engine_swap($sql);

    // --- WordPress identification + content scan -------------------------

    $isWp  = is_wordpress_dump($rewritten);
    $flags = $isWp ? wp_content_scan($rewritten, $allowedDomains) : [];

    // Any high-severity flag refuses the import for this database.
    $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
    $refused = $highConfidence !== [];

    $outName   = $dbName . '.sql' . ($refused ? '.flagged' : '');
    $outPath   = $outDir . '/' . $outName;
    $finalPath = $finalPrefix . '/' . $outName;

    // The report below advertises $finalPath as existing; a failed or short
    // write must therefore abort instead of producing a report that lies.
    $written = file_put_contents($outPath, $rewritten);
    if ($written !== strlen($rewritten)) {
        fwrite(STDERR, "scan-dbs: FATAL failed to write $outPath\n");
        exit(1);
    }

    $databases[] = [
        'dbname'     => $dbName,
        'size_bytes' => $sizeBytes,
        'engine_changes' => [
            'myisam_to_innodb'           => $engineCounts['myisam_to_innodb'],
            'row_format_dynamic_applied' => 0, // v1.1
            'fulltext_indexes_dropped'   => 0, // v1.1
        ],
        'wp_content_scan' => [
            'is_wordpress' => $isWp,
            'flags'        => $flags,
        ],
        'imported_into_new_server' => !$refused,
        'sanitized_sql_path'       => $refused ? null : $finalPath,
        'flagged_sql_path'         => $refused ? $finalPath : null,
    ];
}

$report = [
    'scanner_version' => SCANNER_VERSION,
    'import_id'       => $importId,
    'databases'       => $databases,
];

// Flag details embed strings taken from the (untrusted) dump. Invalid UTF-8
// there would make json_encode() return false, so substitute bad bytes and
// still treat an encoding failure as fatal.
$json = json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE);
if ($json === false) {
    fwrite(STDERR, "scan-dbs: FATAL could not encode report: " . json_last_error_msg() . "\n");
    exit(1);
}
if (file_put_contents($reportPath, $json . "\n") === false) {
    fwrite(STDERR, "scan-dbs: FATAL failed to write report $reportPath\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: done — " . count($databases) . " database(s) processed\n");
exit(0);
|
||||
|
||||
// ---- helpers --------------------------------------------------------------
|
||||
|
||||
/**
 * Replace every ENGINE=MyISAM table option with ENGINE=InnoDB.
 *
 * Matching is case-insensitive, tolerates whitespace around "=", and
 * requires word boundaries on both sides (so "ENGINE=MyISAMX" is left
 * alone). The match is purely textual: the same token inside row data
 * would be rewritten as well.
 *
 * If PCRE reports an error the input is returned unmodified.
 *
 * @return array{0: string, 1: array{myisam_to_innodb: int}}
 *               [rewritten SQL, replacement counters]
 */
function engine_swap(string $sql): array {
    $swapped = 0;

    // Count through the callback so the tally matches the replacements made.
    $toInnoDb = function () use (&$swapped) {
        $swapped++;
        return 'ENGINE=InnoDB';
    };

    $result = preg_replace_callback('/\bENGINE\s*=\s*MyISAM\b/i', $toInnoDb, $sql);
    if ($result === null) {
        $result = $sql;
    }

    return [$result, ['myisam_to_innodb' => $swapped]];
}
|
||||
|
||||
/**
 * Identify WordPress by the canonical core-table CREATE statements.
 *
 * cPanel exports respect the customer's prefix, so any prefix is accepted
 * as long as tables ending in "options", "posts" and "users" are all
 * created in this dump. Both "CREATE TABLE `x` (" and
 * "CREATE TABLE IF NOT EXISTS `x` (" are recognised, with arbitrary
 * whitespace between the keywords; a dump in either form must not be
 * able to skip the content scan.
 *
 * A dump additionally has to mention at least one well-known option
 * name ('siteurl', 'home', 'template', 'stylesheet') as a quoted string,
 * which drops some false positives from apps that share table names.
 *
 * @param string $sql Full text of the SQL dump.
 * @return bool True when the dump looks like a WordPress database.
 */
function is_wordpress_dump(string $sql): bool {
    // True when a CREATE TABLE for "<anything>$suffix" appears in the dump.
    $createsTable = static fn (string $suffix): bool => preg_match(
        '/CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?[`"]?\w*' . $suffix . '[`"]?\s*\(/i',
        $sql
    ) === 1;

    if (!$createsTable('options') || !$createsTable('posts') || !$createsTable('users')) {
        return false;
    }

    // Bonus signal: the dump also references the standard wp_options
    // option_names. Cheap to check.
    return preg_match("/'siteurl'|'home'|'template'|'stylesheet'/", $sql) === 1;
}
|
||||
|
||||
/**
 * Run the WP content scan. v1.0 ships ONE check:
 *
 *   siteurl_external_domain — wp_options.siteurl or .home points at a
 *   host not in the allow list (cpanel main + addons).
 *
 * Values without a parseable host, "localhost", and IP literals (IPv4 or
 * bracketed IPv6) are skipped; they are not external domains and are left
 * to the panel's rewrite-wp-config pass.
 *
 * v1.1 add: post_content script-injection signature, theme/stylesheet
 * known-malware patterns, user_pass leaked-hash check, Wordfence regex.
 *
 * @param string   $sql            Full text of the (already engine-swapped) dump.
 * @param string[] $allowedDomains Domains the account legitimately owns.
 * @return array<int, array{severity: string, code: string, details: string}>
 *         One entry per finding; an empty array means "clean".
 */
function wp_content_scan(string $sql, array $allowedDomains): array {
    $flags = [];

    // Pull every (option_name, option_value) row from any INSERT INTO
    // <prefix>options.
    $optionValues = extract_wp_options($sql);

    $allowedLabel = empty($allowedDomains)
        ? 'NONE; could not discover from cpmove userdata'
        : implode(', ', $allowedDomains);

    foreach (['siteurl', 'home'] as $optName) {
        if (!isset($optionValues[$optName])) {
            continue;
        }
        $val  = $optionValues[$optName];
        $host = parse_url($val, PHP_URL_HOST);
        if ($host === null || $host === false || $host === '') {
            continue;
        }

        // parse_url() keeps the brackets around an IPv6 literal ("[::1]"),
        // which FILTER_VALIDATE_IP rejects. Strip them so IPv6 literals are
        // skipped exactly like IPv4 ones.
        $ipCandidate = (str_starts_with($host, '[') && str_ends_with($host, ']'))
            ? substr($host, 1, -1)
            : $host;
        if ($host === 'localhost' || filter_var($ipCandidate, FILTER_VALIDATE_IP)) {
            continue;
        }

        if (domain_in_allowlist($host, $allowedDomains)) {
            continue;
        }

        $flags[] = [
            'severity' => 'high',
            'code'     => 'siteurl_external_domain',
            'details'  => sprintf(
                "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
                $optName,
                // The value comes from an untrusted dump; substitute invalid
                // UTF-8 so json_encode() cannot return false and blank it out.
                json_encode($val, JSON_INVALID_UTF8_SUBSTITUTE),
                $host,
                $allowedLabel
            ),
        ];
    }

    return $flags;
}
|
||||
|
||||
/**
 * Pull a map of option_name => option_value from any INSERT into the
 * options table. Returns ['siteurl' => '...', 'home' => '...', ...].
 *
 * Each matched statement's VALUES body is split into row tuples by a
 * quote-aware walk, so a "),(" sequence inside a string value cannot
 * fabricate an extra row (which would otherwise let a crafted
 * option_value overwrite the real siteurl entry in the map).
 *
 * Still best-effort: the statement itself is located with a regex, and
 * the column order is assumed to be option_id, option_name, option_value.
 * If the regex engine fails, a warning is written to STDERR and an empty
 * map is returned, so the caller sees no values.
 *
 * @param string $sql Full text of the SQL dump.
 * @return array<string, string> option_name => option_value; when a name
 *         appears more than once, the last occurrence wins.
 */
function extract_wp_options(string $sql): array {
    $map = [];

    // Match INSERT INTO `..options` [(col, col, ...)] VALUES (rows...);
    // The optional column list is matched as one parenthesised group
    // followed by VALUES; the statement ends at the first ";" that closes
    // a line.
    $found = preg_match_all(
        '/INSERT\s+INTO\s+[`"]?\w*options[`"]?\s*(?:\([^)]*\)\s*)?VALUES\s*(.+?);\s*$/ims',
        $sql,
        $stmts
    );
    if ($found === false) {
        // A PCRE failure is not the same as "no INSERTs": say so, because
        // the siteurl check is about to report clean for lack of data.
        fwrite(STDERR, 'scan-dbs: WARN options INSERT regex failed ('
            . preg_last_error_msg() . "); siteurl check has no values to inspect\n");
        return $map;
    }
    if ($found === 0) {
        return $map;
    }

    foreach ($stmts[1] as $body) {
        foreach (split_sql_value_tuples($body) as $row) {
            $cells = parse_sql_row($row);
            // wp_options columns: option_id, option_name, option_value, autoload
            if (count($cells) < 3) {
                continue;
            }
            $name  = $cells[1];
            $value = $cells[2];
            if (is_string($name) && is_string($value) && $name !== '') {
                $map[$name] = $value;
            }
        }
    }

    return $map;
}

/**
 * Split the body of a VALUES clause — "(a,'b'),(c,'d')" — into the text
 * inside each top-level pair of parentheses.
 *
 * Single-quoted strings are skipped as opaque units (honouring backslash
 * escapes and doubled '' quotes), so parentheses and commas inside string
 * data never act as row boundaries. An unterminated final tuple is
 * returned as-is so a truncated statement still yields its leading rows.
 *
 * @param string $body Text following the VALUES keyword, without the ";".
 * @return string[] Inner text of each tuple, in order of appearance.
 */
function split_sql_value_tuples(string $body): array {
    $tuples = [];
    $len    = strlen($body);
    $depth  = 0;
    $start  = 0;
    $i      = 0;

    while ($i < $len) {
        // Jump straight to the next character that can change state.
        $i += strcspn($body, "'()", $i);
        if ($i >= $len) {
            break;
        }

        $ch = $body[$i];
        if ($ch === "'") {
            // Skip the whole quoted string.
            $i++;
            while ($i < $len) {
                $i += strcspn($body, "'\\", $i);
                if ($i >= $len) {
                    break;
                }
                if ($body[$i] === '\\') {
                    $i += 2; // backslash + escaped character
                    continue;
                }
                if ($i + 1 < $len && $body[$i + 1] === "'") {
                    $i += 2; // doubled quote is a literal '
                    continue;
                }
                $i++; // closing quote
                break;
            }
            continue;
        }

        if ($ch === '(') {
            if ($depth === 0) {
                $start = $i + 1;
            }
            $depth++;
        } elseif ($depth > 0) {
            $depth--;
            if ($depth === 0) {
                $tuples[] = substr($body, $start, $i - $start);
            }
        }
        $i++;
    }

    if ($depth > 0) {
        $tuples[] = substr($body, $start);
    }

    return $tuples;
}
|
||||
|
||||
/**
 * Parse one row of a MySQL INSERT VALUES tuple — comma-separated,
 * strings single-quoted with backslash escapes — into a list of cells.
 *
 * Quoted strings are decoded following MySQL's string-literal rules:
 * \n \t \r \0 \b \Z map to their control characters, \% and \_ keep the
 * backslash (that is what MySQL stores outside pattern contexts), any
 * other escaped character stands for itself, and '' is a literal quote.
 * Unquoted tokens are returned trimmed as strings, except NULL (any
 * case), which becomes PHP null.
 *
 * Not bulletproof (no comment handling, no DOUBLE-QUOTE strings) but
 * good enough for cpmove dumps, which mysqldump produces in a
 * predictable format.
 *
 * @param string $row Text between the parentheses of one tuple.
 * @return array<int, string|null> Cell values in column order.
 */
function parse_sql_row(string $row): array {
    $cells = [];
    $i = 0;
    $n = strlen($row);

    while ($i < $n) {
        // Skip leading whitespace + commas.
        while ($i < $n && (ctype_space($row[$i]) || $row[$i] === ',')) {
            $i++;
        }
        if ($i >= $n) {
            break;
        }

        if ($row[$i] !== "'") {
            // Bareword / number / NULL — read until next comma.
            $start = $i;
            while ($i < $n && $row[$i] !== ',') {
                $i++;
            }
            $tok = trim(substr($row, $start, $i - $start));
            $cells[] = (strcasecmp($tok, 'NULL') === 0) ? null : $tok;
            continue;
        }

        // Quoted string.
        $i++;
        $buf = '';
        while ($i < $n) {
            $cc = $row[$i];
            if ($cc === '\\' && $i + 1 < $n) {
                $next = $row[$i + 1];
                $buf .= match ($next) {
                    'n' => "\n",
                    't' => "\t",
                    'r' => "\r",
                    '0' => "\0",
                    'b' => "\x08",
                    'Z' => "\x1A",
                    // Only special in LIKE patterns; stored with the backslash.
                    '%', '_' => '\\' . $next,
                    default => $next,
                };
                $i += 2;
                continue;
            }
            if ($cc === "'") {
                // MySQL `''` -> literal '
                if ($i + 1 < $n && $row[$i + 1] === "'") {
                    $buf .= "'";
                    $i += 2;
                    continue;
                }
                $i++;
                break;
            }
            $buf .= $cc;
            $i++;
        }
        $cells[] = $buf;
    }

    return $cells;
}
|
||||
|
||||
/**
 * Discover the user's allowed-domain set by reading the cpmove
 * userdata. Sources consulted, for every cpmove-* directory under
 * $extractDir:
 *   cpmove-<user>/userdata/<domain>  — entry names that look like a domain
 *   cpmove-<user>/userdata/main      — the "main_domain: <d>" line
 *   cpmove-<user>/addons             — "addon.tld=parent.tld" lines (left side)
 *
 * Subdomains are not listed separately; domain_in_allowlist() accepts
 * any subdomain of an allowed domain.
 *
 * Best-effort. If nothing is found the result is empty and the siteurl
 * check will flag everything as external — surface up to admin.
 *
 * @param string $extractDir Directory the cpmove tarball was extracted into.
 * @param string $username   cPanel account name. Currently unused: every
 *                           cpmove-* directory is scanned regardless.
 * @return string[] Lower-cased, de-duplicated domain names.
 */
function collect_allowed_domains(string $extractDir, string $username): array {
    $domains = [];

    // glob() may return false on error; treat that as "no matches" here,
    // as every other glob() call in this script does.
    foreach (glob($extractDir . '/cpmove-*/userdata') ?: [] as $userdataDir) {
        if (!is_dir($userdataDir)) {
            continue;
        }

        foreach (scandir($userdataDir) ?: [] as $entry) {
            if ($entry === '.' || $entry === '..' || $entry === 'main') {
                continue;
            }
            // userdata/<domain> is a file or dir keyed by the domain.
            if (preg_match('/^[a-z0-9._-]+\.[a-z]{2,}$/i', $entry)) {
                $domains[] = strtolower($entry);
            }
        }

        // userdata/main is a YAML-ish file with main_domain: <d>
        $mainFile = $userdataDir . '/main';
        if (is_file($mainFile)) {
            $content = file_get_contents($mainFile);
            if ($content !== false && preg_match('/^main_domain:\s*(\S+)/m', $content, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    foreach (glob($extractDir . '/cpmove-*/addons') ?: [] as $addonsFile) {
        if (!is_file($addonsFile)) {
            continue;
        }
        $content = file_get_contents($addonsFile);
        if ($content === false) {
            continue;
        }
        // cPanel writes "addon.tld=parent.tld" lines.
        foreach (preg_split('/\R/', $content) ?: [] as $line) {
            if (preg_match('/^([a-z0-9._-]+\.[a-z]{2,})/i', $line, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    return array_values(array_unique($domains));
}
|
||||
|
||||
/**
 * Decide whether $host is covered by the allow-list.
 *
 * A host is covered when, compared case-insensitively, it equals an
 * allowed domain or is a subdomain of one (ends with "." + domain).
 *
 *   allowed=['example.com'], host='www.example.com' -> true
 *   allowed=['example.com'], host='notexample.com'  -> false
 *   allowed=[],              host=anything          -> false (refuse-all)
 *
 * @param string   $host    Host name to test.
 * @param string[] $allowed Domains the account owns.
 */
function domain_in_allowlist(string $host, array $allowed): bool {
    $needle = strtolower($host);

    // An empty list never enters the loop, which yields the refuse-all result.
    foreach ($allowed as $entry) {
        $candidate = strtolower($entry);
        $isExact     = ($needle === $candidate);
        $isSubdomain = str_ends_with($needle, '.' . $candidate);
        if ($isExact || $isSubdomain) {
            return true;
        }
    }

    return false;
}
|
||||
Reference in New Issue
Block a user