ENGINE=InnoDB.
 *   - WordPress identification: presence of wp_options/wp_posts/wp_users
 *     CREATE TABLEs (or prefix-variants where prefix != "wp_").
 *   - WP content scan: ONE check — siteurl_external_domain — comparing
 *     wp_options.siteurl / wp_options.home against the cpanel userdata's
 *     main_domain + addon-domain list.
 *   - If any high-confidence flag fires, the .sql file is written with
 *     a .flagged suffix and imported_into_new_server=false.
 *   - Otherwise the rewritten .sql lands in /tmp/sanitized/mysql/.
 *
 * v1.1 will grow the WP scan check set (post_content script-injection,
 * user_pass leaked-hash, Wordfence regex). See CONTRIBUTING.md for how
 * to add a check.
 *
 * Usage:
 *   scan-dbs.php --extract DIR --out DIR --report OUT.json
 *                --import-id ID --username USER [--final-prefix PATH]
 *
 * Exit codes:
 *   0 on success (regardless of flags); 1 fatal; 2 usage.
 *
 * NOTE: docblock above must not contain the literal sequence "* /"
 * (without the space) anywhere — PHP closes the C-style comment at
 * that token and parses the rest as code. This bit us once on
 * the cpmove-USER /mysql glob path.
 */

require __DIR__ . '/lib/safety-net.php';

const SCANNER_VERSION = '1.0.0';

// -- CLI parsing -------------------------------------------------------------
$opts = getopt('', ['extract:', 'out:', 'report:', 'import-id:', 'username:', 'final-prefix:']);
foreach (['extract', 'out', 'report', 'import-id', 'username'] as $k) {
    // getopt() yields an array when an option is repeated; treat that as a
    // usage error instead of letting rtrim() blow up on a non-string.
    if (!isset($opts[$k]) || !is_string($opts[$k])) {
        fwrite(STDERR, "usage: scan-dbs.php --extract DIR --out DIR --report OUT.json --import-id ID --username USER [--final-prefix PATH]\n");
        exit(2);
    }
}

$extractDir = rtrim($opts['extract'], '/');
$outDir     = rtrim($opts['out'], '/');
$reportPath = $opts['report'];
$importId   = $opts['import-id'];
$username   = $opts['username'];

// --final-prefix is the path .sql files will live at AFTER the rsync to
// /host/sanitized/IMPORT-ID/mysql/ (the middle path segment was lost from
// the original comment; presumably the import id — confirm). We record that
// path in the report so the panel doesn't have to translate /tmp/... paths.
$finalPrefix = (isset($opts['final-prefix']) && is_string($opts['final-prefix']))
    ? rtrim($opts['final-prefix'], '/')
    : $outDir;

@mkdir($outDir, 0750, true);
if (!is_dir($outDir)) {
    // Without an output directory every dump would fail individually and we
    // would still exit 0 with an empty report; fail loudly instead.
    fwrite(STDERR, "scan-dbs: FATAL cannot create output directory $outDir\n");
    exit(1);
}
fwrite(STDERR, "scan-dbs: starting (extract=$extractDir, out=$outDir)\n");

// -- find all cpmove-*/mysql/*.sql dumps -------------------------------------
$sqlFiles = [];
foreach (glob($extractDir . '/cpmove-*/mysql/*.sql') ?: [] as $f) {
    if (is_file($f)) {
        $sqlFiles[] = $f;
    }
}
// Some cpmove layouts use cpmove-USER/mysql/DB.create + DB.sql;
// the glob above already covers .sql, which is what we care about.
if (empty($sqlFiles)) {
    fwrite(STDERR, "scan-dbs: no .sql dumps found under $extractDir/cpmove-*/mysql/\n");
}

// -- discover the user's allowed-domain list from the cpmove userdata --------
$allowedDomains = collect_allowed_domains($extractDir, $username);
fwrite(
    STDERR,
    "scan-dbs: allowed domains for siteurl check: "
    . (empty($allowedDomains) ? '(none discovered)' : implode(', ', $allowedDomains))
    . "\n"
);

$databases = [];
foreach ($sqlFiles as $sqlPath) {
    $dbName = basename($sqlPath, '.sql');
    fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");
    $sizeBytes = filesize($sqlPath) ?: 0;

    // STREAMING POSTURE: load + scan the SQL file in 4MB chunks instead
    // of one file_get_contents. cPanel customer dumps in the wild routinely
    // hit 5-10GB (large WooCommerce + media metadata); a 128MB-default PHP
    // memory_limit + the 2GB cgroup on the container both fail well below
    // those sizes. Streaming bounds the resident memory to a few MB
    // regardless of dump size.
    //
    // Per-chunk passes:
    //   - engine_swap_chunk:     rewrites ENGINE=MyISAM inside the chunk and
    //                            accumulates the swap count.
    //   - is_wp_chunk_scan:      OR-folds the three CREATE TABLE detections
    //                            + the option_name sentinel into $wpFlags.
    //   - wp_options_chunk_scan: extracts siteurl/home option values when
    //                            they appear in this chunk.
    //
    // The last $carryWindow bytes of every rewritten chunk are held back and
    // prepended to the next read, so a short pattern split across a chunk
    // boundary is still visible to the regexes (128 bytes is longer than
    // any ENGINE=... or 'siteurl' token).
    $fin = @fopen($sqlPath, 'rb');
    if (!$fin) {
        fwrite(STDERR, "scan-dbs: WARN failed to open $sqlPath; skipping\n");
        continue;
    }

    // We write to a tmp file first; it is renamed to the final
    // {.sql, .sql.flagged} name once we know whether the scan flagged it.
    $tmpOutPath = $outDir . '/' . $dbName . '.sql.tmp';
    $fout = @fopen($tmpOutPath, 'wb');
    if (!$fout) {
        fwrite(STDERR, "scan-dbs: WARN failed to open output $tmpOutPath; skipping\n");
        fclose($fin);
        continue;
    }

    $engineSwapCount = 0;
    $wpFlags = [
        'has_options'         => false,
        'has_posts'           => false,
        'has_users'           => false,
        'has_option_sentinel' => false,
    ];
    $optionValues = [];               // siteurl / home etc. accumulated across chunks
    $carry        = '';
    $chunkSize    = 4 * 1024 * 1024;  // 4 MB
    $carryWindow  = 128;              // bytes carried over for boundary matches
    $ioError      = null;             // first read/write failure, if any

    while (!feof($fin)) {
        $buf = fread($fin, $chunkSize);
        if ($buf === false) {
            $ioError = 'read error';
            break;
        }
        if ($buf === '') {
            break;
        }

        // Prepend the previous tail so a pattern split across the boundary
        // is matched as a whole. The carry is already rewritten, so it can
        // never be counted twice by the engine swap.
        $chunk = engine_swap_chunk($carry . $buf, $engineSwapCount);
        is_wp_chunk_scan($chunk, $wpFlags);
        wp_options_chunk_scan($chunk, $optionValues);

        // Always hold back the tail and flush it after the loop. The old
        // code decided "last chunk?" with feof() right after the read, but
        // feof() is still false when a read ends exactly at end-of-file;
        // the loop then exited on the following empty read and the final
        // 128 bytes of the dump were never written.
        $writeLen = max(0, strlen($chunk) - $carryWindow);
        $carry = substr($chunk, $writeLen);
        if ($writeLen > 0 && fwrite($fout, substr($chunk, 0, $writeLen)) !== $writeLen) {
            $ioError = 'write error';
            break;
        }
    }

    // Flush whatever tail is still pending (already scanned and rewritten).
    if ($ioError === null && $carry !== '' && fwrite($fout, $carry) !== strlen($carry)) {
        $ioError = 'write error';
    }

    fclose($fin);
    if (!fclose($fout) && $ioError === null) {
        $ioError = 'close error';
    }

    if ($ioError !== null) {
        // A truncated dump must never be reported as importable.
        fwrite(STDERR, "scan-dbs: WARN $ioError while streaming $sqlPath -> $tmpOutPath; skipping\n");
        @unlink($tmpOutPath);
        continue;
    }

    // --- WordPress identification + content scan finalize -----------------
    $isWp = $wpFlags['has_options']
        && $wpFlags['has_posts']
        && $wpFlags['has_users']
        && $wpFlags['has_option_sentinel'];

    $flags = [];
    if ($isWp) {
        $flags = wp_content_scan_from_values($optionValues, $allowedDomains);
    }

    $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
    $refused = (bool) count($highConfidence);

    $outName   = $dbName . '.sql' . ($refused ? '.flagged' : '');
    $outPath   = $outDir . '/' . $outName;
    $finalPath = $finalPrefix . '/' . $outName;

    if (!@rename($tmpOutPath, $outPath)) {
        fwrite(STDERR, "scan-dbs: WARN failed to rename $tmpOutPath -> $outPath; skipping\n");
        @unlink($tmpOutPath);
        continue;
    }

    $databases[] = [
        'dbname'     => $dbName,
        'size_bytes' => $sizeBytes,
        'engine_changes' => [
            'myisam_to_innodb'           => $engineSwapCount,
            'row_format_dynamic_applied' => 0, // v1.1
            'fulltext_indexes_dropped'   => 0, // v1.1
        ],
        'wp_content_scan' => [
            'is_wordpress' => $isWp,
            'flags'        => $flags,
        ],
        'imported_into_new_server' => !$refused,
        'sanitized_sql_path'       => $refused ? null : $finalPath,
        'flagged_sql_path'         => $refused ? $finalPath : null,
    ];
}

$report = [
    'scanner_version' => SCANNER_VERSION,
    'import_id'       => $importId,
    'databases'       => $databases,
];

// Flag details quote raw option values from the dump, which may not be valid
// UTF-8; substitute bad bytes rather than letting json_encode() return false.
$reportJson = json_encode(
    $report,
    JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
);
if ($reportJson === false) {
    fwrite(STDERR, "scan-dbs: FATAL could not encode report: " . json_last_error_msg() . "\n");
    exit(1);
}
if (file_put_contents($reportPath, $reportJson . "\n") === false) {
    fwrite(STDERR, "scan-dbs: FATAL could not write report to $reportPath\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: done — " . count($databases) . " database(s) processed\n");
exit(0);

// ---- helpers --------------------------------------------------------------

/**
 * Rewrite ENGINE=MyISAM to ENGINE=InnoDB everywhere it appears as a
 * table-options token.
Returns [string $newSql, array $counts], where $counts has the single
 * key 'myisam_to_innodb'.
 *
 * The regex is intentionally narrow:
 *   - case-insensitive (cpmove dumps vary)
 *   - anchored on word boundaries, so ENGINE and MyISAM must each be a
 *     whole word: "ENGINE=MyISAMX" and "XENGINE=MyISAM" are left alone.
 *
 * If PCRE fails, the input is returned unmodified.
 */
function engine_swap(string $sql): array
{
    // Whole-string variant of the streaming helper: same regex, same
    // fallback, with the counter wrapped into the legacy return shape.
    $swapped = 0;
    $result = engine_swap_chunk($sql, $swapped);

    return [$result, ['myisam_to_innodb' => $swapped]];
}

/**
 * Streaming flavour of engine_swap(): rewrites one chunk and adds the
 * number of replacements to $counter so the caller can keep a running
 * total across chunks.
 *
 * @param string $chunk   SQL text to rewrite.
 * @param int    $counter Running replacement count, incremented in place.
 * @return string The rewritten chunk, or $chunk unchanged if PCRE fails.
 */
function engine_swap_chunk(string $chunk, int &$counter): string
{
    $replaceOne = function () use (&$counter) {
        $counter++;
        return 'ENGINE=InnoDB';
    };

    $result = preg_replace_callback('/\bENGINE\s*=\s*MyISAM\b/i', $replaceOne, $chunk);
    if ($result === null) {
        return $chunk;
    }

    return $result;
}

/**
 * Streaming WP-dump fingerprint accumulator.
 *
 * Each of the four flags is sticky: once a chunk sets it to true it is
 * never tested again and never reset. The caller AND-folds the four
 * flags once the whole dump has been read.
 *
 * @param string $chunk SQL text to probe.
 * @param array  $flags Map with keys has_options, has_posts, has_users
 *                      and has_option_sentinel; updated in place.
 */
function is_wp_chunk_scan(string $chunk, array &$flags): void
{
    // flag key => regex whose match turns the flag on
    $probes = [
        'has_options'         => '/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i',
        'has_posts'           => '/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i',
        'has_users'           => '/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i',
        'has_option_sentinel' => "/'siteurl'|'home'|'template'|'stylesheet'/",
    ];

    foreach ($probes as $flagName => $regex) {
        if ($flags[$flagName]) {
            continue; // already established by an earlier chunk
        }
        if (preg_match($regex, $chunk)) {
            $flags[$flagName] = true;
        }
    }
}

/**
 * Streaming siteurl/home extractor. Looks for the literal
 *   'option_name', 'option_value'
 * pair within any chunk that contains an INSERT INTO options statement.
* Stores into the accumulator only if not already populated — we keep the
 * FIRST occurrence (which in practice is the live value; subsequent
 * occurrences are duplicates from re-inserts or trigger noise).
 *
 * Best-effort: a tuple whose option_name/value pair is split across a
 * chunk boundary is not matched. The non-streaming extract_wp_options()
 * is still available for the small-file path if we ever reinstate it.
 *
 * @param string $chunk  SQL text to scan.
 * @param array  $values Accumulator (option_name => option_value) for the
 *                       names siteurl, home, template and stylesheet;
 *                       updated in place.
 */
function wp_options_chunk_scan(string $chunk, array &$values): void
{
    // Match a tuple of the form (anything, 'option_name', 'option_value', ...).
    // String content is "backslash + any char" or "any char except quote
    // and backslash". The two alternatives are mutually exclusive and the
    // quantifiers are possessive, so a value that is cut off at the end of
    // the chunk fails in linear time. The previous form let [^'] also eat
    // the backslash; a truncated serialized value full of \" escapes then
    // backtracked exponentially, hit the PCRE limit, and made
    // preg_match_all() return false, which threw away every option in the
    // chunk, including a siteurl that had already matched.
    $tuplePattern = "/\(\s*[^,]++,\s*'((?:\\\\.|[^'\\\\])*+)'\s*,\s*'((?:\\\\.|[^'\\\\])*+)'/s";

    $matches = [];
    // The return value is deliberately not used as a gate: should PCRE
    // still fail part-way through, whatever rows already landed in
    // $matches (possibly none) are consumed rather than discarded.
    preg_match_all($tuplePattern, $chunk, $matches, PREG_SET_ORDER);
    if (!is_array($matches)) {
        return;
    }

    $wanted = ['siteurl', 'home', 'template', 'stylesheet'];
    foreach ($matches as $m) {
        if (!isset($m[1], $m[2])) {
            continue;
        }
        $name = stripslashes($m[1]);
        if (!in_array($name, $wanted, true)) {
            continue;
        }
        if (isset($values[$name])) {
            continue; // first occurrence wins
        }
        $values[$name] = stripslashes($m[2]);
    }
}

/**
 * Run the siteurl_external_domain check on a pre-built option map, so the
 * streaming path does not have to re-scan the full SQL string.
 *
 * For siteurl and home: the host is extracted with parse_url(); values
 * without a host, "localhost" (any case) and IP literals (including
 * bracketed IPv6 such as [::1]) are skipped; any other host that is not
 * covered by $allowedDomains yields one high-severity flag.
 *
 * @param array $optionValues   Map of option_name => option_value.
 * @param array $allowedDomains List of allowed domains.
 * @return array List of flag dicts with keys severity, code and details;
 *               an empty list means "clean".
 */
function wp_content_scan_from_values(array $optionValues, array $allowedDomains): array
{
    $flags = [];

    foreach (['siteurl', 'home'] as $optName) {
        if (!isset($optionValues[$optName])) {
            continue;
        }
        $val = $optionValues[$optName];
        if (!is_string($val)) {
            continue;
        }

        $host = parse_url($val, PHP_URL_HOST);
        if ($host === null || $host === false || $host === '') {
            continue;
        }

        // localhost / IP literals are not external domains; let the panel
        // handle them on the rewrite-wp-config pass. parse_url() keeps the
        // square brackets around an IPv6 literal, so strip them before
        // validating; otherwise "[::1]" would be flagged as external.
        if (strtolower($host) === 'localhost'
            || filter_var(trim($host, '[]'), FILTER_VALIDATE_IP)
        ) {
            continue;
        }

        if (domain_in_allowlist($host, $allowedDomains)) {
            continue;
        }

        $flags[] = [
            'severity' => 'high',
            'code'     => 'siteurl_external_domain',
            'details'  => sprintf(
                "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
                $optName,
                json_encode($val),
                $host,
                empty($allowedDomains)
                    ? 'NONE; could not discover from cpmove userdata'
                    : implode(', ', $allowedDomains)
            ),
        ];
    }

    return $flags;
}

/**
 * Identify WordPress by the canonical core-table CREATE statements.
 *
 * cPanel exports respect the customer's prefix, so we accept any
 * prefix as long as the three core tables exist in this dump.
 *
 * @param string $sql Full dump text.
 * @return bool True when options, posts and users CREATE TABLEs plus at
 *              least one standard option_name literal are all present.
 */
function is_wordpress_dump(string $sql): bool
{
    $hasOptions = (bool) preg_match('/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i', $sql);
    $hasPosts   = (bool) preg_match('/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i', $sql);
    $hasUsers   = (bool) preg_match('/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i', $sql);

    // Bonus signal: the dump also references the standard wp_options
    // option_names. Cheap to check, drops a few false positives where
    // an app shares table names with WP.
    $optionsSentinel = (bool) preg_match("/'siteurl'|'home'|'template'|'stylesheet'/", $sql);

    return $hasOptions && $hasPosts && $hasUsers && $optionsSentinel;
}

/**
 * Run the WP content scan on a whole dump held in memory. v1.0 ships ONE
 * check:
 *
 *   siteurl_external_domain — wp_options.siteurl or .home points at a
 *   host not in the allow list (cpanel main + addons).
 *
 * v1.1 add: post_content script-injection signature, theme/stylesheet
 * known-malware patterns, user_pass leaked-hash check, Wordfence regex.
 *
 * @param string $sql            Full dump text.
 * @param array  $allowedDomains List of allowed domains.
 * @return array List of flag dicts; an empty list means "clean".
 */
function wp_content_scan(string $sql, array $allowedDomains): array
{
    // Same check as the streaming path; only the way the option map is
    // obtained differs, so delegate instead of duplicating the logic.
    return wp_content_scan_from_values(extract_wp_options($sql), $allowedDomains);
}

/**
 * Pull a map of option_name => option_value from any INSERT into the
 * options table. Returns ['siteurl' => '...', 'home' => '...', ...].
 * When a name occurs more than once, the LAST occurrence wins here.
 *
 * Best-effort — multi-row INSERTs with weird quoting can defeat the
 * regex, in which case we report no values and the scan returns clean.
 * That's acceptable because the panel will still rewrite siteurl on its
 * own pass and any malicious siteurl that survives WILL show up in the
 * customer-facing rendered URL — admin can spot it post-import.
 *
 * @param string $sql Full dump text.
 * @return array Map of option_name => option_value (strings only).
 */
function extract_wp_options(string $sql): array
{
    $map = [];

    // Match INSERT INTO `..options` [(col, col, ...)] VALUES (rows...);
    // The optional column list is matched as a parenthesised group that
    // is followed by VALUES; the statement ends at a ';' at end of line.
    $statementPattern = '/INSERT\s+INTO\s+[`"]?\w*options[`"]?\s*(?:\([^)]*\)\s*)?VALUES\s*(.+?);\s*$/ims';
    if (!preg_match_all($statementPattern, $sql, $stmts)) {
        return $map;
    }

    foreach ($stmts[1] as $body) {
        // Drop the first row's leading "(" and the last row's trailing ")",
        // then split on the "),(" separators between rows.
        $body = trim($body);
        if (str_starts_with($body, '(')) {
            $body = substr($body, 1);
        }
        if (str_ends_with($body, ')')) {
            $body = substr($body, 0, -1);
        }

        $rows = preg_split('/\)\s*,\s*\(/', $body) ?: [];
        foreach ($rows as $row) {
            $cells = parse_sql_row($row);
            // wp_options columns: option_id, option_name, option_value, autoload
            if (count($cells) < 3) {
                continue;
            }
            [, $name, $value] = $cells;
            if (is_string($name) && is_string($value) && $name !== '') {
                $map[$name] = $value;
            }
        }
    }

    return $map;
}

/**
 * Parse one row of a MySQL INSERT VALUES tuple — comma-separated,
 * strings single-quoted with backslash escapes.
 *
 * Not bulletproof (no comment handling, no DOUBLE-QUOTE strings) but
 * good enough for cpmove dumps, which mysqldump produces in a
 * predictable format.
 *
 * @param string $row Row text without the surrounding parentheses.
 * @return array List of cells: decoded strings for quoted values, null
 *               for a bare NULL, and the trimmed token text otherwise.
 */
function parse_sql_row(string $row): array
{
    $cells = [];
    $i = 0;
    $n = strlen($row);

    while ($i < $n) {
        // Skip leading whitespace + commas.
        while ($i < $n && (ctype_space($row[$i]) || $row[$i] === ',')) {
            $i++;
        }
        if ($i >= $n) {
            break;
        }

        if ($row[$i] !== "'") {
            // Bareword / number / NULL — read until next comma.
            $start = $i;
            while ($i < $n && $row[$i] !== ',') {
                $i++;
            }
            $tok = trim(substr($row, $start, $i - $start));
            $cells[] = (strcasecmp($tok, 'NULL') === 0) ? null : $tok;
            continue;
        }

        // Quoted string: consume up to the matching unescaped quote.
        $i++;
        $buf = '';
        while ($i < $n) {
            $cc = $row[$i];

            if ($cc === '\\' && $i + 1 < $n) {
                // Backslash escape: translate the common control escapes,
                // otherwise keep the escaped character itself.
                $next = $row[$i + 1];
                $buf .= match ($next) {
                    'n' => "\n",
                    't' => "\t",
                    'r' => "\r",
                    '0' => "\0",
                    default => $next,
                };
                $i += 2;
                continue;
            }

            if ($cc === "'") {
                // MySQL `''` -> literal '
                if ($i + 1 < $n && $row[$i + 1] === "'") {
                    $buf .= "'";
                    $i += 2;
                    continue;
                }
                $i++;
                break;
            }

            $buf .= $cc;
            $i++;
        }
        $cells[] = $buf;
    }

    return $cells;
}

/**
 * Discover the user's allowed-domain set by reading the cpmove
 * userdata. cPanel writes (USER / DOMAIN placeholders reconstructed;
 * the originals were lost from this comment):
 *   cpmove-USER/userdata/DOMAIN — per-domain config
 *   cpmove-USER/userdata/main   — the main domain
 *   cpmove-USER/addons          — addon-domain list
 *   cpmove-USER/sds             — subdomain list
 *
 * Best-effort.
If we can't find any, the siteurl check still runs but
 * will flag everything as external — surface up to admin.
 *
 * Sources read by this function: file/dir names under each
 * cpmove-* userdata directory that look like domains, the main_domain
 * line of userdata/main, and the left-hand side of each line of the
 * addons file. The sds file is not read here.
 *
 * @param string $extractDir Directory containing the extracted cpmove-* tree.
 * @param string $username   Account name; currently unused by the lookup.
 * @return array De-duplicated list of lower-cased domain names.
 */
function collect_allowed_domains(string $extractDir, string $username): array
{
    $domains = [];

    // glob() returns false on error; fall back to an empty list like the
    // other glob call sites in this file so foreach never sees a bool.
    foreach (glob($extractDir . '/cpmove-*/userdata') ?: [] as $userdataDir) {
        if (!is_dir($userdataDir)) {
            continue;
        }

        // Entries under userdata are keyed by the domain name.
        foreach (scandir($userdataDir) ?: [] as $entry) {
            if ($entry === '.' || $entry === '..' || $entry === 'main') {
                continue;
            }
            if (preg_match('/^[a-z0-9._-]+\.[a-z]{2,}$/i', $entry)) {
                $domains[] = strtolower($entry);
            }
        }

        // userdata/main is a YAML-ish file with a "main_domain: NAME" line.
        $mainFile = $userdataDir . '/main';
        if (is_file($mainFile)) {
            $content = file_get_contents($mainFile);
            if ($content !== false && preg_match('/^main_domain:\s*(\S+)/m', $content, $m)) {
                // Tolerate a quoted YAML scalar.
                $mainDomain = strtolower(trim($m[1], "\"'"));
                if ($mainDomain !== '') {
                    $domains[] = $mainDomain;
                }
            }
        }
    }

    foreach (glob($extractDir . '/cpmove-*/addons') ?: [] as $addonsFile) {
        if (!is_file($addonsFile)) {
            continue;
        }
        $content = file_get_contents($addonsFile);
        if ($content === false) {
            continue;
        }
        // cPanel writes "addon.tld=parent.tld" lines; keep the left side.
        foreach (preg_split('/\R/', $content) ?: [] as $line) {
            if (preg_match('/^([a-z0-9._-]+\.[a-z]{2,})/i', $line, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    return array_values(array_unique($domains));
}

/**
 * True if $host is in the allow-list, including subdomain matches.
 * Comparison is case-insensitive and ignores a trailing root dot
 * ("example.com." is treated as "example.com"). Empty or non-string
 * allow-list entries are skipped and never match anything.
 *
 * e.g. allowed=['example.com'], host='www.example.com' -> true.
 *      allowed=['example.com'], host='malicious.tld'   -> false.
 *      allowed=[],              host='*'               -> false (refuse-all).
 *
 * @param string $host    Host name to test.
 * @param array  $allowed List of allowed domain names.
 * @return bool
 */
function domain_in_allowlist(string $host, array $allowed): bool
{
    if (empty($allowed)) {
        return false;
    }

    $host = rtrim(strtolower($host), '.');
    if ($host === '') {
        return false;
    }

    foreach ($allowed as $d) {
        if (!is_string($d)) {
            continue;
        }
        $d = rtrim(strtolower($d), '.');
        if ($d === '') {
            // An empty entry would turn the suffix test into "ends with a
            // dot", which is not a domain match.
            continue;
        }
        if ($host === $d || str_ends_with($host, '.' . $d)) {
            return true;
        }
    }

    return false;
}