400 lines
15 KiB
PHP
400 lines
15 KiB
PHP
|
|
<?php
|
||
|
|
/**
 * scan-dbs.php — SQL dump engine swap + WordPress content scan.
 *
 * v1.0 scope:
 *   - Walk every cpmove-USER/mysql/DBNAME.sql under the extract dir.
 *   - ALWAYS: regex-rewrite ENGINE=MyISAM -> ENGINE=InnoDB.
 *   - WordPress identification: presence of wp_options/wp_posts/wp_users
 *     CREATE TABLEs (or prefix-variants where prefix != "wp_").
 *   - WP content scan: ONE check — siteurl_external_domain — comparing
 *     wp_options.siteurl / wp_options.home against the cpanel userdata's
 *     main_domain + addon-domain list.
 *   - If any high-confidence flag fires, the .sql file is written with
 *     a .flagged suffix and imported_into_new_server=false.
 *   - Otherwise the rewritten .sql lands in the --out directory
 *     (e.g. /tmp/sanitized/mysql/).
 *
 * v1.1 will grow the WP scan check set (post_content script-injection,
 * user_pass leaked-hash, Wordfence regex). See CONTRIBUTING.md for how
 * to add a check.
 *
 * Usage:
 *   scan-dbs.php --extract DIR --out DIR --report OUT.json
 *                --import-id ID --username USER [--final-prefix PATH]
 *
 * Exit codes:
 *   0 on success (regardless of flags); 1 fatal; 2 usage.
 *
 * NOTE: this docblock must not contain the literal sequence "* /"
 * (without the space) anywhere — PHP closes the C-style comment at
 * that token and parses the rest as code. This bit us once on
 * the cpmove-USER /mysql glob path.
 */
|
||
|
|
|
||
|
|
require __DIR__ . '/lib/safety-net.php';
|
||
|
|
|
||
|
|
const SCANNER_VERSION = '1.0.0';
|
||
|
|
|
||
|
|
// ---- CLI entry point -------------------------------------------------------
//
// Parses options, rewrites every discovered dump, runs the WordPress scan,
// writes the sanitized/flagged .sql files plus the JSON report, and exits
// with 0 (success), 1 (fatal I/O or encoding failure) or 2 (usage error).

$usage = "usage: scan-dbs.php --extract DIR --out DIR --report OUT.json --import-id ID --username USER [--final-prefix PATH]\n";

$opts = getopt('', ['extract:', 'out:', 'report:', 'import-id:', 'username:', 'final-prefix:']) ?: [];
foreach (['extract', 'out', 'report', 'import-id', 'username'] as $k) {
    // getopt() yields an array when an option is repeated; that is a usage
    // error too, otherwise rtrim() below would throw a TypeError.
    if (!isset($opts[$k]) || !is_string($opts[$k])) {
        fwrite(STDERR, $usage);
        exit(2);
    }
}
if (isset($opts['final-prefix']) && !is_string($opts['final-prefix'])) {
    fwrite(STDERR, $usage);
    exit(2);
}

$extractDir = rtrim($opts['extract'], '/');
$outDir     = rtrim($opts['out'], '/');
$reportPath = $opts['report'];
$importId   = $opts['import-id'];
$username   = $opts['username'];
// --final-prefix is the path .sql files will live at AFTER the rsync to
// /host/sanitized/<importid>/mysql/. We record that path in the report
// so the panel doesn't have to translate /tmp/... paths.
$finalPrefix = isset($opts['final-prefix']) ? rtrim($opts['final-prefix'], '/') : $outDir;

// The warning is suppressed because the directory may already exist; the
// is_dir() check afterwards is what decides whether we can continue.
@mkdir($outDir, 0750, true);
if (!is_dir($outDir)) {
    fwrite(STDERR, "scan-dbs: FATAL cannot create output directory $outDir\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: starting (extract=$extractDir, out=$outDir)\n");

// -- find all cpmove-*/mysql/*.sql dumps -----------------------------------

$sqlFiles = [];
foreach (glob($extractDir . '/cpmove-*/mysql/*.sql') ?: [] as $f) {
    if (is_file($f)) {
        $sqlFiles[] = $f;
    }
}
// Some cpmove layouts use cpmove-<user>/mysql/<db>.create + <db>.sql;
// glob above already covers <db>.sql which is what we care about.

if (empty($sqlFiles)) {
    // Not fatal: an account without databases still gets an (empty) report.
    fwrite(STDERR, "scan-dbs: no .sql dumps found under $extractDir/cpmove-*/mysql/\n");
}

// -- discover the user's allowed-domain list from the cpmove userdata -----

$allowedDomains = collect_allowed_domains($extractDir, $username);
fwrite(STDERR, "scan-dbs: allowed domains for siteurl check: "
    . (empty($allowedDomains) ? '(none discovered)' : implode(', ', $allowedDomains))
    . "\n");

$databases = [];

foreach ($sqlFiles as $sqlPath) {
    $dbName = basename($sqlPath, '.sql');
    fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");

    $sizeBytes = filesize($sqlPath) ?: 0;
    $sql = file_get_contents($sqlPath);
    if ($sql === false) {
        fwrite(STDERR, "scan-dbs: WARN failed to read $sqlPath; skipping\n");
        continue;
    }

    // --- ENGINE SWAP (always applied) -------------------------------------

    [$rewritten, $engineCounts] = engine_swap($sql);
    // Only the rewritten copy is used from here on; drop the original so a
    // large dump is not held in memory twice during the scan.
    unset($sql);

    // --- WordPress identification + content scan -------------------------

    $isWp = is_wordpress_dump($rewritten);
    $flags = [];
    if ($isWp) {
        $flags = wp_content_scan($rewritten, $allowedDomains);
    }

    $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
    $refused = (bool) count($highConfidence);

    $outName   = $dbName . '.sql' . ($refused ? '.flagged' : '');
    $outPath   = $outDir . '/' . $outName;
    $finalPath = $finalPrefix . '/' . $outName;

    // A short or failed write must never be reported as an importable dump.
    if (file_put_contents($outPath, $rewritten) !== strlen($rewritten)) {
        fwrite(STDERR, "scan-dbs: FATAL failed to write $outPath\n");
        exit(1);
    }

    $databases[] = [
        'dbname'     => $dbName,
        'size_bytes' => $sizeBytes,
        'engine_changes' => [
            'myisam_to_innodb'           => $engineCounts['myisam_to_innodb'],
            'row_format_dynamic_applied' => 0, // v1.1
            'fulltext_indexes_dropped'   => 0, // v1.1
        ],
        'wp_content_scan' => [
            'is_wordpress' => $isWp,
            'flags'        => $flags,
        ],
        'imported_into_new_server' => !$refused,
        'sanitized_sql_path'       => $refused ? null : $finalPath,
        'flagged_sql_path'         => $refused ? $finalPath : null,
    ];
}

$report = [
    'scanner_version' => SCANNER_VERSION,
    'import_id'       => $importId,
    'databases'       => $databases,
];

// Dump-derived strings (db names, flag details) may carry invalid UTF-8;
// substitute instead of letting json_encode() fail and emit an empty report.
$reportJson = json_encode(
    $report,
    JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
);
if ($reportJson === false) {
    fwrite(STDERR, "scan-dbs: FATAL could not encode report: " . json_last_error_msg() . "\n");
    exit(1);
}
if (file_put_contents($reportPath, $reportJson . "\n") === false) {
    fwrite(STDERR, "scan-dbs: FATAL failed to write report $reportPath\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: done — " . count($databases) . " database(s) processed\n");
exit(0);
|
||
|
|
|
||
|
|
// ---- helpers --------------------------------------------------------------
|
||
|
|
|
||
|
|
/**
 * Rewrite ENGINE=MyISAM to ENGINE=InnoDB wherever it appears as a
 * table-options token.
 *
 * Matching rules:
 *   - case-insensitive, optional whitespace around "=" (cpmove dumps vary)
 *   - word boundaries on both ends, so MYENGINE=MyISAM or
 *     ENGINE=MyISAM2 is left alone
 *   - an occurrence on a line that starts with INSERT or REPLACE is row
 *     data, not a table option, and is left untouched. Word boundaries
 *     alone do not protect a TEXT value containing the literal string
 *     "ENGINE=MyISAM", so the line check is what keeps customer content
 *     intact.
 *
 * @param string $sql Full text of one SQL dump.
 * @return array [string $newSql, array $counts] where $counts is
 *               ['myisam_to_innodb' => int]. If PCRE fails the dump is
 *               returned unmodified with a zero count and a warning is
 *               written to STDERR.
 */
function engine_swap(string $sql): array {
    $count = 0;
    $sqlLen = strlen($sql);

    $rewritten = preg_replace_callback(
        '/\bENGINE\s*=\s*MyISAM\b/i',
        static function (array $m) use ($sql, $sqlLen, &$count): string {
            [$token, $at] = $m[0];

            // Locate the start of the line holding this match. The negative
            // offset makes strrpos() search backwards from $at - 1 without
            // copying the (potentially huge) dump.
            $newline = $at > 0 ? strrpos($sql, "\n", $at - 1 - $sqlLen) : false;
            $lineStart = $newline === false ? 0 : $newline + 1;

            // \G anchors the check at $lineStart.
            if (preg_match('/\G[ \t]*(?:INSERT|REPLACE)\b/i', $sql, $unused, 0, $lineStart) === 1) {
                return $token; // row data: keep byte-for-byte
            }

            $count++;
            return 'ENGINE=InnoDB';
        },
        $sql,
        -1,
        $replacements,
        PREG_OFFSET_CAPTURE
    );

    if ($rewritten === null) {
        // Callbacks may already have run before the failure; the count must
        // describe the text we actually return.
        fwrite(STDERR, 'scan-dbs: WARN engine swap failed (' . preg_last_error_msg() . "); dump left unmodified\n");
        return [$sql, ['myisam_to_innodb' => 0]];
    }

    return [$rewritten, ['myisam_to_innodb' => $count]];
}
|
||
|
|
|
||
|
|
/**
 * Identify WordPress by the canonical core-table CREATE statements.
 *
 * cPanel exports respect the customer's prefix, so any prefix is accepted
 * as long as a "...options", a "...posts" and a "...users" table are all
 * created in this dump. "CREATE TABLE IF NOT EXISTS" and irregular
 * whitespace are accepted as well: a dump that dodges identification
 * skips the content scan entirely, so this check errs on the side of
 * matching.
 *
 * As an extra signal the dump must also mention at least one standard
 * wp_options option_name ('siteurl', 'home', 'template', 'stylesheet');
 * that drops a few false positives where another app shares table names
 * with WP.
 *
 * @param string $sql Full text of one SQL dump.
 * @return bool True when all three CREATE TABLEs and the sentinel are present.
 */
function is_wordpress_dump(string $sql): bool {
    foreach (['options', 'posts', 'users'] as $suffix) {
        $pattern = '/CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?[`"]?\w*' . $suffix . '[`"]?\s*\(/i';
        if (preg_match($pattern, $sql) !== 1) {
            return false;
        }
    }

    return preg_match("/'siteurl'|'home'|'template'|'stylesheet'/", $sql) === 1;
}
|
||
|
|
|
||
|
|
/**
 * Run the WP content scan. v1.0 ships ONE check:
 *
 *   siteurl_external_domain — wp_options.siteurl or .home points at a
 *   host not in the allow list (cpanel main + addons).
 *
 * Values whose host cannot be parsed, "localhost" (any case) and IP
 * literals (IPv4, or IPv6 in the bracketed form parse_url() returns) are
 * skipped; the panel handles them on the rewrite-wp-config pass.
 *
 * v1.1 add: post_content script-injection signature, theme/stylesheet
 * known-malware patterns, user_pass leaked-hash check, Wordfence regex.
 *
 * @param string $sql            Full text of one SQL dump.
 * @param array  $allowedDomains Domains the account legitimately owns.
 * @return array List of flag dicts (severity, code, details); an empty
 *               array means "clean."
 */
function wp_content_scan(string $sql, array $allowedDomains): array {
    $flags = [];

    // option_name => option_value as found in the dump's options INSERTs.
    $optionValues = extract_wp_options($sql);

    $allowedLabel = empty($allowedDomains)
        ? 'NONE; could not discover from cpmove userdata'
        : implode(', ', $allowedDomains);

    foreach (['siteurl', 'home'] as $optName) {
        $val = $optionValues[$optName] ?? null;
        if (!is_string($val)) {
            continue;
        }

        $host = parse_url($val, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            continue;
        }

        // localhost / IP literals are not external domains. parse_url()
        // keeps the brackets around IPv6 literals, which FILTER_VALIDATE_IP
        // rejects, so strip them before validating.
        $isIpLiteral = filter_var(trim($host, '[]'), FILTER_VALIDATE_IP) !== false;
        if (strcasecmp($host, 'localhost') === 0 || $isIpLiteral) {
            continue;
        }

        if (domain_in_allowlist($host, $allowedDomains)) {
            continue;
        }

        // The host comes from untrusted dump data. Keep the details string
        // valid UTF-8 so the JSON report can always be encoded.
        $shownHost = preg_match('//u', $host) === 1
            ? $host
            : (preg_replace('/[^\x20-\x7E]/', '?', $host) ?? '?');

        $flags[] = [
            'severity' => 'high',
            'code'     => 'siteurl_external_domain',
            'details'  => sprintf(
                "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
                $optName,
                json_encode($val, JSON_INVALID_UTF8_SUBSTITUTE),
                $shownHost,
                $allowedLabel
            ),
        ];
    }

    return $flags;
}
|
||
|
|
|
||
|
|
/**
 * Pull a map of option_name => option_value from every INSERT into an
 * options table. Returns ['siteurl' => '...', 'home' => '...', ...].
 *
 * How statements are found and read:
 *   - The statement header (INSERT [IGNORE] INTO / REPLACE INTO
 *     `<prefix>options` [(cols)] VALUES) must start a line, so SQL text
 *     quoted inside another table's row data is not mistaken for a real
 *     statement.
 *   - When a column list is present, option_name / option_value are
 *     located by name; otherwise (or when those names are absent) the
 *     stock wp_options order option_id, option_name, option_value,
 *     autoload is assumed.
 *   - Row tuples are split with a quote-aware scanner, so a value that
 *     contains "),(" or ";" does not break a row apart.
 *
 * Best-effort. If several options tables carry the same option_name, the
 * last row read wins. If the header scan fails inside PCRE, a warning is
 * written to STDERR and an empty map is returned, which makes the content
 * scan report clean.
 *
 * @param string $sql Full text of one SQL dump.
 * @return array Map of option_name (string) => option_value (string).
 */
function extract_wp_options(string $sql): array {
    $map = [];

    $found = preg_match_all(
        '/^[ \t]*(?:INSERT(?:\s+IGNORE)?|REPLACE)\s+INTO\s+[`"]?\w*options[`"]?\s*(?:\(([^)]*)\)\s*)?VALUES\s*/im',
        $sql,
        $stmts,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );
    if ($found === false) {
        fwrite(STDERR, 'scan-dbs: WARN options INSERT scan failed (' . preg_last_error_msg() . ")\n");
        return $map;
    }

    foreach ($stmts as $stmt) {
        [$header, $headerAt] = $stmt[0];

        // Default column positions: option_id, option_name, option_value, autoload.
        $nameIdx = 1;
        $valueIdx = 2;
        $minCells = 3;

        // An explicit column list overrides the positional assumption.
        $columnList = $stmt[1][0] ?? '';
        if ($columnList !== '' && preg_match_all('/\w+/', $columnList, $colMatch)) {
            $cols = array_map('strtolower', $colMatch[0]);
            $n = array_search('option_name', $cols, true);
            $v = array_search('option_value', $cols, true);
            if ($n !== false && $v !== false) {
                $nameIdx = $n;
                $valueIdx = $v;
                $minCells = max($n, $v) + 1;
            }
        }

        $tuples = wp_options_split_tuples($sql, $headerAt + strlen($header));
        foreach ($tuples as $tuple) {
            $cells = parse_sql_row($tuple);
            if (count($cells) < $minCells) {
                continue;
            }
            $name = $cells[$nameIdx];
            $value = $cells[$valueIdx];
            if (is_string($name) && is_string($value) && $name !== '') {
                $map[$name] = $value;
            }
        }
    }

    return $map;
}

/**
 * Split the VALUES list of one INSERT into its row tuples.
 *
 * Scans $sql from $offset up to the first ";" outside a single-quoted
 * string (or end of input) and returns the text between each top-level
 * "(" and its matching ")". Quoted strings are skipped whole, honouring
 * backslash escapes and the doubled '' quote.
 *
 * @param string $sql    Full dump text (scanned in place, never copied).
 * @param int    $offset Byte offset just past the VALUES keyword.
 * @return array List of tuple bodies without their enclosing parentheses.
 */
function wp_options_split_tuples(string $sql, int $offset): array {
    $tuples = [];
    $len = strlen($sql);
    $pos = $offset;
    $depth = 0;
    $tupleStart = 0;

    while ($pos < $len) {
        // Jump to the next character that can change scanner state.
        $pos += strcspn($sql, "'();", $pos);
        if ($pos >= $len) {
            break;
        }
        $ch = $sql[$pos];

        if ($ch === "'") {
            // Skip the whole string literal.
            $pos++;
            while ($pos < $len) {
                $pos += strcspn($sql, "'\\", $pos);
                if ($pos >= $len) {
                    break;
                }
                if ($sql[$pos] === '\\') {
                    $pos += 2; // backslash + escaped character
                    continue;
                }
                if ($pos + 1 < $len && $sql[$pos + 1] === "'") {
                    $pos += 2; // '' is an escaped quote, not the terminator
                    continue;
                }
                $pos++; // closing quote
                break;
            }
            continue;
        }

        if ($ch === ';') {
            break; // statement terminator
        }

        if ($ch === '(') {
            if ($depth === 0) {
                $tupleStart = $pos + 1;
            }
            $depth++;
        } elseif ($depth > 0 && --$depth === 0) {
            $tuples[] = substr($sql, $tupleStart, $pos - $tupleStart);
        }
        $pos++;
    }

    return $tuples;
}
|
||
|
|
|
||
|
|
/**
 * Parse one row of a MySQL INSERT VALUES tuple — comma-separated,
 * strings single-quoted with backslash escapes — into a list of cells.
 *
 * Quoted strings are decoded following the MySQL string-literal escapes:
 * \n, \t, \r, \0, \b (backspace) and \Z (0x1A) map to their control
 * characters, \% and \_ keep their backslash, and any other escaped
 * character stands for itself. A doubled '' is a literal quote. Unquoted
 * tokens are returned trimmed, with a bare NULL (any case) becoming PHP
 * null.
 *
 * Not bulletproof (no comment handling, no DOUBLE-QUOTE strings) but
 * good enough for cpmove dumps, which mysqldump produces in a
 * predictable format.
 *
 * @param string $row Tuple body without its enclosing parentheses.
 * @return array List of cells, each a string or null.
 */
function parse_sql_row(string $row): array {
    $cells = [];
    $len = strlen($row);
    $pos = 0;

    while ($pos < $len) {
        // Skip separators: commas and the whitespace ctype_space() accepts.
        $pos += strspn($row, ", \t\n\r\v\f", $pos);
        if ($pos >= $len) {
            break;
        }

        if ($row[$pos] !== "'") {
            // Bareword / number / NULL — read until the next comma.
            $end = strpos($row, ',', $pos);
            if ($end === false) {
                $end = $len;
            }
            $tok = trim(substr($row, $pos, $end - $pos));
            $cells[] = (strcasecmp($tok, 'NULL') === 0) ? null : $tok;
            $pos = $end;
            continue;
        }

        // Quoted string: copy plain runs in one go, decode escapes one by one.
        $pos++;
        $buf = '';
        while ($pos < $len) {
            $run = strcspn($row, "'\\", $pos);
            if ($run > 0) {
                $buf .= substr($row, $pos, $run);
                $pos += $run;
                if ($pos >= $len) {
                    break; // unterminated string: keep what we have
                }
            }

            if ($row[$pos] === '\\') {
                if ($pos + 1 >= $len) {
                    // Lone trailing backslash: keep it literally.
                    $buf .= '\\';
                    $pos++;
                    continue;
                }
                $next = $row[$pos + 1];
                $buf .= match ($next) {
                    'n' => "\n",
                    't' => "\t",
                    'r' => "\r",
                    '0' => "\0",
                    'b' => "\x08",
                    'Z' => "\x1a",
                    '%' => '\\%',
                    '_' => '\\_',
                    default => $next,
                };
                $pos += 2;
                continue;
            }

            // A quote: either the doubled '' escape or the closing quote.
            if ($pos + 1 < $len && $row[$pos + 1] === "'") {
                $buf .= "'";
                $pos += 2;
                continue;
            }
            $pos++;
            break;
        }
        $cells[] = $buf;
    }

    return $cells;
}
|
||
|
|
|
||
|
|
/**
 * Discover the allowed-domain set by reading the cpmove userdata under
 * $extractDir. Sources, in the order they are read:
 *   cpmove-<user>/userdata/<domain>   — entry names that look like a domain
 *   cpmove-<user>/userdata/main       — the "main_domain: <d>" line
 *   cpmove-<user>/addons              — "addon.tld=parent.tld" lines
 *                                       (the left-hand side is taken)
 *
 * Subdomains are not collected here; domain_in_allowlist() accepts any
 * subdomain of a collected domain.
 *
 * Best-effort. If nothing is found, the siteurl check still runs but
 * will flag everything as external — surface up to admin.
 *
 * @param string $extractDir Directory containing the extracted cpmove-* trees.
 * @param string $username   Currently unused: every cpmove-* tree under
 *                           $extractDir is read regardless of user.
 * @return array De-duplicated list of lower-cased domain names.
 */
function collect_allowed_domains(string $extractDir, string $username): array {
    $domains = [];

    // glob() may return false on error; treat that like "no matches".
    foreach (glob($extractDir . '/cpmove-*/userdata') ?: [] as $userdataDir) {
        if (!is_dir($userdataDir)) {
            continue;
        }

        foreach (scandir($userdataDir) ?: [] as $entry) {
            if ($entry === '.' || $entry === '..' || $entry === 'main') {
                continue;
            }
            // userdata/<domain> is a file or dir keyed by the domain.
            if (preg_match('/^[a-z0-9._-]+\.[a-z]{2,}$/i', $entry)) {
                $domains[] = strtolower($entry);
            }
        }

        // userdata/main is a YAML-ish file with main_domain: <d>
        $mainFile = $userdataDir . '/main';
        if (!is_file($mainFile)) {
            continue;
        }
        $content = file_get_contents($mainFile);
        if ($content !== false && preg_match('/^main_domain:\s*(\S+)/m', $content, $m)) {
            $domains[] = strtolower($m[1]);
        }
    }

    foreach (glob($extractDir . '/cpmove-*/addons') ?: [] as $addonsFile) {
        if (!is_file($addonsFile)) {
            continue;
        }
        $content = file_get_contents($addonsFile);
        if ($content === false) {
            continue;
        }
        // cPanel writes "addon.tld=parent.tld" lines.
        foreach (preg_split('/\R/', $content) ?: [] as $line) {
            if (preg_match('/^([a-z0-9._-]+\.[a-z]{2,})/i', $line, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    return array_values(array_unique($domains));
}
|
||
|
|
|
||
|
|
/**
 * True if $host is in the allow-list, including subdomain matches.
 *
 * Comparison is case-insensitive and ignores a trailing root dot, so the
 * fully-qualified form "www.example.com." is treated like
 * "www.example.com". Empty or non-string allow-list entries are ignored;
 * otherwise an empty entry would match any host that ends in a dot.
 *
 * e.g. allowed=['example.com'], host='www.example.com' -> true.
 *      allowed=['example.com'], host='notexample.com'  -> false.
 *      allowed=['example.com'], host='malicious.tld'   -> false.
 *      allowed=[],              host='*'               -> false (refuse-all).
 *
 * @param string $host    Host name to test.
 * @param array  $allowed Domains the account legitimately owns.
 * @return bool
 */
function domain_in_allowlist(string $host, array $allowed): bool {
    $host = rtrim(strtolower($host), '.');
    if ($host === '') {
        return false;
    }

    foreach ($allowed as $d) {
        if (!is_string($d)) {
            continue;
        }
        $d = rtrim(strtolower($d), '.');
        if ($d === '') {
            continue;
        }
        // Exact match, or a subdomain: the "." prefix keeps
        // "notexample.com" from matching "example.com".
        if ($host === $d || str_ends_with($host, '.' . $d)) {
            return true;
        }
    }

    return false;
}
|