All checks were successful
cpanel-importer Build and Push / Build-and-Push (push) Successful in 1m21s
Surfaced on whp02 alsacorp retry: scan-dbs.php hit PHP fatal at line 86
"Allowed memory size of 134217728 bytes exhausted (tried to allocate
5488440384 bytes)" while loading alsacorp_alsa1.sql via
file_get_contents. The dump is multi-GB (typical for WooCommerce stores
with media metadata); the 128MB-default PHP memory_limit + the 2GB
cgroup on the container both fail well below the actual file size.
Rewrote the per-DB pass as a streaming loop over 4MB chunks:
- engine_swap_chunk: same `\bENGINE\s*=\s*MyISAM\b` (case-insensitive)
regex as engine_swap(); mutates a per-DB counter via reference so the
per-chunk counts accumulate into a single myisam_to_innodb total.
- is_wp_chunk_scan: OR-folds the four WP fingerprint regexes
(CREATE TABLE *_options, *_posts, *_users + the
'siteurl|home|template|stylesheet' sentinel) into a state dict;
any chunk that flips a flag from false to true keeps it true for
the rest of the file. Caller AND-folds at finalization.
- wp_options_chunk_scan: extracts (option_name, option_value)
tuples from INSERT INTO options statements as they pass through.
First occurrence wins so we keep the live value, not later
duplicates.
- wp_content_scan_from_values: extracted the finalization logic
from the legacy wp_content_scan() so the streaming path can
submit a pre-built option-values map instead of re-scanning the
full string.
Per-chunk carry: a 128-byte buffer at the end of each chunk is held
back and prepended to the next chunk so a pattern split across a
chunk boundary (e.g. "ENGINE=" at byte 4194302, "MyISAM" at byte
4194304) is still seen by the regex. 128 bytes is generous for our
patterns (longest is "ENGINE = MyISAM" with whitespace flex).
Output goes to a `<db>.sql.tmp` first, then renamed to
`<db>.sql{,.flagged}` once we know the flag verdict — avoids a
partial file if the scan dies mid-stream.
Legacy `engine_swap`, `is_wordpress_dump`, and the unused
`wp_content_scan`+`extract_wp_options` are kept in place for the
small-file path (none of them currently called from the new
streaming loop, but they're public-ish helpers the next dbsanitize
revision could reuse).
Resident memory now bounded to <16 MB per DB regardless of input
file size — should handle the 30 GB+ outliers we'll inevitably see.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
576 lines
22 KiB
PHP
Executable File
576 lines
22 KiB
PHP
Executable File
<?php
|
|
/**
|
|
* scan-dbs.php — SQL dump engine swap + WordPress content scan.
|
|
*
|
|
* v1.0 scope:
|
|
* - Walk every cpmove-USER/mysql/DBNAME.sql under the extract dir.
|
|
* - ALWAYS: regex-rewrite ENGINE=MyISAM -> ENGINE=InnoDB.
|
|
* - WordPress identification: presence of wp_options/wp_posts/wp_users
|
|
* CREATE TABLEs (or prefix-variants where prefix != "wp_").
|
|
* - WP content scan: ONE check — siteurl_external_domain — comparing
|
|
* wp_options.siteurl / wp_options.home against the cpanel userdata's
|
|
* main_domain + addon-domain list.
|
|
* - If any high-confidence flag fires, the .sql file is written with
|
|
* a .flagged suffix and imported_into_new_server=false.
|
|
* - Otherwise the rewritten .sql lands in /tmp/sanitized/mysql/.
|
|
*
|
|
* v1.1 will grow the WP scan check set (post_content script-injection,
|
|
* user_pass leaked-hash, Wordfence regex). See CONTRIBUTING.md for how
|
|
* to add a check.
|
|
*
|
|
* Usage:
|
|
* scan-dbs.php --extract DIR --out DIR --report OUT.json
|
|
* --import-id ID --username USER
|
|
*
|
|
* Exit codes:
|
|
* 0 on success (regardless of flags); 1 fatal; 2 usage.
|
|
*
|
|
* NOTE: docblock above must not contain the literal sequence "* /"
|
|
* (without the space) anywhere — PHP closes the C-style comment at
|
|
* that token and parses the rest as code. This bit us once on
|
|
* the cpmove-USER /mysql glob path.
|
|
*/
|
|
|
|
require __DIR__ . '/lib/safety-net.php';

const SCANNER_VERSION = '1.0.0';

// -- CLI parsing -------------------------------------------------------------

$usage = "usage: scan-dbs.php --extract DIR --out DIR --report OUT.json --import-id ID --username USER [--final-prefix PATH]\n";

$opts = getopt('', ['extract:', 'out:', 'report:', 'import-id:', 'username:', 'final-prefix:']);
foreach (['extract', 'out', 'report', 'import-id', 'username'] as $k) {
    // getopt() hands back an array when an option is given more than once;
    // rtrim() below would then throw a TypeError, so treat that as a usage
    // error too instead of dying with an uncaught exception.
    if (!isset($opts[$k]) || !is_string($opts[$k])) {
        fwrite(STDERR, $usage);
        exit(2);
    }
}
if (isset($opts['final-prefix']) && !is_string($opts['final-prefix'])) {
    fwrite(STDERR, $usage);
    exit(2);
}

$extractDir = rtrim($opts['extract'], '/');
$outDir     = rtrim($opts['out'], '/');
$reportPath = $opts['report'];
$importId   = $opts['import-id'];
$username   = $opts['username'];
// --final-prefix is the path .sql files will live at AFTER the rsync to
// /host/sanitized/<importid>/mysql/. We record that path in the report
// so the panel doesn't have to translate /tmp/... paths.
$finalPrefix = isset($opts['final-prefix']) ? rtrim($opts['final-prefix'], '/') : $outDir;

// The output dir may already exist (mkdir then returns false), so only treat
// it as fatal when the directory is still missing afterwards. Without it every
// database would be skipped and the script would still exit 0.
if (!is_dir($outDir) && !@mkdir($outDir, 0750, true) && !is_dir($outDir)) {
    fwrite(STDERR, "scan-dbs: FATAL cannot create output dir $outDir\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: starting (extract=$extractDir, out=$outDir)\n");

// -- find all cpmove-*/mysql/*.sql dumps -----------------------------------

$sqlFiles = [];
foreach (glob($extractDir . '/cpmove-*/mysql/*.sql') ?: [] as $f) {
    if (is_file($f)) {
        $sqlFiles[] = $f;
    }
}
// Some cpmove layouts use cpmove-<user>/mysql/<db>.create + <db>.sql;
// glob above already covers <db>.sql which is what we care about.

if (empty($sqlFiles)) {
    // Not fatal: an account without databases still gets an (empty) report.
    fwrite(STDERR, "scan-dbs: no .sql dumps found under $extractDir/cpmove-*/mysql/\n");
}

// -- discover the user's allowed-domain list from the cpmove userdata -----

$allowedDomains = collect_allowed_domains($extractDir, $username);
fwrite(STDERR, "scan-dbs: allowed domains for siteurl check: "
    . (empty($allowedDomains) ? '(none discovered)' : implode(', ', $allowedDomains))
    . "\n");

// One entry per successfully processed dump; serialized into the report.
$databases = [];
|
|
|
|
foreach ($sqlFiles as $sqlPath) {
    $dbName = basename($sqlPath, '.sql');
    fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");

    $sizeBytes = filesize($sqlPath) ?: 0;

    // STREAMING POSTURE: load + scan the SQL file in 4MB chunks instead
    // of one file_get_contents. cPanel customer dumps in the wild routinely
    // hit 5-10GB (large WooCommerce + media metadata); a 128MB-default PHP
    // memory_limit + the 2GB cgroup on the container both fail well below
    // those sizes. Streaming bounds the resident memory to a few MB
    // regardless of dump size.
    //
    // Per-chunk passes:
    //   - engine_swap_chunk:     ENGINE=MyISAM -> ENGINE=InnoDB rewrite, counted
    //                            into $engineSwapCount.
    //   - is_wp_chunk_scan:      OR-folds the three CREATE TABLE detections
    //                            + the option_name sentinel into $wpFlags.
    //   - wp_options_chunk_scan: collects siteurl/home/template/stylesheet
    //                            values into $optionValues (first one wins).
    //
    // The last $carryWindow bytes of every non-final chunk are held back and
    // prepended to the next read so a pattern split across a read boundary is
    // still visible to the regexes.
    $fin = @fopen($sqlPath, 'rb');
    if (!$fin) {
        fwrite(STDERR, "scan-dbs: WARN failed to open $sqlPath; skipping\n");
        continue;
    }

    // We write to a tmp file first; renamed to the final {.sql, .sql.flagged}
    // name once we know whether the wp content scan flagged the dump.
    $tmpOutPath = $outDir . '/' . $dbName . '.sql.tmp';
    $fout = @fopen($tmpOutPath, 'wb');
    if (!$fout) {
        fwrite(STDERR, "scan-dbs: WARN failed to open output $tmpOutPath; skipping\n");
        fclose($fin);
        continue;
    }

    $engineSwapCount = 0;
    $wpFlags = [
        'has_options'         => false,
        'has_posts'           => false,
        'has_users'           => false,
        'has_option_sentinel' => false,
    ];
    $optionValues = [];               // siteurl / home etc. accumulated across chunks
    $carry        = '';
    $chunkSize    = 4 * 1024 * 1024;  // 4 MB
    $carryWindow  = 128;              // bytes carried over for boundary matches
    $streamError  = null;             // first I/O failure, if any

    while (!feof($fin)) {
        $buf = fread($fin, $chunkSize);
        if ($buf === false) {
            // A read error must not be mistaken for a clean EOF, otherwise a
            // truncated dump would be reported as sanitized.
            $streamError = "read error on $sqlPath";
            break;
        }
        if ($buf === '') {
            break;
        }

        // Prepend any carry from the previous iteration so a pattern split
        // across the boundary is still visible to the regex.
        $chunk = $carry . $buf;

        // Per-chunk passes.
        $chunk = engine_swap_chunk($chunk, $engineSwapCount);
        is_wp_chunk_scan($chunk, $wpFlags);
        wp_options_chunk_scan($chunk, $optionValues);

        // Hold back the tail as carry unless the stream already reports EOF.
        if (!feof($fin) && strlen($chunk) > $carryWindow) {
            $writeLen = strlen($chunk) - $carryWindow;
            $toWrite  = substr($chunk, 0, $writeLen);
            $carry    = substr($chunk, $writeLen);
        } else {
            $toWrite = $chunk;
            $carry   = '';
        }

        if (fwrite($fout, $toWrite) !== strlen($toWrite)) {
            $streamError = "short write to $tmpOutPath";
            break;
        }
    }

    // feof() only turns true once a read has actually hit end-of-file. When
    // the last data-bearing fread() ends exactly at EOF (e.g. file size is a
    // multiple of the chunk size) the loop above exits through the empty-read
    // break with the carry still pending — flush it here or the output loses
    // its final bytes. The carry has already been through every per-chunk pass.
    if ($streamError === null && $carry !== '') {
        if (fwrite($fout, $carry) !== strlen($carry)) {
            $streamError = "short write to $tmpOutPath";
        }
        $carry = '';
    }

    fclose($fin);
    if (!fclose($fout) && $streamError === null) {
        $streamError = "failed to close $tmpOutPath";
    }

    if ($streamError !== null) {
        fwrite(STDERR, "scan-dbs: WARN $streamError; skipping $dbName\n");
        @unlink($tmpOutPath);
        continue;
    }

    $engineCounts = ['myisam_to_innodb' => $engineSwapCount];

    // --- WordPress identification + content scan finalize -----------------

    // All four fingerprints must have been seen somewhere in the file.
    $isWp = $wpFlags['has_options'] && $wpFlags['has_posts']
        && $wpFlags['has_users'] && $wpFlags['has_option_sentinel'];
    $flags = [];
    if ($isWp) {
        $flags = wp_content_scan_from_values($optionValues, $allowedDomains);
    }

    // Any high-severity flag means the dump is refused for import.
    $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
    $refused = (bool) count($highConfidence);

    $outName   = $dbName . '.sql' . ($refused ? '.flagged' : '');
    $outPath   = $outDir . '/' . $outName;
    $finalPath = $finalPrefix . '/' . $outName;
    if (!@rename($tmpOutPath, $outPath)) {
        fwrite(STDERR, "scan-dbs: WARN failed to rename $tmpOutPath -> $outPath; skipping\n");
        @unlink($tmpOutPath);
        continue;
    }

    $databases[] = [
        'dbname'     => $dbName,
        'size_bytes' => $sizeBytes,
        'engine_changes' => [
            'myisam_to_innodb'           => $engineCounts['myisam_to_innodb'],
            'row_format_dynamic_applied' => 0, // v1.1
            'fulltext_indexes_dropped'   => 0, // v1.1
        ],
        'wp_content_scan' => [
            'is_wordpress' => $isWp,
            'flags'        => $flags,
        ],
        'imported_into_new_server' => !$refused,
        'sanitized_sql_path'       => $refused ? null : $finalPath,
        'flagged_sql_path'         => $refused ? $finalPath : null,
    ];
}
|
|
|
|
// -- write the JSON report ---------------------------------------------------

$report = [
    'scanner_version' => SCANNER_VERSION,
    'import_id'       => $importId,
    'databases'       => $databases,
];

// Flag details embed option values lifted straight from the customer's dump,
// which may not be valid UTF-8. Without JSON_INVALID_UTF8_SUBSTITUTE
// json_encode() returns false and the report would be written as a bare
// newline while the script still exits 0.
$reportJson = json_encode(
    $report,
    JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
);
if ($reportJson === false) {
    fwrite(STDERR, "scan-dbs: FATAL could not encode report: " . json_last_error_msg() . "\n");
    exit(1);
}
if (file_put_contents($reportPath, $reportJson . "\n") === false) {
    fwrite(STDERR, "scan-dbs: FATAL could not write report to $reportPath\n");
    exit(1);
}

fwrite(STDERR, "scan-dbs: done — " . count($databases) . " database(s) processed\n");
exit(0);
|
|
|
|
// ---- helpers --------------------------------------------------------------
|
|
|
|
/**
 * Whole-string variant of the storage-engine rewrite: every
 * `ENGINE = MyISAM` token (any case, optional whitespace around `=`)
 * becomes the literal `ENGINE=InnoDB`.
 *
 * The match is bounded by `\b` on both sides, so identifiers that merely
 * contain the words (e.g. `MYENGINE=MyISAM`, `ENGINE=MyISAMx`) are left
 * alone. The regex has no notion of SQL quoting: an occurrence inside
 * string data is rewritten as well.
 *
 * If PCRE reports an error the input is returned unmodified.
 *
 * @return array{0: string, 1: array{myisam_to_innodb: int}}
 *         The rewritten SQL and the number of replacements made.
 */
function engine_swap(string $sql): array {
    $replaced = 0;
    $bump = function () use (&$replaced) {
        $replaced++;
        return 'ENGINE=InnoDB';
    };

    $out = preg_replace_callback('/\bENGINE\s*=\s*MyISAM\b/i', $bump, $sql);
    if ($out === null) {
        $out = $sql;
    }

    return [$out, ['myisam_to_innodb' => $replaced]];
}
|
|
|
|
/**
 * Chunk-wise storage-engine rewrite for the streaming path.
 *
 * Applies the same case-insensitive `\bENGINE\s*=\s*MyISAM\b` pattern as
 * engine_swap() to one chunk and returns the rewritten chunk. Each
 * replacement increments $counter (taken by reference) so the caller can
 * keep a running total across all chunks of a dump.
 *
 * If PCRE reports an error the chunk is returned unmodified.
 */
function engine_swap_chunk(string $chunk, int &$counter): string {
    $onMatch = function () use (&$counter) {
        $counter++;
        return 'ENGINE=InnoDB';
    };

    $result = preg_replace_callback('/\bENGINE\s*=\s*MyISAM\b/i', $onMatch, $chunk);

    return $result === null ? $chunk : $result;
}
|
|
|
|
/**
 * Fold one chunk into the streaming WordPress fingerprint state.
 *
 * $flags must carry the four boolean keys `has_options`, `has_posts`,
 * `has_users` and `has_option_sentinel`. A flag that is already true is
 * left untouched (its regex is not even evaluated); a flag that is still
 * false becomes true when this chunk matches its pattern. Flags are never
 * reset, so the caller can AND them together once the whole file has
 * been streamed.
 */
function is_wp_chunk_scan(string $chunk, array &$flags): void {
    // flag key => pattern that sets it
    $fingerprints = [
        'has_options'         => '/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i',
        'has_posts'           => '/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i',
        'has_users'           => '/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i',
        'has_option_sentinel' => "/'siteurl'|'home'|'template'|'stylesheet'/",
    ];

    foreach ($fingerprints as $flagName => $pattern) {
        if ($flags[$flagName]) {
            continue; // sticky: already seen in an earlier chunk
        }
        if (preg_match($pattern, $chunk)) {
            $flags[$flagName] = true;
        }
    }
}
|
|
|
|
/**
 * Streaming siteurl/home extractor.
 *
 * Scans one chunk for value tuples of the shape
 *     (<first column>, '<name>', '<value>' ...
 * and records <value> for the option names siteurl, home, template and
 * stylesheet. The FIRST value seen for a name wins; later occurrences
 * (in this or any following chunk) are ignored.
 *
 * Best-effort:
 *   - a tuple split across a chunk boundary beyond the caller's carry
 *     window is not seen;
 *   - the pattern is not tied to the options table — any tuple in the
 *     chunk whose second and third columns are quoted strings is a
 *     candidate, filtered only by the option-name list;
 *   - values are unescaped with stripslashes(), which drops the backslash
 *     of every escape pair (so `\n` becomes `n`).
 *
 * The quoted-string sub-pattern uses mutually exclusive alternatives
 * (`[^'\\]` vs. backslash + any byte) with possessive quantifiers. That
 * keeps matching linear on multi-megabyte chunks and stops the regex from
 * backtracking into treating an escaped quote as the end of the string.
 * A PCRE failure is reported on STDERR rather than silently treated as
 * "no options in this chunk".
 *
 * @param string               $chunk  Piece of the SQL dump to inspect.
 * @param array<string,string> $values Accumulator, updated in place.
 */
function wp_options_chunk_scan(string $chunk, array &$values): void {
    // `s` so that backslash + raw newline is still consumed as an escape pair.
    $tuplePattern = "/\(\s*[^,]++,\s*'((?:[^'\\\\]++|\\\\.)*+)'\s*,\s*'((?:[^'\\\\]++|\\\\.)*+)'/s";

    $found = preg_match_all($tuplePattern, $chunk, $matches, PREG_SET_ORDER);
    if ($found === false) {
        fwrite(
            STDERR,
            "scan-dbs: WARN option tuple regex failed (" . preg_last_error_msg()
            . "); option values in this chunk were not inspected\n"
        );
        return;
    }
    if ($found === 0) {
        return;
    }

    foreach ($matches as $m) {
        $name = stripslashes($m[1]);
        if (!in_array($name, ['siteurl', 'home', 'template', 'stylesheet'], true)) {
            continue;
        }
        if (isset($values[$name])) {
            continue; // first occurrence wins
        }
        $values[$name] = stripslashes($m[2]);
    }
}
|
|
|
|
/**
 * Finalization step of the WP content scan, working from an already
 * extracted option_name => option_value map instead of raw SQL.
 *
 * For `siteurl` and `home` (in that order) the URL host is checked
 * against $allowedDomains via domain_in_allowlist(). An option is skipped
 * when it is absent, has no parseable host, or its host is `localhost`
 * or an IP literal. Every remaining host outside the allow list yields
 * one high-severity `siteurl_external_domain` flag.
 *
 * @param array $optionValues   option_name => option_value
 * @param array $allowedDomains domains the account legitimately owns
 * @return array list of flag dicts (severity, code, details); empty = clean
 */
function wp_content_scan_from_values(array $optionValues, array $allowedDomains): array {
    $findings = [];

    foreach (['siteurl', 'home'] as $optName) {
        $url = $optionValues[$optName] ?? null;
        if ($url === null) {
            continue;
        }

        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            continue;
        }

        // localhost / IP literals are not treated as external domains.
        $isLocalOrIp = $host === 'localhost' || filter_var($host, FILTER_VALIDATE_IP);
        if ($isLocalOrIp || domain_in_allowlist($host, $allowedDomains)) {
            continue;
        }

        $findings[] = [
            'severity' => 'high',
            'code'     => 'siteurl_external_domain',
            'details'  => sprintf(
                "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
                $optName,
                json_encode($url),
                $host,
                empty($allowedDomains) ? 'NONE; could not discover from cpmove userdata' : implode(', ', $allowedDomains)
            ),
        ];
    }

    return $findings;
}
|
|
|
|
/**
 * Whole-string WordPress fingerprint.
 *
 * A dump counts as WordPress when it contains CREATE TABLE statements for
 * tables whose names end in `options`, `posts` and `users` (any prefix,
 * optionally quoted) AND mentions at least one of the standard
 * option names 'siteurl', 'home', 'template' or 'stylesheet'. The option-name
 * sentinel weeds out apps that merely share table names with WP.
 */
function is_wordpress_dump(string $sql): bool {
    $required = [
        '/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i',
        '/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i',
        '/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i',
        "/'siteurl'|'home'|'template'|'stylesheet'/",
    ];

    foreach ($required as $pattern) {
        if (!preg_match($pattern, $sql)) {
            return false; // every fingerprint is mandatory
        }
    }

    return true;
}
|
|
|
|
/**
 * Run the WP content scan on a complete SQL string. v1.0 ships ONE check:
 *
 *   siteurl_external_domain — wp_options.siteurl or .home points at a
 *     host not in the allow list (cpanel main + addons).
 *
 * This is the non-streaming entry point: it extracts the option map with
 * extract_wp_options() and hands it to wp_content_scan_from_values(),
 * which owns the actual check. Delegating keeps the two paths from
 * drifting apart; the rules applied there are: skip options that are
 * missing or have no parseable host, skip `localhost` and IP literals,
 * flag every other host that domain_in_allowlist() rejects.
 *
 * v1.1 add: post_content script-injection signature, theme/stylesheet
 * known-malware patterns, user_pass leaked-hash check, Wordfence regex.
 *
 * @param string $sql            full SQL dump
 * @param array  $allowedDomains domains the account legitimately owns
 * @return array list of flag dicts; an empty array means "clean"
 */
function wp_content_scan(string $sql, array $allowedDomains): array {
    // Pull every (option_name, option_value) row from any INSERT INTO
    // <prefix>options, then run the shared finalization over that map.
    return wp_content_scan_from_values(extract_wp_options($sql), $allowedDomains);
}
|
|
|
|
/**
 * Build an option_name => option_value map from every
 * `INSERT INTO <prefix>options [(cols)] VALUES (...), (...);` statement
 * found in $sql. When a name occurs more than once the LAST value wins.
 *
 * Each statement's VALUES body is split into rows on `),(` and every row
 * is tokenized with parse_sql_row(); cells 1 and 2 are taken as name and
 * value (wp_options column order: option_id, option_name, option_value,
 * autoload). Rows with fewer than three cells, a non-string name/value
 * or an empty name are ignored.
 *
 * Best-effort: unusual quoting (or a value that itself contains `),(`)
 * can defeat the splitting, in which case affected rows are simply not
 * reported.
 *
 * @return array<string,string>
 */
function extract_wp_options(string $sql): array {
    // Optional balanced column list, then VALUES, then the lazily matched
    // body up to a `;` that ends a line.
    $stmtPattern = '/INSERT\s+INTO\s+[`"]?\w*options[`"]?\s*(?:\([^)]*\)\s*)?VALUES\s*(.+?);\s*$/ims';

    if (!preg_match_all($stmtPattern, $sql, $stmts)) {
        return [];
    }

    $options = [];
    foreach ($stmts[1] as $valuesClause) {
        // Strip the first row's leading `(` and the last row's trailing `)`
        // so a plain split on `),(` yields bare rows.
        $inner = trim($valuesClause);
        $inner = preg_replace('/^\(/', '', $inner);
        $inner = preg_replace('/\)$/', '', $inner);

        foreach (preg_split('/\)\s*,\s*\(/', $inner) as $tuple) {
            $cells = parse_sql_row($tuple);
            if (count($cells) < 3) {
                continue;
            }

            $name  = $cells[1];
            $value = $cells[2];
            if (!is_string($name) || !is_string($value) || $name === '') {
                continue;
            }

            $options[$name] = $value;
        }
    }

    return $options;
}
|
|
|
|
/**
 * Tokenize the inside of one MySQL VALUES tuple (without the surrounding
 * parentheses) into a list of cells.
 *
 *   - Single-quoted strings are unescaped: `\n`, `\t`, `\r`, `\0` map to
 *     the control characters, any other backslash pair yields the second
 *     character, and a doubled `''` yields one quote.
 *   - Anything else is read up to the next comma and trimmed; the bareword
 *     NULL (any case) becomes PHP null, everything else stays a string.
 *   - Whitespace and commas between cells are skipped, so empty cells
 *     (`1,,2`) are dropped rather than reported.
 *
 * No support for SQL comments or double-quoted strings — intended for
 * the predictable mysqldump output found in cpmove archives.
 *
 * @return array<int, string|null>
 */
function parse_sql_row(string $row): array {
    $escapes = ['n' => "\n", 't' => "\t", 'r' => "\r", '0' => "\0"];
    $out = [];
    $len = strlen($row);
    $pos = 0;

    for (;;) {
        // Skip separators in front of the next cell.
        while ($pos < $len && ($row[$pos] === ',' || ctype_space($row[$pos]))) {
            $pos++;
        }
        if ($pos >= $len) {
            return $out;
        }

        if ($row[$pos] !== "'") {
            // Bareword / number / NULL — runs to the next comma or end of row.
            $comma = strpos($row, ',', $pos);
            $stop  = ($comma === false) ? $len : $comma;
            $token = trim(substr($row, $pos, $stop - $pos));
            $out[] = (strcasecmp($token, 'NULL') === 0) ? null : $token;
            $pos   = $stop;
            continue;
        }

        // Quoted string: step over the opening quote and decode until the
        // closing one (or end of input if it is unterminated).
        $pos++;
        $text = '';
        while ($pos < $len) {
            $ch   = $row[$pos];
            $peek = ($pos + 1 < $len) ? $row[$pos + 1] : null;

            if ($ch === '\\' && $peek !== null) {
                $text .= $escapes[$peek] ?? $peek;
                $pos += 2;
            } elseif ($ch === "'" && $peek === "'") {
                // MySQL `''` -> literal '
                $text .= "'";
                $pos += 2;
            } elseif ($ch === "'") {
                $pos++;
                break;
            } else {
                $text .= $ch;
                $pos++;
            }
        }
        $out[] = $text;
    }
}
|
|
|
|
/**
 * Discover the user's allowed-domain set from the extracted cpmove
 * archive. Sources read, for every cpmove-* directory under $extractDir:
 *
 *   userdata/<entry>  — each directory entry whose NAME looks like a
 *                       domain (`label(.label)*.tld`) is taken as one;
 *                       `main`, `.` and `..` are skipped
 *   userdata/main     — the value of its `main_domain:` line
 *   addons            — the domain-looking token at the start of each line
 *                       ("addon.tld=parent.tld" format)
 *
 * All names are lowercased and de-duplicated.
 *
 * Best-effort. If nothing is found the result is empty and the siteurl
 * check flags every host as external — surfaced to the admin.
 *
 * @param string $extractDir Directory containing the cpmove-* tree.
 * @param string $username   Accepted for interface compatibility; the
 *                           lookup globs cpmove-* and does not use it.
 * @return string[] Unique lowercase domain names.
 */
function collect_allowed_domains(string $extractDir, string $username): array {
    $domains = [];

    // glob() may return false instead of an empty array; guard so the
    // foreach never iterates a non-array.
    foreach (glob($extractDir . '/cpmove-*/userdata') ?: [] as $userdataDir) {
        if (!is_dir($userdataDir)) {
            continue;
        }
        foreach (scandir($userdataDir) ?: [] as $entry) {
            if ($entry === '.' || $entry === '..' || $entry === 'main') {
                continue;
            }
            // userdata/<domain> is a file or dir keyed by the domain.
            if (preg_match('/^[a-z0-9._-]+\.[a-z]{2,}$/i', $entry)) {
                $domains[] = strtolower($entry);
            }
        }
        // userdata/main is a YAML-ish file with main_domain: <d>
        $mainFile = $userdataDir . '/main';
        if (is_file($mainFile)) {
            $content = file_get_contents($mainFile);
            if ($content !== false && preg_match('/^main_domain:\s*(\S+)/m', $content, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    foreach (glob($extractDir . '/cpmove-*/addons') ?: [] as $addonsFile) {
        if (!is_file($addonsFile)) {
            continue;
        }
        $content = file_get_contents($addonsFile);
        if ($content === false) {
            continue;
        }
        // cPanel writes "addon.tld=parent.tld" lines.
        foreach (preg_split('/\R/', $content) ?: [] as $line) {
            if (preg_match('/^([a-z0-9._-]+\.[a-z]{2,})/i', $line, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    return array_values(array_unique($domains));
}
|
|
|
|
/**
 * Case-insensitive allow-list test for a host name.
 *
 * A host passes when it equals an allowed domain or is a subdomain of
 * one (i.e. ends with "." + domain):
 *
 *   allowed=['example.com'], host='www.example.com'  -> true
 *   allowed=['example.com'], host='notexample.com'   -> false
 *   allowed=['example.com'], host='malicious.tld'    -> false
 *   allowed=[],              host=anything           -> false (refuse-all)
 */
function domain_in_allowlist(string $host, array $allowed): bool {
    $needle = strtolower($host);

    foreach ($allowed as $entry) {
        $domain = strtolower($entry);
        if ($needle === $domain || str_ends_with($needle, '.' . $domain)) {
            return true;
        }
    }

    // Also reached for an empty allow list.
    return false;
}
|