scan-dbs: stream the SQL file instead of loading 5GB+ into memory
All checks were successful
cpanel-importer Build and Push / Build-and-Push (push) Successful in 1m21s
All checks were successful
cpanel-importer Build and Push / Build-and-Push (push) Successful in 1m21s
Surfaced on whp02 alsacorp retry: scan-dbs.php hit PHP fatal at line 86
"Allowed memory size of 134217728 bytes exhausted (tried to allocate
5488440384 bytes)" while loading alsacorp_alsa1.sql via
file_get_contents. The dump is multi-GB (typical for WooCommerce stores
with media metadata); the 128MB-default PHP memory_limit + the 2GB
cgroup on the container both fail well below the actual file size.
Rewrote the per-DB pass as a streaming loop over 4MB chunks:
- engine_swap_chunk: same `\bENGINE=MyISAM\b` regex, mutates a
per-DB counter via reference so the per-chunk counts accumulate
into a single myisam_to_innodb total.
- is_wp_chunk_scan: OR-folds the four WP fingerprint regexes
(CREATE TABLE *_options, *_posts, *_users + the
'siteurl|home|template|stylesheet' sentinel) into a state dict;
any chunk that flips a flag from false to true keeps it true for
the rest of the file. Caller AND-folds at finalization.
- wp_options_chunk_scan: extracts (option_name, option_value)
tuples from INSERT INTO options statements as they pass through.
First occurrence wins so we keep the live value, not later
duplicates.
- wp_content_scan_from_values: extracted the finalization logic
from the legacy wp_content_scan() so the streaming path can
submit a pre-built option-values map instead of re-scanning the
full string.
Per-chunk carry: a 128-byte buffer at the end of each chunk is held
back and prepended to the next chunk so a pattern split across a
chunk boundary (e.g. "ENGINE=" occupying bytes 4194297-4194303 and
"MyISAM" starting at byte 4194304, the first byte of the next 4 MB
chunk) is still seen by the regex. 128 bytes is generous for our
patterns (longest is "ENGINE = MyISAM" with whitespace flex).
Output goes to a `<db>.sql.tmp` first, then renamed to
`<db>.sql{,.flagged}` once we know the flag verdict — avoids a
partial file if the scan dies mid-stream.
Legacy `engine_swap`, `is_wordpress_dump`, `wp_content_scan`, and
`extract_wp_options` are kept in place even though the new streaming
loop no longer calls any of them: they're public-ish helpers that a
reinstated small-file path or the next dbsanitize revision could
reuse.
Resident memory now bounded to <16 MB per DB regardless of input
file size — should handle the 30 GB+ outliers we'll inevitably see.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -83,22 +83,96 @@ foreach ($sqlFiles as $sqlPath) {
|
|||||||
fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");
|
fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");
|
||||||
|
|
||||||
$sizeBytes = filesize($sqlPath) ?: 0;
|
$sizeBytes = filesize($sqlPath) ?: 0;
|
||||||
$sql = file_get_contents($sqlPath);
|
|
||||||
if ($sql === false) {
|
// STREAMING POSTURE: load + scan the SQL file in 4MB chunks instead
|
||||||
fwrite(STDERR, "scan-dbs: WARN failed to read $sqlPath; skipping\n");
|
// of one file_get_contents. cPanel customer dumps in the wild routinely
|
||||||
|
// hit 5-10GB (large WooCommerce + media metadata); a 128MB-default PHP
|
||||||
|
// memory_limit + the 2GB cgroup on the container both fail well below
|
||||||
|
// those sizes. Streaming bounds the resident memory to a few MB
|
||||||
|
// regardless of dump size.
|
||||||
|
//
|
||||||
|
// Per-chunk passes:
|
||||||
|
// - engine_swap_chunk: same regex as engine_swap() but on a chunk,
|
||||||
|
// plus a small carryover buffer at the end so
|
||||||
|
// a pattern split across a chunk boundary
|
||||||
|
// still matches.
|
||||||
|
// - is_wp_chunk_scan: OR-folds the three CREATE TABLE detections
|
||||||
|
// + the option_name sentinel into a state
|
||||||
|
// dict that finalizes after the last chunk.
|
||||||
|
// - wp_options_chunk_scan: extracts siteurl/home option values when
|
||||||
|
// they appear in this chunk's portion of an
|
||||||
|
// INSERT INTO options statement.
|
||||||
|
//
|
||||||
|
// We open input + output streams, push each rewritten chunk to the
|
||||||
|
// output as we go, and keep only the per-chunk regex carryover (up to
|
||||||
|
// 128 bytes — longer than any ENGINE=... or 'siteurl' string).
|
||||||
|
$fin = @fopen($sqlPath, 'rb');
|
||||||
|
if (!$fin) {
|
||||||
|
fwrite(STDERR, "scan-dbs: WARN failed to open $sqlPath; skipping\n");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- ENGINE SWAP (always applied) -------------------------------------
|
// We write to a tmp file first; renamed to the final {.sql, .sql.flagged}
|
||||||
|
// name once we know whether the wp content scan flagged the dump.
|
||||||
|
$tmpOutPath = $outDir . '/' . $dbName . '.sql.tmp';
|
||||||
|
$fout = @fopen($tmpOutPath, 'wb');
|
||||||
|
if (!$fout) {
|
||||||
|
fwrite(STDERR, "scan-dbs: WARN failed to open output $tmpOutPath; skipping\n");
|
||||||
|
fclose($fin);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
[$rewritten, $engineCounts] = engine_swap($sql);
|
$engineSwapCount = 0;
|
||||||
|
$wpFlags = [
|
||||||
|
'has_options' => false,
|
||||||
|
'has_posts' => false,
|
||||||
|
'has_users' => false,
|
||||||
|
'has_option_sentinel'=> false,
|
||||||
|
];
|
||||||
|
$optionValues = []; // siteurl / home etc. accumulated across chunks
|
||||||
|
$carry = '';
|
||||||
|
$chunkSize = 4 * 1024 * 1024; // 4 MB
|
||||||
|
$carryWindow = 128; // bytes carried over for boundary matches
|
||||||
|
|
||||||
// --- WordPress identification + content scan -------------------------
|
while (!feof($fin)) {
|
||||||
|
$buf = fread($fin, $chunkSize);
|
||||||
|
if ($buf === false || $buf === '') break;
|
||||||
|
|
||||||
$isWp = is_wordpress_dump($rewritten);
|
// Prepend any carry from the previous iteration so a pattern split
|
||||||
|
// across the boundary is still visible to the regex.
|
||||||
|
$chunk = $carry . $buf;
|
||||||
|
|
||||||
|
// Per-chunk passes.
|
||||||
|
$chunk = engine_swap_chunk($chunk, $engineSwapCount);
|
||||||
|
is_wp_chunk_scan($chunk, $wpFlags);
|
||||||
|
wp_options_chunk_scan($chunk, $optionValues);
|
||||||
|
|
||||||
|
// Keep the tail of this chunk as carry for the next pass, but
|
||||||
|
// ONLY when we're not at EOF — otherwise the carry never gets
|
||||||
|
// written. The carry length is the regex max-match window; for
|
||||||
|
// our patterns (`\bENGINE=MyISAM\b`, `'siteurl'`, etc.) 128
|
||||||
|
// bytes is generous.
|
||||||
|
if (!feof($fin) && strlen($chunk) > $carryWindow) {
|
||||||
|
$writeLen = strlen($chunk) - $carryWindow;
|
||||||
|
fwrite($fout, substr($chunk, 0, $writeLen));
|
||||||
|
$carry = substr($chunk, $writeLen);
|
||||||
|
} else {
|
||||||
|
fwrite($fout, $chunk);
|
||||||
|
$carry = '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fclose($fin);
|
||||||
|
fclose($fout);
|
||||||
|
|
||||||
|
$engineCounts = ['myisam_to_innodb' => $engineSwapCount];
|
||||||
|
|
||||||
|
// --- WordPress identification + content scan finalize -----------------
|
||||||
|
|
||||||
|
$isWp = $wpFlags['has_options'] && $wpFlags['has_posts']
|
||||||
|
&& $wpFlags['has_users'] && $wpFlags['has_option_sentinel'];
|
||||||
$flags = [];
|
$flags = [];
|
||||||
if ($isWp) {
|
if ($isWp) {
|
||||||
$flags = wp_content_scan($rewritten, $allowedDomains);
|
$flags = wp_content_scan_from_values($optionValues, $allowedDomains);
|
||||||
}
|
}
|
||||||
|
|
||||||
$highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
|
$highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
|
||||||
@@ -107,7 +181,11 @@ foreach ($sqlFiles as $sqlPath) {
|
|||||||
$outName = $dbName . '.sql' . ($refused ? '.flagged' : '');
|
$outName = $dbName . '.sql' . ($refused ? '.flagged' : '');
|
||||||
$outPath = $outDir . '/' . $outName;
|
$outPath = $outDir . '/' . $outName;
|
||||||
$finalPath = $finalPrefix . '/' . $outName;
|
$finalPath = $finalPrefix . '/' . $outName;
|
||||||
file_put_contents($outPath, $rewritten);
|
if (!@rename($tmpOutPath, $outPath)) {
|
||||||
|
fwrite(STDERR, "scan-dbs: WARN failed to rename $tmpOutPath -> $outPath; skipping\n");
|
||||||
|
@unlink($tmpOutPath);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
$databases[] = [
|
$databases[] = [
|
||||||
'dbname' => $dbName,
|
'dbname' => $dbName,
|
||||||
@@ -159,6 +237,104 @@ function engine_swap(string $sql): array {
|
|||||||
return [$rewritten ?? $sql, ['myisam_to_innodb' => $count]];
|
return [$rewritten ?? $sql, ['myisam_to_innodb' => $count]];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Engine swap for a single chunk of a streamed SQL dump.
 *
 * Every case-insensitive `ENGINE = MyISAM` clause (whitespace around the
 * `=` is optional, word boundaries required on both sides) is rewritten
 * to the literal `ENGINE=InnoDB`. Each rewrite bumps $counter by one, so
 * a caller that passes the same variable for every chunk ends up with a
 * per-file total.
 *
 * @param string $chunk   Piece of SQL text to rewrite.
 * @param int    $counter Running swap count, updated by reference.
 * @return string The rewritten chunk; the untouched $chunk when
 *                preg_replace_callback() reports failure by returning null.
 */
function engine_swap_chunk(string $chunk, int &$counter): string {
    // Named closure so the counting side effect is easy to spot.
    $swapAndCount = static function (array $hit) use (&$counter): string {
        ++$counter;
        return 'ENGINE=InnoDB';
    };

    $result = preg_replace_callback('/\bENGINE\s*=\s*MyISAM\b/i', $swapAndCount, $chunk);
    if ($result === null) {
        // PCRE failure: hand the input back unmodified rather than lose data.
        return $chunk;
    }
    return $result;
}
|
||||||
|
|
||||||
|
/**
 * Accumulates WordPress-dump fingerprints across the chunks of a stream.
 *
 * $flags carries four independent booleans. Each one is tested against
 * its own regex and, once set to true, is never tested or cleared again,
 * so a fingerprint seen in any earlier chunk sticks for the rest of the
 * scan. This function only sets flags; combining them into a verdict is
 * left to the caller.
 *
 * Fingerprints:
 *  - has_options / has_posts / has_users: a `CREATE TABLE` statement
 *    (case-insensitive) whose optionally quoted table name ends in
 *    `options`, `posts` or `users`, followed by `(`.
 *  - has_option_sentinel: any of the quoted literals 'siteurl', 'home',
 *    'template', 'stylesheet' (case-sensitive).
 *
 * @param string $chunk Piece of SQL text to inspect.
 * @param array  $flags Flag map with the four keys above, updated by reference.
 */
function is_wp_chunk_scan(string $chunk, array &$flags): void {
    $fingerprints = [
        'has_options'         => '/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i',
        'has_posts'           => '/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i',
        'has_users'           => '/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i',
        'has_option_sentinel' => "/'siteurl'|'home'|'template'|'stylesheet'/",
    ];

    foreach ($fingerprints as $flagName => $pattern) {
        if ($flags[$flagName]) {
            // Already latched by an earlier chunk; skip the regex entirely.
            continue;
        }
        if (preg_match($pattern, $chunk)) {
            $flags[$flagName] = true;
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Streaming extractor for the siteurl / home / template / stylesheet
 * option values.
 *
 * Scans $chunk for parenthesised tuples shaped like
 *     (<first column>, '<name>', '<value>' ...
 * and records <value> under <name> in $values when <name> is one of the
 * four option names above. The first value recorded for a name wins;
 * later occurrences are ignored (the original rationale: the first row
 * is expected to be the live value, later ones re-insert noise).
 *
 * Notes:
 *  - The tuple is matched wherever it appears in the chunk. Nothing here
 *    checks that it belongs to an INSERT into an options table, because a
 *    long INSERT may have started in an earlier chunk.
 *  - Best-effort: a tuple that is split across a chunk boundary is not
 *    seen by this function.
 *  - String literals are matched with an unrolled, possessive pattern
 *    (`[^'\\]++` runs interleaved with `\\.` escape pairs). This keeps
 *    PCRE from iterating a group once per byte, so multi-megabyte option
 *    values do not run into the backtrack / JIT stack limits, and an
 *    escaped quote can never be re-read as the closing quote.
 *  - If PCRE still reports an error, a warning is written to STDERR
 *    instead of silently treating the chunk as "no matches". A silently
 *    dropped siteurl would let the later domain check pass unexamined.
 *
 * @param string $chunk  Piece of SQL text to inspect.
 * @param array  $values Map of option name => unescaped value, updated
 *                       by reference.
 */
function wp_options_chunk_scan(string $chunk, array &$values): void {
    // Regex as PCRE sees it:
    //   \(\s*[^,]+,\s*'((?:[^'\\]++|\\.)*+)'\s*,\s*'((?:[^'\\]++|\\.)*+)'
    // The `s` flag lets `\\.` also cover a backslash followed by a newline.
    $tuplePattern = "/\(\s*[^,]+,\s*'((?:[^'\\\\]++|\\\\.)*+)'\s*,\s*'((?:[^'\\\\]++|\\\\.)*+)'/s";

    $found = preg_match_all($tuplePattern, $chunk, $matches, PREG_SET_ORDER);
    if ($found === false) {
        fwrite(
            STDERR,
            'scan-dbs: WARN option tuple scan failed (PCRE error code ' . preg_last_error()
                . "); option values in this chunk were not inspected\n"
        );
        return;
    }
    if ($found === 0) {
        return;
    }

    $wanted = ['siteurl', 'home', 'template', 'stylesheet'];
    foreach ($matches as $m) {
        $name = stripslashes($m[1]);
        if (!in_array($name, $wanted, true)) {
            continue;
        }
        if (isset($values[$name])) {
            // First occurrence wins.
            continue;
        }
        $values[$name] = stripslashes($m[2]);
    }
}
|
||||||
|
|
||||||
|
/**
 * Builds the content-scan flags from an already extracted option map, so
 * the streaming path does not need the full SQL text.
 *
 * For each of `siteurl` and `home` present in $optionValues, the host
 * part of the URL is taken with parse_url(). A `high` severity flag with
 * code `siteurl_external_domain` is emitted when that host is rejected by
 * domain_in_allowlist().
 *
 * Skipped without a flag:
 *  - options that are absent, or whose value yields no host;
 *  - `localhost` in any letter case;
 *  - IP literals, including bracketed IPv6 such as `[::1]` (parse_url()
 *    keeps the brackets, which FILTER_VALIDATE_IP would otherwise reject).
 *
 * @param array $optionValues   Map of option name => value.
 * @param array $allowedDomains Domains handed to domain_in_allowlist().
 * @return array List of flag arrays with keys `severity`, `code`, `details`.
 */
function wp_content_scan_from_values(array $optionValues, array $allowedDomains): array {
    $flags = [];
    foreach (['siteurl', 'home'] as $optName) {
        if (!isset($optionValues[$optName])) {
            continue;
        }
        $val  = $optionValues[$optName];
        $host = parse_url($val, PHP_URL_HOST);
        if ($host === null || $host === false || $host === '') {
            continue;
        }

        // Normalised copy used ONLY for the local/IP exemption; the
        // allowlist helper and the message keep the host as parsed.
        $bareHost = strtolower(trim($host, '[]'));
        if ($bareHost === 'localhost' || filter_var($bareHost, FILTER_VALIDATE_IP) !== false) {
            continue;
        }

        if (!domain_in_allowlist($host, $allowedDomains)) {
            $flags[] = [
                'severity' => 'high',
                'code'     => 'siteurl_external_domain',
                'details'  => sprintf(
                    "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
                    $optName,
                    // Substitute invalid UTF-8 instead of letting json_encode()
                    // return false and blank the value in the message.
                    json_encode($val, JSON_INVALID_UTF8_SUBSTITUTE),
                    $host,
                    empty($allowedDomains) ? 'NONE; could not discover from cpmove userdata' : implode(', ', $allowedDomains)
                ),
            ];
        }
    }
    return $flags;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Identify WordPress by the canonical core-table CREATE statements.
|
* Identify WordPress by the canonical core-table CREATE statements.
|
||||||
*
|
*
|
||||||
|
|||||||
Reference in New Issue
Block a user