From 08b995a29cf892d4ee8c45f08a6e37e45935e148 Mon Sep 17 00:00:00 2001 From: "Claude (bootstrap)" Date: Sun, 31 May 2026 20:31:34 -0700 Subject: [PATCH] scan-dbs: stream the SQL file instead of loading 5GB+ into memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaced on whp02 alsacorp retry: scan-dbs.php hit PHP fatal at line 86 "Allowed memory size of 134217728 bytes exhausted (tried to allocate 5488440384 bytes)" while loading alsacorp_alsa1.sql via file_get_contents. The dump is multi-GB (typical for WooCommerce stores with media metadata); the 128MB-default PHP memory_limit + the 2GB cgroup on the container both fail well below the actual file size. Rewrote the per-DB pass as a streaming loop over 4MB chunks: - engine_swap_chunk: same `\bENGINE=MyISAM\b` regex, mutates a per-DB counter via reference so the per-chunk counts accumulate into a single myisam_to_innodb total. - is_wp_chunk_scan: OR-folds the four WP fingerprint regexes (CREATE TABLE *_options, *_posts, *_users + the 'siteurl|home|template|stylesheet' sentinel) into a state dict; any chunk that flips a flag from false to true keeps it true for the rest of the file. Caller AND-folds at finalization. - wp_options_chunk_scan: extracts (option_name, option_value) tuples from INSERT INTO options statements as they pass through. First occurrence wins so we keep the live value, not later duplicates. - wp_content_scan_from_values: extracted the finalization logic from the legacy wp_content_scan() so the streaming path can submit a pre-built option-values map instead of re-scanning the full string. Per-chunk carry: a 128-byte buffer at the end of each chunk is held back and prepended to the next chunk so a pattern split across a chunk boundary (e.g. "ENGINE=" at byte 4194302, "MyISAM" at byte 4194304) is still seen by the regex. 128 bytes is generous for our patterns (longest is "ENGINE = MyISAM" with whitespace flex). 
Output goes to a `.sql.tmp` first, then renamed to `.sql{,.flagged}`
once we know the flag verdict — avoids a partial file if the scan dies
mid-stream.

Legacy `engine_swap`, `is_wordpress_dump`, and the unused
`wp_content_scan`+`extract_wp_options` are kept in place for the
small-file path (none of them currently called from the new streaming
loop, but they're public-ish helpers the next dbsanitize revision could
reuse).

Resident memory now bounded to <16 MB per DB regardless of input file
size — should handle the 30 GB+ outliers we'll inevitably see.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 scripts/scan-dbs.php | 238 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 229 insertions(+), 9 deletions(-)

diff --git a/scripts/scan-dbs.php b/scripts/scan-dbs.php
index 2190694..1c98736 100755
--- a/scripts/scan-dbs.php
+++ b/scripts/scan-dbs.php
@@ -83,22 +83,120 @@ foreach ($sqlFiles as $sqlPath) {
     fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");
 
     $sizeBytes = filesize($sqlPath) ?: 0;
-    $sql = file_get_contents($sqlPath);
-    if ($sql === false) {
-        fwrite(STDERR, "scan-dbs: WARN failed to read $sqlPath; skipping\n");
+
+    // STREAMING POSTURE: load + scan the SQL file in 4MB chunks instead
+    // of one file_get_contents. cPanel customer dumps in the wild routinely
+    // hit 5-10GB (large WooCommerce + media metadata); a 128MB-default PHP
+    // memory_limit + the 2GB cgroup on the container both fail well below
+    // those sizes. Streaming bounds the resident memory to a few MB
+    // regardless of dump size.
+    //
+    // Per-chunk passes:
+    //   - engine_swap_chunk: same regex as engine_swap() but on a chunk,
+    //                        plus a small carryover buffer at the end so
+    //                        a pattern split across a chunk boundary
+    //                        still matches.
+    //   - is_wp_chunk_scan:  OR-folds the three CREATE TABLE detections
+    //                        + the option_name sentinel into a state
+    //                        dict that finalizes after the last chunk.
+    //   - wp_options_chunk_scan: extracts siteurl/home option values when
+    //                        they appear in this chunk's portion of an
+    //                        INSERT INTO options statement.
+    //
+    // We open input + output streams, push each rewritten chunk to the
+    // output as we go, and keep only the per-chunk regex carryover (up to
+    // 128 bytes — longer than any ENGINE=... or 'siteurl' string).
+    $fin = @fopen($sqlPath, 'rb');
+    if (!$fin) {
+        fwrite(STDERR, "scan-dbs: WARN failed to open $sqlPath; skipping\n");
         continue;
     }
 
-    // --- ENGINE SWAP (always applied) -------------------------------------
+    // We write to a tmp file first; renamed to the final {.sql, .sql.flagged}
+    // name once we know whether the wp content scan flagged the dump.
+    $tmpOutPath = $outDir . '/' . $dbName . '.sql.tmp';
+    $fout = @fopen($tmpOutPath, 'wb');
+    if (!$fout) {
+        fwrite(STDERR, "scan-dbs: WARN failed to open output $tmpOutPath; skipping\n");
+        fclose($fin);
+        continue;
+    }
 
-    [$rewritten, $engineCounts] = engine_swap($sql);
+    $engineSwapCount = 0;
+    $wpFlags = [
+        'has_options'        => false,
+        'has_posts'          => false,
+        'has_users'          => false,
+        'has_option_sentinel'=> false,
+    ];
+    $optionValues = [];   // siteurl / home etc. accumulated across chunks
+    $carry = '';
+    $chunkSize = 4 * 1024 * 1024;   // 4 MB
+    $carryWindow = 128;             // bytes carried over for boundary matches
+    $streamError = null;            // set when a read/write fails mid-stream
 
-    // --- WordPress identification + content scan -------------------------
+    while (!feof($fin)) {
+        $buf = fread($fin, $chunkSize);
+        if ($buf === false) {
+            // A failed read is not EOF: bail out so a truncated dump is
+            // never promoted to the final output name.
+            $streamError = "read error on $sqlPath";
+            break;
+        }
+        if ($buf === '') break;
 
-    $isWp = is_wordpress_dump($rewritten);
+        // Prepend any carry from the previous iteration so a pattern split
+        // across the boundary is still visible to the regex.
+        $chunk = $carry . $buf;
+
+        // Per-chunk passes.
+        $chunk = engine_swap_chunk($chunk, $engineSwapCount);
+        is_wp_chunk_scan($chunk, $wpFlags);
+        wp_options_chunk_scan($chunk, $optionValues);
+
+        // Hold back the tail of this chunk as carry for the next pass
+        // unless EOF has already been detected. The carry length is the
+        // regex max-match window; for our patterns (`\bENGINE=MyISAM\b`,
+        // `'siteurl'`, etc.) 128 bytes is generous.
+        $out = $chunk;
+        $carry = '';
+        if (!feof($fin) && strlen($chunk) > $carryWindow) {
+            $writeLen = strlen($chunk) - $carryWindow;
+            $out = substr($chunk, 0, $writeLen);
+            $carry = substr($chunk, $writeLen);
+        }
+        if (fwrite($fout, $out) !== strlen($out)) {
+            $streamError = "short write to $tmpOutPath";
+            break;
+        }
+    }
+
+    // feof() only turns true after a read hits EOF, so when the file size
+    // is an exact multiple of $chunkSize the loop exits through the
+    // empty-read break with the last $carryWindow bytes still held back.
+    // Flush them, or the output loses its tail.
+    if ($streamError === null && $carry !== ''
+        && fwrite($fout, $carry) !== strlen($carry)) {
+        $streamError = "short write to $tmpOutPath";
+    }
+    fclose($fin);
+    fclose($fout);
+
+    if ($streamError !== null) {
+        fwrite(STDERR, "scan-dbs: WARN $streamError; skipping\n");
+        @unlink($tmpOutPath);
+        continue;
+    }
+
+    $engineCounts = ['myisam_to_innodb' => $engineSwapCount];
+
+    // --- WordPress identification + content scan finalize -----------------
+
+    $isWp = $wpFlags['has_options'] && $wpFlags['has_posts']
+          && $wpFlags['has_users'] && $wpFlags['has_option_sentinel'];
     $flags = [];
     if ($isWp) {
-        $flags = wp_content_scan($rewritten, $allowedDomains);
+        $flags = wp_content_scan_from_values($optionValues, $allowedDomains);
     }
 
     $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
@@ -107,7 +205,11 @@ foreach ($sqlFiles as $sqlPath) {
     $outName = $dbName . '.sql' . ($refused ? '.flagged' : '');
     $outPath = $outDir . '/' . $outName;
     $finalPath = $finalPrefix . '/' . $outName;
-    file_put_contents($outPath, $rewritten);
+    if (!@rename($tmpOutPath, $outPath)) {
+        fwrite(STDERR, "scan-dbs: WARN failed to rename $tmpOutPath -> $outPath; skipping\n");
+        @unlink($tmpOutPath);
+        continue;
+    }
 
     $databases[] = [
         'dbname' => $dbName,
@@ -159,6 +261,124 @@ function engine_swap(string $sql): array {
     return [$rewritten ?? $sql, ['myisam_to_innodb' => $count]];
 }
 
+/**
+ * Streaming engine_swap. Same regex, but the number of replacements made
+ * in this chunk is added to &$counter so the caller can accumulate a
+ * total across many chunks.
+ *
+ * On a PCRE failure the chunk is passed through unmodified and $counter
+ * is left untouched, so the reported total never includes swaps that
+ * were not actually written.
+ */
+function engine_swap_chunk(string $chunk, int &$counter): string {
+    $swapped = 0;
+    $rewritten = preg_replace(
+        '/\bENGINE\s*=\s*MyISAM\b/i',
+        'ENGINE=InnoDB',
+        $chunk,
+        -1,
+        $swapped
+    );
+    if ($rewritten === null) {
+        fwrite(STDERR, 'scan-dbs: WARN engine swap regex failed (PCRE error ' . preg_last_error() . "); chunk left as-is\n");
+        return $chunk;
+    }
+    $counter += $swapped;
+    return $rewritten;
+}
+
+/**
+ * Streaming WP-dump fingerprint accumulator. The four checks are
+ * independent — any chunk that flips a flag from false to true keeps it
+ * true for the rest of the scan, even if subsequent chunks don't match.
+ * Caller AND-folds the four flags at finalization.
+ */
+function is_wp_chunk_scan(string $chunk, array &$flags): void {
+    if (!$flags['has_options'] && preg_match('/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i', $chunk)) {
+        $flags['has_options'] = true;
+    }
+    if (!$flags['has_posts'] && preg_match('/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i', $chunk)) {
+        $flags['has_posts'] = true;
+    }
+    if (!$flags['has_users'] && preg_match('/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i', $chunk)) {
+        $flags['has_users'] = true;
+    }
+    if (!$flags['has_option_sentinel'] && preg_match("/'siteurl'|'home'|'template'|'stylesheet'/", $chunk)) {
+        $flags['has_option_sentinel'] = true;
+    }
+}
+
+/**
+ * Streaming siteurl/home extractor. Looks for the literal
+ *   'option_name', 'option_value'
+ * pair in second/third position of any parenthesised tuple in the chunk
+ * (the match is not anchored to an INSERT INTO options statement).
+ * Stores into the accumulator only if not already populated — we keep the
+ * FIRST occurrence (which in practice is the live value; subsequent
+ * occurrences are duplicates from re-inserts or trigger noise).
+ *
+ * Best-effort: skips multi-row INSERTs where the option_name/value
+ * tuple is split across a chunk boundary. The non-streaming
+ * extract_wp_options() is still available for the small-file path if
+ * we ever reinstate it.
+ */
+function wp_options_chunk_scan(string $chunk, array &$values): void {
+    // Match a tuple of the form (anything, 'option_name', 'option_value', ...).
+    // Only single-quoted SQL strings are recognised (what mysqldump emits);
+    // backslash escapes such as \' are consumed as a unit. The content class
+    // excludes the backslash itself so the two alternatives never overlap,
+    // and the possessive quantifiers stop PCRE from backtracking through a
+    // multi-megabyte chunk and tripping pcre.backtrack_limit.
+    $found = preg_match_all(
+        "/\(\s*[^,]++,\s*'((?:\\\\.|[^'\\\\])*+)'\s*,\s*'((?:\\\\.|[^'\\\\])*+)'/s",
+        $chunk,
+        $matches,
+        PREG_SET_ORDER
+    );
+    if ($found === false) {
+        // A PCRE failure is not "no matches": say so, because a silently
+        // skipped chunk can hide the siteurl/home row from the content scan.
+        fwrite(STDERR, 'scan-dbs: WARN option tuple regex failed (PCRE error ' . preg_last_error() . ")\n");
+        return;
+    }
+    foreach ($matches as $m) {
+        $name = stripslashes($m[1]);
+        if (!in_array($name, ['siteurl', 'home', 'template', 'stylesheet'], true)) continue;
+        if (isset($values[$name])) continue;
+        $values[$name] = stripslashes($m[2]);
+    }
+}
+
+/**
+ * Pull the wp_content_scan finalization out of the original function so
+ * the streaming path can call it with a pre-built option-values map
+ * instead of re-scanning the full SQL string.
+ */
+function wp_content_scan_from_values(array $optionValues, array $allowedDomains): array {
+    $flags = [];
+    foreach (['siteurl', 'home'] as $optName) {
+        if (!isset($optionValues[$optName])) continue;
+        $val = $optionValues[$optName];
+        $host = parse_url($val, PHP_URL_HOST);
+        if ($host === null || $host === false || $host === '') continue;
+        if ($host === 'localhost' || filter_var($host, FILTER_VALIDATE_IP)) continue;
+        if (!domain_in_allowlist($host, $allowedDomains)) {
+            $flags[] = [
+                'severity' => 'high',
+                'code'     => 'siteurl_external_domain',
+                'details'  => sprintf(
+                    "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
+                    $optName,
+                    json_encode($val),
+                    $host,
+                    empty($allowedDomains) ? 'NONE; could not discover from cpmove userdata' : implode(', ', $allowedDomains)
+                ),
+            ];
+        }
+    }
+    return $flags;
+}
+
 /**
  * Identify WordPress by the canonical core-table CREATE statements.
  *