diff --git a/scripts/scan-dbs.php b/scripts/scan-dbs.php
index 2190694..1c98736 100755
--- a/scripts/scan-dbs.php
+++ b/scripts/scan-dbs.php
@@ -83,22 +83,105 @@ foreach ($sqlFiles as $sqlPath) {
     fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n");
 
     $sizeBytes = filesize($sqlPath) ?: 0;
-    $sql = file_get_contents($sqlPath);
-    if ($sql === false) {
-        fwrite(STDERR, "scan-dbs: WARN failed to read $sqlPath; skipping\n");
+
+    // STREAMING POSTURE: load + scan the SQL file in 4MB chunks instead
+    // of one file_get_contents. cPanel customer dumps in the wild routinely
+    // hit 5-10GB (large WooCommerce + media metadata); a 128MB-default PHP
+    // memory_limit + the 2GB cgroup on the container both fail well below
+    // those sizes. Streaming bounds the resident memory to a few MB
+    // regardless of dump size.
+    //
+    // Per-chunk passes:
+    //   - engine_swap_chunk:     same regex as engine_swap() but on a chunk,
+    //                            plus a small carryover buffer at the end so
+    //                            a pattern split across a chunk boundary
+    //                            still matches.
+    //   - is_wp_chunk_scan:      OR-folds the three CREATE TABLE detections
+    //                            + the option_name sentinel into a state
+    //                            dict that finalizes after the last chunk.
+    //   - wp_options_chunk_scan: extracts siteurl/home option values when
+    //                            they appear in this chunk's portion of an
+    //                            INSERT INTO options statement.
+    //
+    // We open input + output streams, push each rewritten chunk to the
+    // output as we go, and keep only the per-chunk regex carryover (up to
+    // 128 bytes — longer than any ENGINE=... or 'siteurl' string).
+    $fin = @fopen($sqlPath, 'rb');
+    if (!$fin) {
+        fwrite(STDERR, "scan-dbs: WARN failed to open $sqlPath; skipping\n");
         continue;
     }
 
-    // --- ENGINE SWAP (always applied) -------------------------------------
+    // We write to a tmp file first; renamed to the final {.sql, .sql.flagged}
+    // name once we know whether the wp content scan flagged the dump.
+    $tmpOutPath = $outDir . '/' . $dbName . '.sql.tmp';
+    $fout = @fopen($tmpOutPath, 'wb');
+    if (!$fout) {
+        fwrite(STDERR, "scan-dbs: WARN failed to open output $tmpOutPath; skipping\n");
+        fclose($fin);
+        continue;
+    }
 
-    [$rewritten, $engineCounts] = engine_swap($sql);
+    $engineSwapCount = 0;
+    $wpFlags = [
+        'has_options'        => false,
+        'has_posts'          => false,
+        'has_users'          => false,
+        'has_option_sentinel'=> false,
+    ];
+    $optionValues = [];              // siteurl / home etc. accumulated across chunks
+    $carry        = '';
+    $chunkSize    = 4 * 1024 * 1024; // 4 MB
+    $carryWindow  = 128;             // bytes carried over for boundary matches
 
-    // --- WordPress identification + content scan -------------------------
+    while (!feof($fin)) {
+        $buf = fread($fin, $chunkSize);
+        if ($buf === false || $buf === '') break;
 
-    $isWp = is_wordpress_dump($rewritten);
+        // Prepend any carry from the previous iteration so a pattern split
+        // across the boundary is still visible to the regex.
+        $chunk = $carry . $buf;
+
+        // Per-chunk passes.
+        $chunk = engine_swap_chunk($chunk, $engineSwapCount);
+        is_wp_chunk_scan($chunk, $wpFlags);
+        wp_options_chunk_scan($chunk, $optionValues);
+
+        // Keep the tail of this chunk as carry for the next pass, but
+        // ONLY when we're not at EOF — otherwise the carry never gets
+        // written. The carry length is the regex max-match window; for
+        // our patterns (`\bENGINE=MyISAM\b`, `'siteurl'`, etc.) 128
+        // bytes is generous.
+        if (!feof($fin) && strlen($chunk) > $carryWindow) {
+            $writeLen = strlen($chunk) - $carryWindow;
+            fwrite($fout, substr($chunk, 0, $writeLen));
+            $carry = substr($chunk, $writeLen);
+        } else {
+            fwrite($fout, $chunk);
+            $carry = '';
+        }
+    }
+
+    // Flush any tail still held back in $carry. The loop above only clears
+    // it when feof() is already true after a read; if the final fread()
+    // returned a full chunk that ended exactly at EOF, or the loop exited
+    // via the break (read error / empty read), those last bytes would
+    // otherwise never reach the output file.
+    if ($carry !== '') {
+        fwrite($fout, $carry);
+    }
+    fclose($fin);
+    fclose($fout);
+
+    $engineCounts = ['myisam_to_innodb' => $engineSwapCount];
+
+    // --- WordPress identification + content scan finalize -----------------
+
+    $isWp = $wpFlags['has_options'] && $wpFlags['has_posts']
+         && $wpFlags['has_users'] && $wpFlags['has_option_sentinel'];
     $flags = [];
     if ($isWp) {
-        $flags = wp_content_scan($rewritten, $allowedDomains);
+        $flags = wp_content_scan_from_values($optionValues, $allowedDomains);
     }
 
     $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high');
@@ -107,7 +190,11 @@ foreach ($sqlFiles as $sqlPath) {
     $outName = $dbName . '.sql' . ($refused ? '.flagged' : '');
     $outPath = $outDir . '/' . $outName;
     $finalPath = $finalPrefix . '/' . $outName;
-    file_put_contents($outPath, $rewritten);
+    if (!@rename($tmpOutPath, $outPath)) {
+        fwrite(STDERR, "scan-dbs: WARN failed to rename $tmpOutPath -> $outPath; skipping\n");
+        @unlink($tmpOutPath);
+        continue;
+    }
 
     $databases[] = [
         'dbname' => $dbName,
@@ -159,6 +246,104 @@ function engine_swap(string $sql): array {
     return [$rewritten ?? $sql, ['myisam_to_innodb' => $count]];
 }
 
+/**
+ * Streaming engine_swap. Same regex but the per-chunk count is mutated
+ * into &$counter so the caller can accumulate across many chunks.
+ */
+function engine_swap_chunk(string $chunk, int &$counter): string {
+    $rewritten = preg_replace_callback(
+        '/\bENGINE\s*=\s*MyISAM\b/i',
+        function () use (&$counter) { $counter++; return 'ENGINE=InnoDB'; },
+        $chunk
+    );
+    return $rewritten ?? $chunk;
+}
+
+/**
+ * Streaming WP-dump fingerprint accumulator. The four checks are
+ * independent — any chunk that flips a flag from false to true keeps it
+ * true for the rest of the scan, even if subsequent chunks don't match.
+ * Caller AND-folds the four flags at finalization.
+ */
+function is_wp_chunk_scan(string $chunk, array &$flags): void {
+    if (!$flags['has_options'] && preg_match('/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i', $chunk)) {
+        $flags['has_options'] = true;
+    }
+    if (!$flags['has_posts'] && preg_match('/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i', $chunk)) {
+        $flags['has_posts'] = true;
+    }
+    if (!$flags['has_users'] && preg_match('/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i', $chunk)) {
+        $flags['has_users'] = true;
+    }
+    if (!$flags['has_option_sentinel'] && preg_match("/'siteurl'|'home'|'template'|'stylesheet'/", $chunk)) {
+        $flags['has_option_sentinel'] = true;
+    }
+}
+
+/**
+ * Streaming siteurl/home extractor. Looks for the literal
+ *     'option_name', 'option_value'
+ * pair anywhere in the chunk (the INSERT INTO target table is not checked).
+ * Stores into the accumulator only if not already populated — we keep the
+ * FIRST occurrence (which in practice is the live value; subsequent
+ * occurrences are duplicates from re-inserts or trigger noise).
+ *
+ * Best-effort: skips multi-row INSERTs where the option_name/value
+ * tuple is split across a chunk boundary. The non-streaming
+ * extract_wp_options() is still available for the small-file path if
+ * we ever reinstate it.
+ */
+function wp_options_chunk_scan(string $chunk, array &$values): void {
+    // Match a tuple of the form (anything, 'option_name', 'option_value', ...)
+    // anywhere in the chunk. Only single-quoted SQL strings are recognised;
+    // backslash escapes such as \' inside them are consumed by the
+    // `(?:\\.|[^'])*` content pattern.
+    if (!preg_match_all(
+        "/\(\s*[^,]+,\s*'((?:\\\\.|[^'])*)'\s*,\s*'((?:\\\\.|[^'])*)'/",
+        $chunk,
+        $matches,
+        PREG_SET_ORDER
+    )) {
+        return;
+    }
+    foreach ($matches as $m) {
+        $name = stripslashes($m[1]);
+        if (!in_array($name, ['siteurl', 'home', 'template', 'stylesheet'], true)) continue;
+        if (isset($values[$name])) continue;
+        $values[$name] = stripslashes($m[2]);
+    }
+}
+
+/**
+ * Pull the wp_content_scan finalization out of the original function so
+ * the streaming path can call it with a pre-built option-values map
+ * instead of re-scanning the full SQL string.
+ */
+function wp_content_scan_from_values(array $optionValues, array $allowedDomains): array {
+    $flags = [];
+    foreach (['siteurl', 'home'] as $optName) {
+        if (!isset($optionValues[$optName])) continue;
+        $val = $optionValues[$optName];
+        $host = parse_url($val, PHP_URL_HOST);
+        if ($host === null || $host === false || $host === '') continue;
+        if ($host === 'localhost' || filter_var($host, FILTER_VALIDATE_IP)) continue;
+        if (!domain_in_allowlist($host, $allowedDomains)) {
+            $flags[] = [
+                'severity' => 'high',
+                'code' => 'siteurl_external_domain',
+                'details' => sprintf(
+                    "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
+                    $optName,
+                    json_encode($val),
+                    $host,
+                    empty($allowedDomains) ? 'NONE; could not discover from cpmove userdata' : implode(', ', $allowedDomains)
+                ),
+            ];
+        }
+    }
+    return $flags;
+}
+
 /**
  * Identify WordPress by the canonical core-table CREATE statements.
  *