From 5487dfc8f1d5bde09f13d7bde84eb7f9b6678e07 Mon Sep 17 00:00:00 2001 From: "Claude (bootstrap)" Date: Sat, 30 May 2026 19:56:57 -0700 Subject: [PATCH] Initial bootstrap: cpanel-importer sanitization sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skeleton for the cpanel-importer Docker container — a one-shot sandbox the WHP panel invokes BEFORE extracting a customer cpmove tarball. See cpanel-import-container-spec.md (in /workspace/) for the full design. What this ships in v1.0: - Dockerfile: almalinux:10-minimal + PHP 8.4 (Remi) + ClamAV 1.4 + SaneSecurity Foxhole.PHP rules + tar/mariadb-client/rsync. Runs as UID 999 (whp-import) via the panel-side --user 999:999 flag. - scripts/entrypoint.sh: validates env, runs (optional) freshclam, drives extract -> scan-files -> scan-dbs -> rsync -> report.json. - scripts/extract.sh + scripts/lib/scan-symlinks.php: pre-extract symlink scan ported standalone from web-files/libs/CpanelBackupImporter.php (the existing 2026-05-29 whp02 destruction-vector fix). Aborts with exit 3 before tar runs if any DANGEROUS symlink is found. - scripts/scan-files.php: ClamAV walk + classify-and-action. v1.0 ships with an empty cleaner registry — every hit is QUARANTINE_ONLY. Cleaner hooks are stubbed for v1.1. - scripts/scan-dbs.php: regex MyISAM -> InnoDB rewrite (always applied), WordPress identification, and ONE WP content scan check (siteurl_external_domain). v1.1 will grow the check set. - scripts/lib/safety-net.php: container-narrow open_basedir allow-list, much tighter than the panel-side one. - .gitea/workflows/build-push.yaml: builds + smoke-tests + PHP-syntax-checks + bash-syntax-checks before pushing to repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer. - tests/build-fixtures.sh: builds cpmove-clean.tar.gz (benign WP dump) and cpmove-alfa.tar.gz (the ALFA-shell symlink-to-/etc vector) for local end-to-end testing. 
- README.md / CONTRIBUTING.md: docker-run invocation, bind-mount catalog, report.json schema, how to add a cleaner pattern or a WP scan signature. Local acceptance test results: - clean fixture -> status=completed, 3 MyISAM->InnoDB, no flags, 0 - ALFA fixture -> exit 1, status=failed, failed_stage=extract, "tarball contains dangerous symlinks; aborting" on stderr - compromised-siteurl fixture -> imported_into_new_server=false, .flagged file written, summary_for_panel.show_alert=true Image size: 197 MB compressed (gzipped docker save), ~397 MB unique layers extracted. Well under the spec's 600 MB compressed / 1.2 GB extracted budget. Co-Authored-By: Claude Opus 4.7 (1M context) --- .editorconfig | 18 ++ .gitea/workflows/build-push.yaml | 98 ++++++++ .gitignore | 32 +++ CONTRIBUTING.md | 192 +++++++++++++++ Dockerfile | 166 +++++++++++++ LICENSE | 21 ++ README.md | 221 +++++++++++++++++ configs/freshclam.conf | 41 ++++ configs/sanesecurity-mirror.txt | 1 + scripts/entrypoint.sh | 219 +++++++++++++++++ scripts/extract.sh | 64 +++++ scripts/lib/safety-net.php | 46 ++++ scripts/lib/scan-symlinks.php | 161 +++++++++++++ scripts/scan-dbs.php | 399 +++++++++++++++++++++++++++++++ scripts/scan-files.php | 216 +++++++++++++++++ tests/build-fixtures.sh | 113 +++++++++ tests/fixtures/.gitkeep | 0 17 files changed, 2008 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitea/workflows/build-push.yaml create mode 100644 .gitignore create mode 100644 CONTRIBUTING.md create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 configs/freshclam.conf create mode 100644 configs/sanesecurity-mirror.txt create mode 100755 scripts/entrypoint.sh create mode 100755 scripts/extract.sh create mode 100644 scripts/lib/safety-net.php create mode 100644 scripts/lib/scan-symlinks.php create mode 100755 scripts/scan-dbs.php create mode 100755 scripts/scan-files.php create mode 100755 tests/build-fixtures.sh create mode 100644 
tests/fixtures/.gitkeep diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..4027cb1 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,18 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_style = space +indent_size = 4 + +[*.{yaml,yml,json,md}] +indent_size = 2 + +[Dockerfile] +indent_size = 4 + +[Makefile] +indent_style = tab diff --git a/.gitea/workflows/build-push.yaml b/.gitea/workflows/build-push.yaml new file mode 100644 index 0000000..45a4797 --- /dev/null +++ b/.gitea/workflows/build-push.yaml @@ -0,0 +1,98 @@ +name: cpanel-importer Build and Push +run-name: ${{ gitea.actor }} pushed a change to trunk +on: + push: + branches: + - trunk + tags: + - '20[0-9][0-9].[0-9][0-9].[0-9]+' + +jobs: + Build-and-Push: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Gitea + uses: docker/login-action@v3 + with: + registry: repo.anhonesthost.net + username: ${{ secrets.CI_USER }} + password: ${{ secrets.CI_TOKEN }} + + # Compute the version tag. If the commit is on a `YYYY.MM.NNN` tag + # we tag the image with that version; otherwise we only tag :latest + # and :. + - name: Compute tags + id: tags + run: | + set -euo pipefail + SHA="${GITHUB_SHA:0:12}" + REG="repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer" + TAGS="${REG}:latest"$'\n'"${REG}:${SHA}" + # If this push includes a YYYY.MM.NNN tag, add it. 
+ VER_TAG="${GITHUB_REF_NAME:-}" + if [[ "${GITHUB_REF:-}" == refs/tags/* && "$VER_TAG" =~ ^20[0-9][0-9]\.[0-9][0-9]\.[0-9]+$ ]]; then + TAGS="${TAGS}"$'\n'"${REG}:${VER_TAG}" + fi + echo "tags<<EOF" >> "$GITHUB_OUTPUT" + echo "$TAGS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + echo "Resolved tags:" + echo "$TAGS" + + # First build locally (no push) so we can run a smoke test against + # the resolved image before pushing. The build is cached by Buildx + # so the push step below re-uses layers and is near-instant. + - name: Build Image (local, for smoke test) + uses: docker/build-push-action@v6 + with: + context: . + platforms: linux/amd64 + push: false + load: true + tags: cpanel-importer:smoke + no-cache: true + + - name: Smoke test — image starts and `echo ok` works + run: | + set -euo pipefail + # Override the entrypoint so we don't have to provide the full + # IMPORT_* env set just to verify the image runs. + out="$(docker run --rm --entrypoint /bin/echo cpanel-importer:smoke ok)" + if [[ "$out" != "ok" ]]; then + echo "smoke test failed: expected 'ok', got '$out'" + exit 1 + fi + echo "smoke test passed" + + - name: PHP syntax check + run: | + set -euo pipefail + for f in scripts/*.php scripts/lib/*.php; do + docker run --rm -v "$PWD:/src" --entrypoint php cpanel-importer:smoke -l "/src/$f" + done + + - name: Bash syntax check + run: | + set -euo pipefail + for f in scripts/*.sh tests/*.sh; do + docker run --rm -v "$PWD:/src" --entrypoint bash cpanel-importer:smoke -n "/src/$f" + done + + - name: Build and Push Image + uses: docker/build-push-action@v6 + with: + context: .
+ platforms: linux/amd64 + push: true + tags: ${{ steps.tags.outputs.tags }} + cache-from: type=registry,ref=repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer:latest + cache-to: type=inline diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..45ca3e9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +# Build artifacts +*.tar.gz +*.tgz +*.iso + +# Test fixtures are generated by tests/build-fixtures.sh — do NOT check in +# the synthetic tarballs themselves; rebuild from the script. +tests/fixtures/*.tar.gz +tests/fixtures/*.tgz + +# Local docker scratch +.docker-build/ +.docker-cache/ + +# Editor noise +.vscode/ +.idea/ +*.swp +*.swo +.DS_Store +Thumbs.db + +# Local secrets (should never exist, but defense in depth) +.env +.env.local +*.key +*.pem + +# Local test output +/tmp/ +test-output/ +*.log diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..67bf69d --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,192 @@ +# Contributing — cpanel-importer + +## How to add an auto-cleaner pattern + +Auto-cleaners live in `scripts/scan-files.php`, in the `$cleaners` +registry at the top of the main flow. + +A cleaner has three parts: + +```php +$cleaners['short-cleaner-name'] = [ + 'class' => 'KNOWN_REMOVABLE', // or 'REMOVABLE_WITH_BACKUP' + 'match' => fn(string $sig): bool => str_contains($sig, 'PHP.Trojan.EvalB64'), + 'clean' => function (string $path): bool { + // Read $path, transform, write back; return true on success. + // The file at $path is the LIVE extracted file — your edit + // here is what ends up in /host/sanitized//extracted/. + // The original has ALREADY been backed up to .original + // by the orchestrator before this is called. + }, +]; +``` + +### Safety checklist before merging a new cleaner + +1. **Backup is guaranteed.** The orchestrator copies the file to + `/.original` BEFORE calling `clean()`. Verify + this is still true in `scan-files.php` if you refactor the dispatch. +2. 
**Cleaner is idempotent.** Running it twice on the same file must + produce the same output the second time as the first. +3. **Cleaner is conservative.** If the file does NOT match your + transform exactly, return `false` (the orchestrator will fall back + to quarantining). Never "best-effort" a half-clean. +4. **Cleaner has a regression test.** Add a fixture under + `tests/fixtures/cleaner-/` with input + expected output, and + exercise it from `tests/run-tests.sh` (or your CI step). +5. **Cleaner classification is correct.** + - `KNOWN_REMOVABLE` = the whole pattern is known-safe to strip. + - `REMOVABLE_WITH_BACKUP` = legit file with injected lines; we are + confident in surgical removal but back up anyway. + - `QUARANTINE_ONLY` = no clean variant; don't write a `clean()`. +6. **Signature match is tight.** Prefer + `str_contains($sig, 'specific-sig-name')` over broad regex matches. + A false-positive cleaner can corrupt customer files. + +### Manual test loop + +```bash +docker build -t cpanel-importer:dev . +# Place a known-infected synthetic file under tests/fixtures/cleaner-X/in/ +# Run scan-files.php directly against it: +docker run --rm \ + --entrypoint /scripts/scan-files.php \ + -v "$PWD/tests/fixtures/cleaner-X/in:/tmp/extract" \ + -v "$PWD/tests/fixtures/cleaner-X/quarantine:/host/quarantine" \ + cpanel-importer:dev \ + --extract /tmp/extract --quarantine /host/quarantine \ + --report /tmp/r.json --import-id test +``` + +--- + +## How to add a WordPress content scan signature + +Scan checks live in `scripts/scan-dbs.php`, in `wp_content_scan()`. + +Each check should produce a flag dict on hit: + +```php +$flags[] = [ + 'severity' => 'high', // 'high' refuses the DB (per default threshold N=1) + // 'medium' / 'low' flag in the report but allow import + 'code' => 'short_machine_readable_code', + 'details' => 'Human-readable explanation including the matched value(s).', +]; +``` + +### Safety checklist + +1. 
**Severity reflects confidence.** Use `high` only when a false + positive is acceptable for the customer (they re-import via the + "import anyway" UI button). Errors of measurement here translate + directly to admin support tickets. +2. **Check is fast.** The whole `.sql` dump is in memory as a string; + prefer `preg_match` on the raw string or a pre-built map (see + `extract_wp_options()`) over re-parsing the full dump. +3. **Check is well-tested.** Add a fixture under + `tests/fixtures/wp-scan-/` with a synthetic dump that + triggers the flag and one that does not. +4. **Allow-list awareness.** If the check is comparing a value against + the customer's domain list, use + `domain_in_allowlist($host, $allowedDomains)` so subdomain matches + work consistently with the rest of the scanner. +5. **Don't break engine swap.** `wp_content_scan()` runs AFTER the + engine swap on the same `$rewritten` string. Both your check and + the engine swap must be tolerant of each other's output. + +--- + +## How to test locally + +### Build the image + +```bash +docker build -t cpanel-importer:dev . +``` + +Confirm the image is under the budget: + +```bash +docker images cpanel-importer:dev --format '{{.Size}}' +``` + +Target: < 1 GB extracted (spec asks < 600 MB compressed for prod, but +local builds typically come in around 700–900 MB extracted including +ClamAV signature DBs). + +### Build the fixtures + +```bash +bash tests/build-fixtures.sh +``` + +Two tarballs land under `tests/fixtures/`: +- `cpmove-clean.tar.gz` — a benign cpmove with a WordPress MyISAM dump. +- `cpmove-alfa.tar.gz` — same shape PLUS an ALFA-style symlink to /etc. 
+ +### Run against the clean fixture + +```bash +mkdir -p /tmp/test-quarantine /tmp/test-sanitized +docker run --rm \ + -e IMPORT_ID=test-clean \ + -e IMPORT_USERNAME=testuser \ + -e IMPORT_BACKUP_FILE=/host/backup/cpmove-clean.tar.gz \ + -e CLAMAV_REFRESH=false \ + -v "$PWD/tests/fixtures/cpmove-clean.tar.gz:/host/backup/cpmove-clean.tar.gz:ro" \ + -v /tmp/test-quarantine:/host/quarantine \ + -v /tmp/test-sanitized:/host/sanitized \ + cpanel-importer:dev +``` + +Expect `status=completed`, MyISAM count > 0, no flags, exit 0. + +### Run against the ALFA fixture + +```bash +docker run --rm \ + -e IMPORT_ID=test-alfa \ + -e IMPORT_USERNAME=testuser \ + -e IMPORT_BACKUP_FILE=/host/backup/cpmove-alfa.tar.gz \ + -e CLAMAV_REFRESH=false \ + -v "$PWD/tests/fixtures/cpmove-alfa.tar.gz:/host/backup/cpmove-alfa.tar.gz:ro" \ + -v /tmp/test-quarantine:/host/quarantine \ + -v /tmp/test-sanitized:/host/sanitized \ + cpanel-importer:dev +``` + +Expect non-zero exit, `status=failed`, `failed_stage=extract`, and +stderr from inside the container containing +`tarball contains dangerous symlinks; aborting`. + +### Iterating on PHP / shell scripts + +The `scripts/` directory is `COPY`ed in late in the Dockerfile, so +edits there only re-trigger the last layer of the build — typical +turnaround is ~5 seconds. + +--- + +## Code style + +- Bash scripts: `set -euo pipefail`, absolute paths only, every external + command on its own logical line, comment each non-obvious flag. +- PHP scripts: 4-space indent, single quotes for non-interpolated + strings, ``. +- All scripts must be idempotent — the worker may be re-run against the + same `IMPORT_ID` on retry; second runs must overwrite the prior + `report.json` cleanly. + +--- + +## CI + +Pushes to `trunk` build + push the image to +`repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer:latest` and +`...:`. Pushes of a `YYYY.MM.NNN` tag additionally tag +`...:YYYY.MM.NNN`. 
CI runs the smoke test (image starts and +`echo ok` runs) and PHP `-l` / `bash -n` syntax checks on every script +before pushing. + +See `.gitea/workflows/build-push.yaml`. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e50d8ad --- /dev/null +++ b/Dockerfile @@ -0,0 +1,166 @@ +# cpanel-importer — sanitization sandbox for cPanel cpmove tarballs. +# +# See cpanel-import-container-spec.md §1 for the full design. +# +# Build: docker build -t cpanel-importer:dev . +# Run: see README.md for the docker run invocation the WHP panel uses. + +FROM almalinux:10-minimal + +LABEL org.opencontainers.image.title="cpanel-importer" +LABEL org.opencontainers.image.description="cPanel cpmove sanitization sandbox (ClamAV + SaneSecurity + WP content scan)" +LABEL org.opencontainers.image.source="https://repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer" +LABEL org.opencontainers.image.licenses="MIT" + +ARG TARGETARCH=amd64 +# UID/GID of the unprivileged worker. Matches the spec — panel calls +# `docker run --user 999:999`, so this UID must actually exist inside the +# image (the EPEL `clamav` and `php` user accounts collide with low UIDs; +# 999 is well clear of them). +ARG WHP_UID=999 +ARG WHP_GID=999 + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + PHP_INI_DIR=/etc/php.d + +# Single RUN to minimize layers and image size. Cleans dnf cache and +# the SaneSecurity rsync temp files at the end of the layer. +# +# Pinning strategy: +# - PHP 8.4: AlmaLinux 10 stock ships PHP 8.3 only; the spec asks for +# 8.4 specifically. We add Remi's modular repo and enable the +# `php:remi-8.4` stream. We DO NOT pin to a specific 8.4.X because +# Remi rolls security patches into the same minor and an exact pin +# would block updates. +# - clamav / clamav-update: track the AL10 EPEL stream. CI builds +# monthly so signature DB age is bounded. 
+# - SaneSecurity: rsync at build time, then again at container start +# via `freshclam` (with the SaneSecurity third-party DBs configured). +# +# Ordering note: clamav-filesystem's RPM scripts auto-create a +# `virusgroup` system group at the next free GID. If we let dnf install +# clamav first, that lands at GID 999 — which then collides with the +# UID/GID we want for whp-import. We pre-create our user FIRST so +# virusgroup ends up at 998. +RUN set -eux; \ + # microdnf is what almalinux:10-minimal ships with by default. + microdnf -y install --setopt=install_weak_deps=0 \ + epel-release \ + dnf \ + shadow-utils \ + ; \ + # Add Remi's repo for PHP 8.4 (AL10 stock has 8.3 only). + dnf -y --setopt=install_weak_deps=0 install \ + https://rpms.remirepo.net/enterprise/remi-release-10.rpm ; \ + dnf -y --setopt=install_weak_deps=0 module reset php ; \ + dnf -y --setopt=install_weak_deps=0 module enable php:remi-8.4 ; \ + # Pre-create the worker BEFORE installing clamav so virusgroup + # doesn't claim our GID. + groupadd --system --gid ${WHP_GID} whp-import ; \ + useradd --system --uid ${WHP_UID} --gid ${WHP_GID} \ + --home-dir /opt/whp --no-create-home \ + --shell /sbin/nologin whp-import ; \ + dnf -y --setopt=install_weak_deps=0 install \ + php-cli \ + php-json \ + php-mbstring \ + php-pdo \ + php-mysqlnd \ + php-xml \ + php-zip \ + php-process \ + clamav \ + clamav-update \ + tar \ + gzip \ + bzip2 \ + xz \ + mariadb \ + rsync \ + ca-certificates \ + coreutils-single \ + findutils \ + which \ + ; \ + mkdir -p /opt/whp /scripts /host/backup /host/quarantine /host/sanitized \ + /var/lib/clamav /var/log/clamav ; \ + # /opt/whp + /var/log/clamav owned by worker now. /var/lib/clamav + # ownership is set AFTER the freshclam build-time pull below — root + # has to be able to write there during the build. + chown -R whp-import:whp-import /opt/whp /var/log/clamav ; \ + # /host/quarantine and /host/sanitized are the bind-mount RW + # targets. 
The panel chowns the HOST paths to UID 999 before + # invocation (see README.md). When the host path is empty Docker + # copies the IMAGE-side dir's ownership onto the new volume; we + # need that ownership to be whp-import so an empty bind mount on + # those paths still results in a writable volume. (Bind mounts to + # an EXISTING host dir keep host ownership and are independent of + # this — the panel sets up its own dirs with mode 750 owner 999.) + chown whp-import:whp-import /host/quarantine /host/sanitized ; \ + # Strip dnf cache. + dnf -y clean all ; \ + rm -rf /var/cache/dnf /var/cache/yum /var/cache/ldconfig/* \ + /usr/share/doc /usr/share/man /usr/share/info + +# Pre-seed ClamAV signature databases at build time so the first +# container run isn't dependent on freshclam succeeding before the scan. +# +# We do two passes: +# 1. freshclam (mainline ClamAV signatures: main.cvd, daily.cvd, bytecode.cvd). +# 2. rsync the SaneSecurity Foxhole.PHP DB — PHP-malware-focused, this +# is the high-value addition for our use case. Junkemailfilter rules +# are deliberately skipped (we don't scan email here). +# +# Both runs are wrapped in `|| true` so a transient network failure +# during build does not break the image build; the container also runs +# `freshclam` on start so a stale baseline gets refreshed at runtime. +COPY configs/freshclam.conf /etc/freshclam.conf +COPY configs/sanesecurity-mirror.txt /opt/whp/sanesecurity-mirror.txt + +# Pre-seed signatures as root, then chown the result. We don't ship the +# privilege-switching tools (runuser/su are in util-linux full, ~2MB we +# don't need at runtime) — the worker only needs to READ /var/lib/clamav +# and the runtime freshclam refresh runs as the same UID 999 anyway, so +# ownership matters there. +RUN set -eux; \ + chown whp-import:whp-import /etc/freshclam.conf ; \ + # Mainline ClamAV DB pull at build time so we have something to scan + # against even if the runtime freshclam refresh fails (e.g., no net). 
+ # freshclam has a compile-time default --user=clamupdate (UID 997) + # and tries to setuid() to it; the build-time dir is whp-import-owned + # so we tell it explicitly to stay as root for this one-shot pull. + freshclam --no-warnings --user=root || \ + echo "WARN: freshclam failed during build; runtime refresh will retry" ; \ + # SaneSecurity Foxhole.PHP rules. The project rotates mirrors; the + # file we COPYed lists the working rsync mirror used at build time. + SANE_MIRROR="$(cat /opt/whp/sanesecurity-mirror.txt)" ; \ + rsync -av --no-motd --contimeout=30 \ + --include='foxhole_filename.cdb' \ + --include='foxhole_filename.cdb.sig' \ + --include='foxhole_generic.cdb' \ + --include='foxhole_generic.cdb.sig' \ + --include='foxhole_js.cdb' \ + --include='foxhole_js.cdb.sig' \ + --include='foxhole_js.ndb' \ + --include='foxhole_js.ndb.sig' \ + --include='foxhole_mail.cdb' \ + --include='foxhole_mail.cdb.sig' \ + --include='foxhole_all.ndb' \ + --include='foxhole_all.ndb.sig' \ + --exclude='*' \ + "rsync://${SANE_MIRROR}/sanesecurity/" /var/lib/clamav/ \ + || echo "WARN: SaneSecurity rsync failed during build; runtime freshclam will retry" ; \ + chown -R whp-import:whp-import /var/lib/clamav ; \ + chmod -R u=rwX,g=rX,o= /var/lib/clamav ; \ + ls -la /var/lib/clamav/ + +COPY --chown=whp-import:whp-import scripts/ /scripts/ +RUN chmod 0755 /scripts/entrypoint.sh /scripts/extract.sh \ + /scripts/scan-files.php /scripts/scan-dbs.php + +WORKDIR /opt/whp +USER whp-import + +# stdin is closed — the container reads its inputs from env + bind mounts. 
+ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c063889 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 An Honest Host, LLC / cloud-hosting-platform contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..cee07e8 --- /dev/null +++ b/README.md @@ -0,0 +1,221 @@ +# cpanel-importer + +A **sanitization sandbox** for cPanel `cpmove` tarballs, run as a one-shot +Docker container before WHP imports a customer site. + +It is **not** a full importer. The container: + +1. extracts the cpmove tarball into a tmpfs scratch dir (after a + pre-extract symlink scan), +2. runs ClamAV (with SaneSecurity PHP-malware rules) over every file, + quarantining hits, +3. rewrites `ENGINE=MyISAM` → `ENGINE=InnoDB` in every `.sql` dump, +4. runs a WordPress content scan on each WP dump and refuses dumps with + high-confidence malware signals (e.g. 
`siteurl` pointing at a + non-customer domain), +5. rsyncs the cleaned tree to `/host/sanitized/<IMPORT_ID>/`, +6. emits a JSON report describing every action taken. + +The WHP panel reads `/host/sanitized/<IMPORT_ID>/report.json` after the +container exits and hands the cleaned files off to the existing +`CpanelBackupImporter` flow (Linux-user create, MySQL DB create, file +rsync, DNS push, container provision, etc.). + +**Full design:** `/workspace/cpanel-import-container-spec.md` (also +checked in at `docs/cpanel-import-container-spec.md` when this repo is +mirrored to the panel). + +**Panel-side glue:** `/workspace/whp/web-files/libs/CpanelBackupImporter.php` ++ `web-files/api/cpanel-import-ajax.php` + `web-files/pages/cpanel-import-results.php`. + +--- + +## How the panel invokes it + +```bash +docker run \ + --rm \ + --name whp-cpanel-import-${IMPORT_ID} \ + --network client-net \ + --user 999:999 \ + --cap-drop=ALL \ + --security-opt=no-new-privileges \ + --read-only \ + --tmpfs /tmp:rw,nosuid,nodev,exec,size=4g \ + --tmpfs /var/lib/clamav:rw,nosuid,nodev,size=512m \ + --volume /docker/users/${USERNAME}/userfiles/${BACKUP_NAME}:/host/backup/${BACKUP_NAME}:ro \ + --volume /docker/users/${USERNAME}/.cpanel-import-quarantine:/host/quarantine:rw \ + --volume /docker/users/${USERNAME}/.cpanel-import-sanitized:/host/sanitized:rw \ + --env IMPORT_ID=${IMPORT_ID} \ + --env IMPORT_USERNAME=${USERNAME} \ + --env IMPORT_BACKUP_FILE=/host/backup/${BACKUP_NAME} \ + --env CLAMAV_REFRESH=true \ + --memory=4g \ + --memory-swap=4g \ + --cpus=2 \ + --pull=missing \ + repo.anhonesthost.net/cloud-hosting-platform/cpanel-importer:2026.05.NNN +``` + +Container exits with status `0` on success, non-zero on any failure +(missing/unreadable backup, dangerous symlink found, scanner error). +Even on failure, `/host/sanitized/<IMPORT_ID>/report.json` is written +with `"status": "failed"` and the failing stage.
+ +--- + +## Bind-mount catalog + +| Host path | Container path | Mode | Purpose | +|---|---|---|---| +| `/docker/users//userfiles/` | `/host/backup/` | RO | the cpmove input | +| `/docker/users//.cpanel-import-quarantine/` | `/host/quarantine/` | RW | files moved here on ClamAV hit | +| `/docker/users//.cpanel-import-sanitized//` | `/host/sanitized/` | RW | cleaned output the panel reads | + +Anything not listed here is **not** visible to the container. No `/etc`, +no `/usr`, no `/root`, no `/home`, no `docker.sock`. The worker runs as +UID/GID 999 with `--cap-drop=ALL --read-only`. + +--- + +## `report.json` schema + +Written to `/host/sanitized//report.json` at the end of every +run, success or failure. + +### Success + +```json +{ + "import_id": "import_abc123", + "status": "completed", + "scan_duration_seconds": 143, + "files_scanned": 28471, + "files_clean": 28432, + "files_cleaned": 0, + "files_quarantined": 39, + "actions": [ + { + "path": "cpmove-testuser/homedir/public_html/example.com/ALFA_DATA/index.php", + "signature": "PHP.Webshell.ALFA", + "action": "quarantined", + "cleaner": null, + "backup": "/host/quarantine/import_abc123/cpmove-testuser/homedir/public_html/example.com/ALFA_DATA/index.php" + } + ], + "databases": [ + { + "dbname": "testuser_wp", + "size_bytes": 5393199573, + "engine_changes": { + "myisam_to_innodb": 17, + "row_format_dynamic_applied": 0, + "fulltext_indexes_dropped": 0 + }, + "wp_content_scan": { + "is_wordpress": true, + "flags": [ + { + "severity": "high", + "code": "siteurl_external_domain", + "details": "wp_options.siteurl = \"http://evil.tld\" — host 'evil.tld' not in allowed domain list (example.com)" + } + ] + }, + "imported_into_new_server": false, + "flagged_sql_path": "/host/sanitized/import_abc123/mysql/testuser_wp.sql.flagged" + } + ], + "summary_for_panel": { + "show_alert": true, + "alert_severity": "warning", + "alert_message": "39 files quarantined + 0 cleaned in place; 1 database(s) refused as compromised. ..." 
+ } +} +``` + +### Failure + +```json +{ + "import_id": "import_abc123", + "status": "failed", + "failed_stage": "extract", + "error": "scan-symlinks.php exited non-zero — tarball contains DANGEROUS symlinks", + "scan_duration_seconds": 4, + "files": null, + "databases": null +} +``` + +`failed_stage` is one of: `validate_env`, `freshclam`, `extract`, +`scan_files`, `scan_dbs`, `rsync_out`, `write_report`. + +--- + +## Local development + +```bash +# Build the image +docker build -t cpanel-importer:dev . + +# Build the synthetic fixture tarballs +bash tests/build-fixtures.sh + +# Run against the clean fixture +mkdir -p /tmp/test-quarantine /tmp/test-sanitized +docker run --rm \ + -e IMPORT_ID=test \ + -e IMPORT_USERNAME=testuser \ + -e IMPORT_BACKUP_FILE=/host/backup/cpmove-clean.tar.gz \ + -e CLAMAV_REFRESH=false \ + -v "$(pwd)/tests/fixtures/cpmove-clean.tar.gz:/host/backup/cpmove-clean.tar.gz:ro" \ + -v /tmp/test-quarantine:/host/quarantine \ + -v /tmp/test-sanitized:/host/sanitized \ + cpanel-importer:dev +cat /tmp/test-sanitized/test/report.json + +# Run against the ALFA-symlink fixture — must exit non-zero with a +# "dangerous symlinks" message and report.json should have +# status=failed, failed_stage=extract. +docker run --rm \ + -e IMPORT_ID=test-alfa \ + -e IMPORT_USERNAME=testuser \ + -e IMPORT_BACKUP_FILE=/host/backup/cpmove-alfa.tar.gz \ + -e CLAMAV_REFRESH=false \ + -v "$(pwd)/tests/fixtures/cpmove-alfa.tar.gz:/host/backup/cpmove-alfa.tar.gz:ro" \ + -v /tmp/test-quarantine:/host/quarantine \ + -v /tmp/test-sanitized:/host/sanitized \ + cpanel-importer:dev \ + && echo "BUG: should have exited non-zero" \ + || echo "OK: refused dangerous tarball" +cat /tmp/test-sanitized/test-alfa/report.json +``` + +--- + +## What is in this v1.0 vs. 
what is stubbed for v1.1+ + +| Feature | v1.0 | v1.1 | +|---|---|---| +| Pre-extract symlink scan | full port of `scanTarballForDangerousSymlinks` | – | +| Hardened tar extract | yes | – | +| ClamAV + SaneSecurity Foxhole.PHP rules | yes | – | +| File classification | quarantine-on-every-hit | KNOWN_REMOVABLE + REMOVABLE_WITH_BACKUP cleaners | +| MyISAM → InnoDB rewrite | yes | – | +| WP identification | yes (wp_options + wp_posts + wp_users + sentinel) | – | +| WP content scan | siteurl_external_domain only | post_content script-injection, theme/stylesheet malware patterns, user_pass leaked-hash, Wordfence regex | +| ROW_FORMAT=DYNAMIC, FULLTEXT drop | stubbed (always 0) | yes | +| Sandboxed MariaDB-in-container for SQL transforms | not present (regex transforms only) | yes | + +See `CONTRIBUTING.md` for how to add a cleaner pattern or a new WP scan +signature. + +--- + +## References + +- Spec: `/workspace/cpanel-import-container-spec.md` +- Panel-side importer: `/workspace/whp/web-files/libs/CpanelBackupImporter.php` +- WHP panel `safety-net.php`: `/workspace/whp/web-files/includes/safety-net.php` +- Existing CI workflow for sibling project: `/workspace/cloud-apache-container/.gitea/workflows/build-push.yaml` diff --git a/configs/freshclam.conf b/configs/freshclam.conf new file mode 100644 index 0000000..5ce42e6 --- /dev/null +++ b/configs/freshclam.conf @@ -0,0 +1,41 @@ +# cpanel-importer freshclam config. +# +# Minimal subset of /etc/freshclam.conf that the EL `clamav-update` +# package ships. We run freshclam at image build time AND at container +# start time (via entrypoint.sh when CLAMAV_REFRESH=true) so the rules +# DB is reasonably current. +# +# Anything not listed here uses the package defaults. + +DatabaseDirectory /var/lib/clamav +UpdateLogFile /var/log/clamav/freshclam.log +LogVerbose no +LogTime yes +LogFileMaxSize 10M +Foreground yes +# NOTE: DatabaseOwner is intentionally omitted. 
At build time freshclam +# runs as root and we chown the DB to whp-import after the pull. At +# runtime the entrypoint is already running as UID 999 (whp-import) via +# the docker `--user 999:999` flag, so no privilege drop is needed — +# leaving DatabaseOwner set would cause freshclam to refuse to start as +# whp-import (it tries to setuid to its configured DatabaseOwner before +# accepting the running uid is already that user). + +# Mainline ClamAV signatures. +DatabaseMirror database.clamav.net + +# Bound the SaneSecurity refresh attempts. SaneSecurity rules are +# secondary defense for us; the mainline ClamAV DB is the primary. +Checks 12 +ConnectTimeout 30 +ReceiveTimeout 60 + +# Skip the bytecode signatures — they target binary malware and add ~30 +# MB to the rules DB with limited payoff against PHP webshells. +# (Comment out the next line to re-enable.) +Bytecode no + +# Proxy support left at compile-time defaults (none). To enable, set +# HTTPProxyServer and HTTPProxyPort . We deliberately do +# NOT emit empty values for these — freshclam rejects empty option +# values with "Missing argument for option" and refuses to start. diff --git a/configs/sanesecurity-mirror.txt b/configs/sanesecurity-mirror.txt new file mode 100644 index 0000000..35004ab --- /dev/null +++ b/configs/sanesecurity-mirror.txt @@ -0,0 +1 @@ +rsync.sanesecurity.net diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh new file mode 100755 index 0000000..d3125a3 --- /dev/null +++ b/scripts/entrypoint.sh @@ -0,0 +1,219 @@ +#!/usr/bin/env bash +# +# entrypoint.sh — main controller for the cpanel-importer sandbox. +# +# Inputs (env, set by the panel's docker run): +# IMPORT_ID unique id for this run; used in quarantine + report paths +# IMPORT_USERNAME cPanel/WHP username the cpmove belongs to +# IMPORT_BACKUP_FILE absolute path inside the container, typically +# /host/backup/cpmove-.tar.gz +# CLAMAV_REFRESH "true" to run freshclam at start (default: true) +# +# Flow (spec §0): +# 1. 
#      validate env
#   2. (optional) refresh ClamAV signatures
#   3. extract     → /tmp/extract/
#   4. file scan   → /tmp/scan-files-report.json
#   5. DB sanitize → /tmp/sanitized/mysql/, /tmp/scan-dbs-report.json
#   6. rsync /tmp/sanitized/ → /host/sanitized/<id>/
#   7. write /host/sanitized/<id>/report.json (merged)
#
# On failure at any stage we still write a partial report.json with
# status="failed" + the stage that broke, then exit non-zero.

set -euo pipefail

# --- logging ---------------------------------------------------------------

# ts: print the current UTC time as an ISO-8601 timestamp.
ts() { date -u +'%Y-%m-%dT%H:%M:%SZ'; }

# log: print one timestamped line on stdout.
log() { printf '[%s] %s\n' "$(ts)" "$*"; }

# die: log a FATAL line, persist a failure report for the current STAGE,
# then exit 1. write_failure_report is resolved at call time, so defining
# it further down is fine.
die() { log "FATAL: $*"; write_failure_report "$STAGE" "$*"; exit 1; }

# Buffered partial state. The final report.json is written by the
# write_report stage at the end of this script; if we fail before then,
# write_failure_report emits a minimal status="failed" report instead.
STAGE="init"
START_TS="$(date -u +%s)"

#######################################
# Encode one string as a JSON string literal (quotes included).
# Arguments: $1 - raw string
# Outputs:   the JSON literal on stdout; the fixed literal
#            "(unencodable)" if php is unavailable or encoding fails
# Returns:   always 0 (must never abort the failure path)
#######################################
json_string() {
  local encoded
  if encoded=$(php -r '
      $j = json_encode((string) $argv[1], JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE);
      if ($j === false) exit(1);
      echo $j;
    ' -- "$1" 2>/dev/null) && [[ -n "$encoded" ]]; then
    printf '%s' "$encoded"
  else
    printf '%s' '"(unencodable)"'
  fi
}

#######################################
# Persist a minimal status="failed" report for the panel.
# Globals:   IMPORT_ID (read, optional), START_TS (read)
# Arguments: $1 - stage that failed, $2 - human-readable error message
# Outputs:   /host/sanitized/<id>/report.json ("unknown" when IMPORT_ID
#            is unset); WARN lines via log when it cannot be written
# Returns:   always 0 — the report writer must never abort the script
#######################################
write_failure_report() {
  local stage="$1"
  local msg="$2"
  local out_dir="/host/sanitized/${IMPORT_ID:-unknown}"
  # mkdir AND the report write can both fail (mount RO, missing
  # /host/sanitized, etc.); we log a warning for each failure and never
  # let the report-writer abort the script.
  if ! mkdir -p "$out_dir" 2>/dev/null; then
    log "WARN: failure-report mkdir failed for $out_dir; report will not be persisted"
    return 0
  fi
  # Every free-text field goes through json_string so quotes, backslashes
  # or newlines in the id / stage / message cannot corrupt the JSON.
  if ! cat > "$out_dir/report.json" 2>/dev/null <<JSON
{
  "import_id": $(json_string "${IMPORT_ID:-unknown}"),
  "status": "failed",
  "failed_stage": $(json_string "$stage"),
  "error": $(json_string "$msg"),
  "scan_duration_seconds": $(( $(date -u +%s) - START_TS )),
  "files": null,
  "databases": null
}
JSON
  then
    log "WARN: failure-report write failed for $out_dir/report.json"
  fi
}

# --- env validation --------------------------------------------------------

STAGE="validate_env"
log "cpanel-importer starting (container UID=$(id -u) GID=$(id -g))"

# Validate through die (not ${VAR:?}) so a missing variable still produces
# the failure report promised in the header.
for required_var in IMPORT_ID IMPORT_USERNAME IMPORT_BACKUP_FILE; do
  [[ -n "${!required_var:-}" ]] || die "$required_var env var is required"
done
CLAMAV_REFRESH="${CLAMAV_REFRESH:-true}"

log "import_id=$IMPORT_ID username=$IMPORT_USERNAME backup=$IMPORT_BACKUP_FILE"

if [[ ! -f "$IMPORT_BACKUP_FILE" ]]; then
  die "backup file does not exist or is not a regular file: $IMPORT_BACKUP_FILE"
fi

# Make sure the output dirs exist (they're bind mounts, so we trust the
# host to have created them, but mkdir -p is harmless).
QUARANTINE_DIR="/host/quarantine/$IMPORT_ID"
SANITIZED_DIR="/host/sanitized/$IMPORT_ID"
mkdir -p "$QUARANTINE_DIR" "$SANITIZED_DIR" \
  || die "cannot create quarantine/sanitized output dirs (are the bind mounts RW?)"

# Container-internal scratch space (mounted as tmpfs by the panel).
EXTRACT_DIR="/tmp/extract"
WORK_DIR="/tmp/sanitized"
mkdir -p "$EXTRACT_DIR" "$WORK_DIR/mysql" \
  || die "cannot create scratch dirs under /tmp (is the tmpfs mounted and writable?)"

# --- refresh ClamAV signatures --------------------------------------------

STAGE="freshclam"
if [[ "$CLAMAV_REFRESH" == "true" ]]; then
  log "refreshing ClamAV signatures (freshclam)"
  # freshclam is allowed to fail (e.g., container has no outbound net);
  # we proceed with the baseline rules from build time + log a warning.
  if ! freshclam --no-warnings >/tmp/freshclam.log 2>&1; then
    log "WARN: freshclam failed; proceeding with build-time signature DB"
    tail -20 /tmp/freshclam.log || true
  fi
else
  log "CLAMAV_REFRESH=false; skipping freshclam"
fi

# --- extract the cpmove ----------------------------------------------------

STAGE="extract"
log "stage: extract"
if ! /scripts/extract.sh "$IMPORT_BACKUP_FILE" "$EXTRACT_DIR" "$IMPORT_USERNAME"; then
  die "extract.sh failed; see stderr above"
fi

# --- ClamAV scan + auto-clean/quarantine ----------------------------------

STAGE="scan_files"
log "stage: scan_files"
php /scripts/scan-files.php \
  --extract "$EXTRACT_DIR" \
  --quarantine "$QUARANTINE_DIR" \
  --report /tmp/scan-files-report.json \
  --import-id "$IMPORT_ID" \
  || die "scan-files.php failed; see stderr above"

# --- DB engine swap + WP content scan -------------------------------------

STAGE="scan_dbs"
log "stage: scan_dbs"
php /scripts/scan-dbs.php \
  --extract "$EXTRACT_DIR" \
  --out "$WORK_DIR/mysql" \
  --final-prefix "$SANITIZED_DIR/mysql" \
  --report /tmp/scan-dbs-report.json \
  --import-id "$IMPORT_ID" \
  --username "$IMPORT_USERNAME" \
  || die "scan-dbs.php failed; see stderr above"

# --- rsync cleaned tree to /host/sanitized --------------------------------

STAGE="rsync_out"
log "stage: rsync_out"
# Copy the (now-cleaned) extracted tree to the sanitized output. Files that
# scan-files.php quarantined need no exclude list — they are NOT present in
# the extract dir anymore (the scanner moved them), so this is the cleaned
# tree by construction.
rsync -a --no-owner --no-group --no-perms --chmod=Du=rwx,Dg=rx,Do=,Fu=rw,Fg=r,Fo= \
  "$EXTRACT_DIR"/ "$SANITIZED_DIR/extracted/" \
  || die "rsync to sanitized dir failed"

# Then drop the cleaned .sql files in place too.
rsync -a --no-owner --no-group --no-perms --chmod=Du=rwx,Dg=rx,Do=,Fu=rw,Fg=r,Fo= \
  "$WORK_DIR/mysql"/ "$SANITIZED_DIR/mysql/" \
  || die "rsync of cleaned .sql files failed"

# --- merge per-stage reports into the final report.json -------------------

STAGE="write_report"
log "stage: write_report"
DURATION=$(( $(date -u +%s) - START_TS ))
# NOTE: the PHP program below sits inside a single-quoted shell string, so
# it must not contain a single-quote character anywhere (comments included).
# "--" stops php from treating an argument that begins with "-" as an option.
php -r '
// argv: 1=import id, 2=duration in seconds, 3=scan-files report path,
//       4=scan-dbs report path, 5=output path for the merged report.
$importId  = $argv[1];
$duration  = (int) $argv[2];
$filesPath = $argv[3];
$dbsPath   = $argv[4];
$outPath   = $argv[5];

// Fail closed: both scanners write their report whenever they succeed, so
// a missing or undecodable report must never be merged as "zero hits".
$load = function (string $path, string $label): array {
    $raw  = is_file($path) ? file_get_contents($path) : false;
    $data = is_string($raw) ? json_decode($raw, true) : null;
    if (!is_array($data)) {
        fprintf(STDERR, "report merge: %s report missing or not valid JSON: %s\n", $label, $path);
        exit(1);
    }
    return $data;
};

$files = $load($filesPath, "scan-files");
$dbs   = $load($dbsPath, "scan-dbs");

$filesScanned     = $files["files_scanned"] ?? 0;
$filesClean       = $files["files_clean"] ?? 0;
$filesCleaned     = $files["files_cleaned"] ?? 0;
$filesQuarantined = $files["files_quarantined"] ?? 0;
$actions          = $files["actions"] ?? [];
$databases        = $dbs["databases"] ?? [];

// A database counts as refused when the DB scanner marked it as not imported.
$dbRefused = 0;
foreach ($databases as $db) {
    if (($db["imported_into_new_server"] ?? true) === false) $dbRefused++;
}

$severity = "info";
$alert    = false;
$msg      = "Sanitization clean: no malware signatures detected.";
if ($filesQuarantined > 0 || $dbRefused > 0) {
    $alert    = true;
    $severity = ($filesQuarantined > 50 || $dbRefused > 0) ? "warning" : "info";
    $msg = sprintf(
        "%d files quarantined + %d cleaned in place; %d database(s) refused as compromised. Customer site may have been compromised at the source — recommend review.",
        $filesQuarantined, $filesCleaned, $dbRefused
    );
}

$report = [
    "import_id"             => $importId,
    "status"                => "completed",
    "scan_duration_seconds" => $duration,
    "files_scanned"         => $filesScanned,
    "files_clean"           => $filesClean,
    "files_cleaned"         => $filesCleaned,
    "files_quarantined"     => $filesQuarantined,
    "actions"               => $actions,
    "databases"             => $databases,
    "summary_for_panel"     => [
        "show_alert"     => $alert,
        "alert_severity" => $severity,
        "alert_message"  => $msg,
    ],
];

// Paths inside a customer tarball are not guaranteed to be valid UTF-8;
// substitute instead of letting json_encode fail and emit an empty report.
$json = json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE);
if ($json === false || file_put_contents($outPath, $json . "\n") === false) {
    fprintf(STDERR, "report merge: cannot encode or write %s\n", $outPath);
    exit(1);
}
fprintf(STDERR, "report written: %s\n", $outPath);
' -- "$IMPORT_ID" "$DURATION" /tmp/scan-files-report.json /tmp/scan-dbs-report.json "$SANITIZED_DIR/report.json" \
  || die "report merge failed"

log "done — exited cleanly after ${DURATION}s"
exit 0

# diff --git a/scripts/extract.sh b/scripts/extract.sh
# new file mode 100755
# index 0000000..a3ba98c
# --- /dev/null
# +++ b/scripts/extract.sh
# @@ -0,0 +1,64 @@
#!/usr/bin/env bash
#
# extract.sh — pre-extract symlink scan + cpmove untar.
#
# Usage: extract.sh <tarball> <dest-dir> <username>
#
# Calls scripts/lib/scan-symlinks.php first; if it reports any DANGEROUS
# findings we abort BEFORE tar runs (per spec §0 step 2). On clean,
# extracts with the same hardening flags CpanelBackupImporter::extractBackup
# uses on the panel today (see web-files/libs/CpanelBackupImporter.php).

set -euo pipefail

TARBALL="${1:?usage: extract.sh <tarball> <dest-dir> <username>}"
DEST="${2:?usage: extract.sh <tarball> <dest-dir> <username>}"
USERNAME="${3:?usage: extract.sh <tarball> <dest-dir> <username>}"

# ts: current UTC time as an ISO-8601 timestamp.
ts() { date -u +'%Y-%m-%dT%H:%M:%SZ'; }
# log: one timestamped, "extract:"-tagged line on stdout.
log() { printf '[%s] extract: %s\n' "$(ts)" "$*"; }

[[ -f "$TARBALL" ]] || { log "tarball not found: $TARBALL"; exit 2; }
# The extraction step changes directory into DEST before running tar, so a
# relative tarball path has to be made absolute while it still resolves.
[[ "$TARBALL" == /* ]] || TARBALL="$PWD/$TARBALL"
mkdir -p "$DEST"

# --- pre-extract symlink scan ---------------------------------------------

log "scanning tarball for dangerous symlinks (cpmove vector check)"
SYMLINK_REPORT=$(mktemp -p /tmp scan-symlinks.XXXXXX.json)
# The report is only relayed to stderr below; do not leave it behind.
trap 'rm -f -- "$SYMLINK_REPORT"' EXIT

# scan-symlinks.php exits 1 for DANGEROUS findings and 2 when it could not
# scan at all (usage error, unreadable tarball, tar failed to spawn).
# Either way we must NOT extract, but the log should say which one it was.
scan_rc=0
php /scripts/lib/scan-symlinks.php \
  --tarball "$TARBALL" \
  --username "$USERNAME" \
  --report "$SYMLINK_REPORT" || scan_rc=$?

if (( scan_rc != 0 )); then
  log "scan-symlinks.php exited non-zero"
  # Relay the JSON report on stderr so it lands next to the abort message
  # in the container log.
  cat "$SYMLINK_REPORT" >&2 || true
  if (( scan_rc == 1 )); then
    log "ABORT: tarball contains dangerous symlinks; aborting"
  else
    log "ABORT: symlink scan could not be completed (exit $scan_rc); refusing to extract an unscanned tarball"
  fi
  exit 3
fi

log "symlink scan clean (no DANGEROUS findings)"

# --- extract --------------------------------------------------------------

# Detect compression. cpmove can be .tar.gz / .tar.bz2 / .tar.xz / .tar.
# TAR_FLAGS stays a plain string: the tar invocation that follows expands
# it unquoted as a single flag word.
TAR_FLAGS="-xf"
case "$TARBALL" in
  *.tar.gz|*.tgz)   TAR_FLAGS="-xzf" ;;
  *.tar.bz2|*.tbz2) TAR_FLAGS="-xjf" ;;
  *.tar.xz|*.txz)   TAR_FLAGS="-xJf" ;;
  *.tar)            TAR_FLAGS="-xf" ;;
esac

log "extracting with hardened tar flags into $DEST"
# Hardening flags (mirrored from CpanelBackupImporter::extractBackup):
#   --no-same-owner / --no-same-permissions: drop archive-recorded
#     uid/perm bits so the cpmove can't drop setuid binaries at us.
#   --no-overwrite-dir: refuse to clobber existing directory metadata,
#     closing one historical tar-symlink-escape vector.
#   --absolute-names is NOT used — leading / in a member name is stripped.
+cd "$DEST" +tar --no-same-owner --no-same-permissions --no-overwrite-dir $TAR_FLAGS "$TARBALL" + +log "extracted OK ($(find "$DEST" -type f | wc -l) files)" +exit 0 diff --git a/scripts/lib/safety-net.php b/scripts/lib/safety-net.php new file mode 100644 index 0000000..350657a --- /dev/null +++ b/scripts/lib/safety-net.php @@ -0,0 +1,46 @@ + --report [--username ]\n"); + exit(2); +} +$tarPath = $opts['tarball']; +$reportPath = $opts['report']; +$username = $opts['username'] ?? ''; + +if (!is_file($tarPath) || !is_readable($tarPath)) { + fwrite(STDERR, "scan-symlinks: not a readable file: $tarPath\n"); + exit(2); +} + +// Same prefix list as the panel. +$dangerousPrefixes = [ + '/etc', '/usr', '/bin', '/sbin', '/lib', '/lib64', + '/boot', '/root', + '/var/lib', '/var/log', '/var/cache', '/var/spool', +]; + +$findings = []; +$cpanelUsername = null; + +$cmd = 'tar -tvf ' . escapeshellarg($tarPath) . ' 2>/dev/null'; +$fh = @popen($cmd, 'r'); +if (!$fh) { + fwrite(STDERR, "scan-symlinks: failed to spawn tar -tvf on $tarPath\n"); + exit(2); +} + +while (($line = fgets($fh)) !== false) { + if ($line === '' || $line[0] !== 'l') continue; + $arrow = strpos($line, ' -> '); + if ($arrow === false) continue; + $left = substr($line, 0, $arrow); + $right = rtrim(substr($line, $arrow + 4), "\r\n"); + $parts = preg_split('/\s+/', $left, 6); + if (count($parts) < 6) continue; + $archivePath = $parts[5]; + $target = $right; + + if ($target === '' || $target[0] !== '/') continue; + + if ($cpanelUsername === null) { + if (preg_match('#^cpmove-([^/]+)/#', $archivePath, $m)) { + $cpanelUsername = $m[1]; + } + } + + // (1) user-internal — accept symlinks pointing into the customer's + // own /home// tree. The panel rewrites these on extract. + $userInternal = false; + $usernames = []; + if ($cpanelUsername !== null && $cpanelUsername !== '') $usernames[] = $cpanelUsername; + if ($username !== '') $usernames[] = $username; + foreach ($usernames as $u) { + $prefix = '/home/' . $u . 
'/'; + if (strpos($target, $prefix) === 0 || $target === rtrim($prefix, '/')) { + $userInternal = true; + break; + } + if (preg_match('#^/home\d+/' . preg_quote($u, '#') . '(/|$)#', $target)) { + $userInternal = true; + break; + } + } + if ($userInternal) continue; + + // (2) exact root. + $type = null; + $reason = ''; + if ($target === '/') { + $type = 'DANGEROUS'; + $reason = 'absolute target is root /'; + } else { + // (3) — in container, every dangerous-prefix target is treated + // as DANGEROUS without a file_exists() check (see security note + // at top of file). + foreach ($dangerousPrefixes as $p) { + if ($target === $p || strpos($target, $p . '/') === 0) { + $type = 'DANGEROUS'; + $reason = "absolute target resolves under system path $p"; + break; + } + } + if ($type === null) { + // Target is absolute, not user-internal, not under a known + // dangerous prefix. Operators want to know about these. + $type = 'UNCERTAIN'; + $reason = 'absolute target outside user tree and not on dangerous-prefix list'; + } + } + + $findings[] = [ + 'type' => $type, + 'archive_path' => $archivePath, + 'target' => $target, + 'reason' => $reason, + ]; +} +pclose($fh); + +$dangerousCount = count(array_filter($findings, fn($f) => $f['type'] === 'DANGEROUS')); +$uncertainCount = count(array_filter($findings, fn($f) => $f['type'] === 'UNCERTAIN')); + +$report = [ + 'tarball' => $tarPath, + 'total_findings' => count($findings), + 'dangerous_count' => $dangerousCount, + 'uncertain_count' => $uncertainCount, + 'findings' => $findings, +]; + +@file_put_contents($reportPath, json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . 
"\n"); + +if ($dangerousCount > 0) { + fwrite(STDERR, "scan-symlinks: $dangerousCount DANGEROUS finding(s); refusing tarball\n"); + foreach ($findings as $f) { + if ($f['type'] === 'DANGEROUS') { + fwrite(STDERR, sprintf(" %s -> %s (%s)\n", $f['archive_path'], $f['target'], $f['reason'])); + } + } + exit(1); +} + +fwrite(STDERR, "scan-symlinks: clean (uncertain=$uncertainCount, dangerous=0)\n"); +exit(0); diff --git a/scripts/scan-dbs.php b/scripts/scan-dbs.php new file mode 100755 index 0000000..2190694 --- /dev/null +++ b/scripts/scan-dbs.php @@ -0,0 +1,399 @@ + ENGINE=InnoDB. + * - WordPress identification: presence of wp_options/wp_posts/wp_users + * CREATE TABLEs (or prefix-variants where prefix != "wp_"). + * - WP content scan: ONE check — siteurl_external_domain — comparing + * wp_options.siteurl / wp_options.home against the cpanel userdata's + * main_domain + addon-domain list. + * - If any high-confidence flag fires, the .sql file is written with + * a .flagged suffix and imported_into_new_server=false. + * - Otherwise the rewritten .sql lands in /tmp/sanitized/mysql/. + * + * v1.1 will grow the WP scan check set (post_content script-injection, + * user_pass leaked-hash, Wordfence regex). See CONTRIBUTING.md for how + * to add a check. + * + * Usage: + * scan-dbs.php --extract DIR --out DIR --report OUT.json + * --import-id ID --username USER + * + * Exit codes: + * 0 on success (regardless of flags); 1 fatal; 2 usage. + * + * NOTE: docblock above must not contain the literal sequence "* /" + * (without the space) anywhere — PHP closes the C-style comment at + * that token and parses the rest as code. This bit us once on + * the cpmove-USER /mysql glob path. + */ + +require __DIR__ . 
'/lib/safety-net.php'; + +const SCANNER_VERSION = '1.0.0'; + +$opts = getopt('', ['extract:', 'out:', 'report:', 'import-id:', 'username:', 'final-prefix:']); +foreach (['extract', 'out', 'report', 'import-id', 'username'] as $k) { + if (!isset($opts[$k])) { + fwrite(STDERR, "usage: scan-dbs.php --extract DIR --out DIR --report OUT.json --import-id ID --username USER [--final-prefix PATH]\n"); + exit(2); + } +} +$extractDir = rtrim($opts['extract'], '/'); +$outDir = rtrim($opts['out'], '/'); +$reportPath = $opts['report']; +$importId = $opts['import-id']; +$username = $opts['username']; +// --final-prefix is the path .sql files will live at AFTER the rsync to +// /host/sanitized//mysql/. We record that path in the report +// so the panel doesn't have to translate /tmp/... paths. +$finalPrefix = isset($opts['final-prefix']) ? rtrim($opts['final-prefix'], '/') : $outDir; + +@mkdir($outDir, 0750, true); + +fwrite(STDERR, "scan-dbs: starting (extract=$extractDir, out=$outDir)\n"); + +// -- find all cpmove-*/mysql/*.sql dumps ----------------------------------- + +$sqlFiles = []; +foreach (glob($extractDir . '/cpmove-*/mysql/*.sql') ?: [] as $f) { + if (is_file($f)) $sqlFiles[] = $f; +} +// Some cpmove layouts use cpmove-/mysql/.create + .sql; +// glob above already covers .sql which is what we care about. + +if (empty($sqlFiles)) { + fwrite(STDERR, "scan-dbs: no .sql dumps found under $extractDir/cpmove-*/mysql/\n"); +} + +// -- discover the user's allowed-domain list from the cpmove userdata ----- + +$allowedDomains = collect_allowed_domains($extractDir, $username); +fwrite(STDERR, "scan-dbs: allowed domains for siteurl check: " + . (empty($allowedDomains) ? '(none discovered)' : implode(', ', $allowedDomains)) + . 
"\n"); + +$databases = []; + +foreach ($sqlFiles as $sqlPath) { + $dbName = basename($sqlPath, '.sql'); + fwrite(STDERR, "scan-dbs: processing $dbName ($sqlPath)\n"); + + $sizeBytes = filesize($sqlPath) ?: 0; + $sql = file_get_contents($sqlPath); + if ($sql === false) { + fwrite(STDERR, "scan-dbs: WARN failed to read $sqlPath; skipping\n"); + continue; + } + + // --- ENGINE SWAP (always applied) ------------------------------------- + + [$rewritten, $engineCounts] = engine_swap($sql); + + // --- WordPress identification + content scan ------------------------- + + $isWp = is_wordpress_dump($rewritten); + $flags = []; + if ($isWp) { + $flags = wp_content_scan($rewritten, $allowedDomains); + } + + $highConfidence = array_filter($flags, fn($f) => ($f['severity'] ?? '') === 'high'); + $refused = (bool) count($highConfidence); + + $outName = $dbName . '.sql' . ($refused ? '.flagged' : ''); + $outPath = $outDir . '/' . $outName; + $finalPath = $finalPrefix . '/' . $outName; + file_put_contents($outPath, $rewritten); + + $databases[] = [ + 'dbname' => $dbName, + 'size_bytes'=> $sizeBytes, + 'engine_changes' => [ + 'myisam_to_innodb' => $engineCounts['myisam_to_innodb'], + 'row_format_dynamic_applied' => 0, // v1.1 + 'fulltext_indexes_dropped' => 0, // v1.1 + ], + 'wp_content_scan' => [ + 'is_wordpress' => $isWp, + 'flags' => $flags, + ], + 'imported_into_new_server' => !$refused, + 'sanitized_sql_path' => $refused ? null : $finalPath, + 'flagged_sql_path' => $refused ? $finalPath : null, + ]; +} + +$report = [ + 'scanner_version' => SCANNER_VERSION, + 'import_id' => $importId, + 'databases' => $databases, +]; +file_put_contents($reportPath, json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . "\n"); + +fwrite(STDERR, "scan-dbs: done — " . count($databases) . 
" database(s) processed\n"); +exit(0); + +// ---- helpers -------------------------------------------------------------- + +/** + * Rewrite ENGINE=MyISAM to ENGINE=InnoDB everywhere it appears as a + * table-options token. Returns [string $newSql, array $counts]. + * + * The regex is intentionally narrow: + * - case-insensitive (cpmove dumps vary) + * - anchored on word boundaries so we don't rewrite, say, + * a TEXT field that contains the literal string "ENGINE=MyISAM" + * (extremely unlikely but possible) + */ +function engine_swap(string $sql): array { + $count = 0; + $rewritten = preg_replace_callback( + '/\bENGINE\s*=\s*MyISAM\b/i', + function () use (&$count) { $count++; return 'ENGINE=InnoDB'; }, + $sql + ); + return [$rewritten ?? $sql, ['myisam_to_innodb' => $count]]; +} + +/** + * Identify WordPress by the canonical core-table CREATE statements. + * + * cPanel exports respect the customer's prefix, so we accept any + * prefix as long as the three core tables exist in this dump. + */ +function is_wordpress_dump(string $sql): bool { + $hasOptions = (bool) preg_match('/CREATE TABLE [`"]?\w*options[`"]?\s*\(/i', $sql); + $hasPosts = (bool) preg_match('/CREATE TABLE [`"]?\w*posts[`"]?\s*\(/i', $sql); + $hasUsers = (bool) preg_match('/CREATE TABLE [`"]?\w*users[`"]?\s*\(/i', $sql); + // Bonus signal: the dump also references the standard wp_options + // option_names. Cheap to check, drops a few false positives where + // an app shares table names with WP. + $optionsSentinel = (bool) preg_match("/'siteurl'|'home'|'template'|'stylesheet'/", $sql); + return $hasOptions && $hasPosts && $hasUsers && $optionsSentinel; +} + +/** + * Run the WP content scan. v1.0 ships ONE check: + * + * siteurl_external_domain — wp_options.siteurl or .home points at a + * host not in the allow list (cpanel main + addons). + * + * Returns an array of flag dicts; an empty array means "clean." 
/**
 * Run the WP content scan. v1.0 ships ONE check:
 *
 *   siteurl_external_domain — wp_options.siteurl or .home points at a
 *   host not in the allow list (cpanel main + addons).
 *
 * v1.1 add: post_content script-injection signature, theme/stylesheet
 * known-malware patterns, user_pass leaked-hash check, Wordfence regex.
 *
 * @param string   $sql            Full text of one SQL dump.
 * @param string[] $allowedDomains Domains the account owns (may be empty).
 * @return array[] Flag dicts with keys severity, code, details; an empty
 *                 array means "clean."
 */
function wp_content_scan(string $sql, array $allowedDomains): array {
    $flags = [];

    // Pull every (option_name, option_value) row from any INSERT INTO
    // <prefix>options. Best-effort: see extract_wp_options().
    $optionValues = extract_wp_options($sql);

    foreach (['siteurl', 'home'] as $optName) {
        if (!isset($optionValues[$optName])) continue;
        $val  = $optionValues[$optName];
        $host = parse_url($val, PHP_URL_HOST);
        // No parseable host (relative or malformed URL): nothing to compare.
        if ($host === null || $host === false || $host === '') continue;

        // localhost / IP literals are not external domains; let the
        // panel handle them on the rewrite-wp-config pass.
        if ($host === 'localhost' || filter_var($host, FILTER_VALIDATE_IP)) continue;

        if (!domain_in_allowlist($host, $allowedDomains)) {
            $flags[] = [
                'severity' => 'high',
                'code'     => 'siteurl_external_domain',
                'details'  => sprintf(
                    "wp_options.%s = %s — host '%s' not in allowed domain list (%s)",
                    $optName,
                    json_encode($val),
                    $host,
                    empty($allowedDomains) ? 'NONE; could not discover from cpmove userdata' : implode(', ', $allowedDomains)
                ),
            ];
        }
    }

    return $flags;
}

/**
 * Pull a map of option_name => option_value from any INSERT into the
 * options table. Returns ['siteurl' => '...', 'home' => '...', ...].
 * When a name appears more than once the last occurrence wins.
 *
 * Best-effort — multi-row INSERTs with weird quoting (for example a
 * value that itself contains "),(") can defeat the row splitter, in which
 * case we report no/partial values and the scan returns clean. That is
 * accepted because the panel will still rewrite siteurl on its own pass
 * and any malicious siteurl that survives WILL show up in the
 * customer-facing rendered URL — admin can spot it post-import.
 *
 * A PCRE engine failure (e.g. backtrack limit on a huge dump) is NOT
 * silent: it is logged to stderr so an operator can tell "no options
 * rows" apart from "could not scan".
 *
 * @param string $sql Full text of one SQL dump.
 * @return array<string, string>
 */
function extract_wp_options(string $sql): array {
    $map = [];

    // Match INSERT INTO `..options` [(col, col, ...)] VALUES (rows...);
    // The optional column list is matched as one parenthesised group
    // (it contains "value" itself, so VALUES cannot serve as a delimiter
    // character class).
    $matched = preg_match_all(
        '/INSERT\s+INTO\s+[`"]?\w*options[`"]?\s*(?:\([^)]*\)\s*)?VALUES\s*(.+?);\s*$/ims',
        $sql,
        $stmts
    );
    if ($matched === false) {
        fwrite(STDERR, 'scan-dbs: WARN options-table regex failed ('
            . preg_last_error_msg() . "); siteurl check skipped for this dump\n");
        return $map;
    }
    if ($matched === 0) {
        return $map;
    }

    foreach ($stmts[1] as $body) {
        // Split on `),(` between rows; first row has the leading `(`,
        // last row has the trailing `)` — both stripped here.
        $body = trim($body);
        $body = preg_replace('/^\(/', '', $body) ?? $body;
        $body = preg_replace('/\)$/', '', $body) ?? $body;
        $rows = preg_split('/\)\s*,\s*\(/', $body) ?: [];
        foreach ($rows as $row) {
            $cells = parse_sql_row($row);
            // wp_options columns: option_id, option_name, option_value, autoload
            if (count($cells) >= 3) {
                $name  = $cells[1];
                $value = $cells[2];
                if (is_string($name) && is_string($value) && $name !== '') {
                    $map[$name] = $value;
                }
            }
        }
    }

    return $map;
}

/**
 * Parse one row of a MySQL INSERT VALUES tuple — comma-separated,
 * strings single-quoted with backslash escapes.
 *
 * Decoded escapes: \n \t \r \0 \b (backspace) \Z (0x1A, which mysqldump
 * emits for that byte); any other escaped character stands for itself
 * (covers \\, \' and \"). A doubled '' inside a string is one quote.
 *
 * Not bulletproof (no comment handling, no DOUBLE-QUOTE strings) but
 * good enough for cpmove dumps, which mysqldump produces in a
 * predictable format.
 *
 * @param string $row Row text without the surrounding parentheses.
 * @return array<int, string|null> One entry per cell; a bare NULL becomes
 *                                 null, other barewords stay strings.
 */
function parse_sql_row(string $row): array {
    $cells = [];
    $i = 0;
    $n = strlen($row);
    while ($i < $n) {
        // Skip leading whitespace + commas.
        while ($i < $n && (ctype_space($row[$i]) || $row[$i] === ',')) $i++;
        if ($i >= $n) break;

        if ($row[$i] === "'") {
            // Quoted string: consume up to the closing, un-doubled quote.
            $i++;
            $buf = '';
            while ($i < $n) {
                $cc = $row[$i];
                if ($cc === '\\' && $i + 1 < $n) {
                    $next = $row[$i + 1];
                    $buf .= match ($next) {
                        'n' => "\n",
                        't' => "\t",
                        'r' => "\r",
                        '0' => "\0",
                        'b' => "\x08",
                        'Z' => "\x1a",
                        default => $next,
                    };
                    $i += 2;
                    continue;
                }
                if ($cc === "'") {
                    // MySQL `''` -> literal '
                    if ($i + 1 < $n && $row[$i + 1] === "'") {
                        $buf .= "'";
                        $i += 2;
                        continue;
                    }
                    $i++;
                    break;
                }
                $buf .= $cc;
                $i++;
            }
            $cells[] = $buf;
        } else {
            // Bareword / number / NULL — read until next comma.
            $start = $i;
            while ($i < $n && $row[$i] !== ',') $i++;
            $tok = trim(substr($row, $start, $i - $start));
            $cells[] = (strcasecmp($tok, 'NULL') === 0) ? null : $tok;
        }
    }
    return $cells;
}

/**
 * Discover the user's allowed-domain set by reading the cpmove
 * userdata. Sources actually read:
 *   cpmove-<user>/userdata/<domain>  — entry NAMES that look like a domain
 *   cpmove-<user>/userdata/main      — the "main_domain:" line
 *   cpmove-<user>/addons             — leading domain of each line
 *
 * The sds (subdomain) list is not read; subdomains of a listed domain
 * are already accepted by domain_in_allowlist()'s suffix match.
 *
 * Best-effort. If we can't find any, the siteurl check still runs but
 * will flag everything as external — surface up to admin.
 *
 * @param string $extractDir Root of the extracted tarball.
 * @param string $username   Currently unused; kept for the call signature.
 * @return string[] Unique lower-cased domains.
 */
function collect_allowed_domains(string $extractDir, string $username): array {
    $domains = [];

    // glob() returns false on error; treat that as "nothing found".
    foreach (glob($extractDir . '/cpmove-*/userdata') ?: [] as $userdataDir) {
        if (!is_dir($userdataDir)) continue;
        foreach (scandir($userdataDir) ?: [] as $entry) {
            if ($entry === '.' || $entry === '..' || $entry === 'main') continue;
            // userdata/<domain> is a file or dir keyed by the domain.
            if (preg_match('/^[a-z0-9._-]+\.[a-z]{2,}$/i', $entry)) {
                $domains[] = strtolower($entry);
            }
        }
        // userdata/main is a YAML-ish file with main_domain: <domain>
        $mainFile = $userdataDir . '/main';
        if (is_file($mainFile)) {
            $content = file_get_contents($mainFile);
            if ($content !== false && preg_match('/^main_domain:\s*(\S+)/m', $content, $m)) {
                // Tolerate a quoted YAML scalar.
                $main = strtolower(trim($m[1], "\"'"));
                if ($main !== '') $domains[] = $main;
            }
        }
    }

    foreach (glob($extractDir . '/cpmove-*/addons') ?: [] as $addonsFile) {
        if (!is_file($addonsFile)) continue;
        $content = file_get_contents($addonsFile);
        if ($content === false) continue;
        // cPanel writes "addon.tld=parent.tld" lines.
        foreach (preg_split('/\R/', $content) ?: [] as $line) {
            if (preg_match('/^([a-z0-9._-]+\.[a-z]{2,})/i', $line, $m)) {
                $domains[] = strtolower($m[1]);
            }
        }
    }

    return array_values(array_unique($domains));
}

/**
 * True if $host is in the allow-list, including subdomain matches.
 * Comparison is case-insensitive and ignores the trailing dot of a
 * fully-qualified host name.
 *
 * e.g. allowed=['example.com'], host='www.example.com'  -> true.
 *      allowed=['example.com'], host='example.com.'     -> true.
 *      allowed=['example.com'], host='notexample.com'   -> false.
 *      allowed=['example.com'], host='malicious.tld'    -> false.
 *      allowed=[],              host=anything           -> false (refuse-all).
 */
function domain_in_allowlist(string $host, array $allowed): bool {
    if (empty($allowed)) return false;
    $host = rtrim(strtolower($host), '.');
    if ($host === '') return false;
    foreach ($allowed as $d) {
        $d = rtrim(strtolower($d), '.');
        if ($d === '') continue;
        if ($host === $d) return true;
        if (str_ends_with($host, '.' . $d)) return true;
    }
    return false;
}

// diff --git a/scripts/scan-files.php b/scripts/scan-files.php
// new file mode 100755
// index 0000000..6b96d4f
// --- /dev/null
// +++ b/scripts/scan-files.php
// @@ -0,0 +1,216 @@
// (surviving text of the scan-files.php header docblock)
//   ... --quarantine --report --import-id
//
//   Exit codes:
//     0 — scan completed (regardless of how many hits)
//     1 — fatal scanner error (clamscan binary missing, signature DB unreadable)
//     2 — usage error
//
//   Report shape: matches spec §3, e.g.:
//     {
//       "files_scanned": N,
//       "files_clean": N,
//       "files_cleaned": 0,      // always 0 in v1.0 — no cleaners yet
//       "files_quarantined": N,
//       "actions": [ { path, signature, action, cleaner, backup } ]
//     }
'/lib/safety-net.php'; + +const SCANNER_VERSION = '1.0.0'; + +$opts = getopt('', ['extract:', 'quarantine:', 'report:', 'import-id:']); +foreach (['extract', 'quarantine', 'report', 'import-id'] as $k) { + if (!isset($opts[$k])) { + fwrite(STDERR, "usage: scan-files.php --extract --quarantine --report --import-id \n"); + exit(2); + } +} +$extractDir = rtrim($opts['extract'], '/'); +$quarantineDir = rtrim($opts['quarantine'], '/'); +$reportPath = $opts['report']; +$importId = $opts['import-id']; + +if (!is_dir($extractDir)) { + fwrite(STDERR, "scan-files: extract dir does not exist: $extractDir\n"); + exit(2); +} + +@mkdir($quarantineDir, 0750, true); + +fwrite(STDERR, "scan-files: starting (extract=$extractDir, quarantine=$quarantineDir)\n"); + +// -- v1.0 cleaner registry (intentionally empty) ---------------------------- +// +// Each entry maps a ClamAV signature substring -> classification + +// cleaner callable. v1.0 ships empty so EVERY hit is classified as +// QUARANTINE_ONLY. See CONTRIBUTING.md "Adding an auto-cleaner pattern" +// for how to add a tested entry. +// +// Shape (v1.1+): +// $cleaners = [ +// 'php-eval-base64-prefix' => [ +// 'class' => 'KNOWN_REMOVABLE', +// 'match' => fn(string $sig): bool => str_contains($sig, 'PHP.Trojan.EvalB64'), +// 'clean' => fn(string $path): bool => /* rewrite file in place; return ok */, +// ], +// ]; +$cleaners = []; + +// -- run clamscan recursively over the extract dir -------------------------- + +// We use --infected so the output is only hits, and --recursive so we +// walk subdirectories. We deliberately do NOT use --remove (we never want +// clamscan unlinking files — we control quarantine). 
+// +// Output format per line on a hit: +// /tmp/extract/foo/bar.php: Some.Signature.Name FOUND +$cmd = sprintf( + 'clamscan --infected --recursive --no-summary --stdout %s 2>/dev/null', + escapeshellarg($extractDir) +); + +$fh = popen($cmd, 'r'); +if (!$fh) { + fwrite(STDERR, "scan-files: failed to spawn clamscan\n"); + exit(1); +} + +$hits = []; +while (($line = fgets($fh)) !== false) { + $line = rtrim($line, "\r\n"); + if ($line === '' || !str_ends_with($line, ' FOUND')) continue; + // Strip trailing ' FOUND'. + $body = substr($line, 0, -6); + $colon = strrpos($body, ': '); + if ($colon === false) continue; + $path = substr($body, 0, $colon); + $sig = substr($body, $colon + 2); + if (!str_starts_with($path, $extractDir)) { + // Defensive: shouldn't happen with our invocation. + continue; + } + $hits[] = ['path' => $path, 'signature' => $sig]; +} +pclose($fh); + +// File count — we need files_scanned for the report. clamscan's summary +// counting is suppressed; do a fast file count ourselves. +$filesScanned = 0; +$rdi = new RecursiveDirectoryIterator($extractDir, FilesystemIterator::SKIP_DOTS); +$it = new RecursiveIteratorIterator($rdi); +foreach ($it as $entry) { + /** @var SplFileInfo $entry */ + if ($entry->isFile()) $filesScanned++; +} + +// -- classify + action each hit -------------------------------------------- + +$actions = []; +$cleaned = 0; +$quarantined = 0; + +foreach ($hits as $h) { + $path = $h['path']; + $sig = $h['signature']; + + // v1.0 — every hit is QUARANTINE_ONLY because the cleaner registry + // is empty. Future work in v1.1 will iterate $cleaners and pick a + // matching cleaner. + $classification = 'QUARANTINE_ONLY'; + foreach ($cleaners as $name => $entry) { + if (($entry['match'])($sig)) { + $classification = $entry['class']; + $cleanerName = $name; + break; + } + } + + $relPath = ltrim(substr($path, strlen($extractDir)), '/'); + $qPath = $quarantineDir . '/' . 
$relPath; + + if ($classification === 'QUARANTINE_ONLY') { + // Move the whole file to quarantine; remove from extract dir so + // the rsync to /host/sanitized/ does not include it. + @mkdir(dirname($qPath), 0750, true); + if (!@rename($path, $qPath)) { + // Fall back to copy + unlink (rename across mount boundaries + // sometimes EXDEVs even though /tmp and /host are both ours). + if (@copy($path, $qPath)) { + @unlink($path); + } else { + fwrite(STDERR, "scan-files: WARN failed to quarantine $path -> $qPath\n"); + continue; + } + } + $quarantined++; + $actions[] = [ + 'path' => $relPath, + 'signature' => $sig, + 'action' => 'quarantined', + 'cleaner' => null, + 'backup' => $qPath, + ]; + continue; + } + + // v1.1+ paths: + if ($classification === 'KNOWN_REMOVABLE' || $classification === 'REMOVABLE_WITH_BACKUP') { + // Backup first, then run the cleaner. + @mkdir(dirname($qPath), 0750, true); + $backup = $qPath . '.original'; + if (!@copy($path, $backup)) { + fwrite(STDERR, "scan-files: backup before clean failed: $path; quarantining instead\n"); + @rename($path, $qPath); + $quarantined++; + $actions[] = [ + 'path' => $relPath, 'signature' => $sig, + 'action' => 'quarantined', 'cleaner' => null, 'backup' => $qPath, + ]; + continue; + } + $cleanerOk = ($cleaners[$cleanerName]['clean'])($path); + if (!$cleanerOk) { + // Cleaner refused; fall back to quarantine. 
+ @rename($path, $qPath); + $quarantined++; + $actions[] = [ + 'path' => $relPath, 'signature' => $sig, + 'action' => 'quarantined', 'cleaner' => $cleanerName, 'backup' => $qPath, + ]; + continue; + } + $cleaned++; + $actions[] = [ + 'path' => $relPath, 'signature' => $sig, + 'action' => 'cleaned', 'cleaner' => $cleanerName, 'backup' => $backup, + ]; + } +} + +$report = [ + 'scanner_version' => SCANNER_VERSION, + 'import_id' => $importId, + 'files_scanned' => $filesScanned, + 'files_clean' => max(0, $filesScanned - count($hits)), + 'files_cleaned' => $cleaned, + 'files_quarantined' => $quarantined, + 'actions' => $actions, +]; + +@file_put_contents($reportPath, json_encode($report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . "\n"); + +fwrite(STDERR, sprintf( + "scan-files: done — scanned=%d clean=%d cleaned=%d quarantined=%d\n", + $filesScanned, $report['files_clean'], $cleaned, $quarantined +)); +exit(0); diff --git a/tests/build-fixtures.sh b/tests/build-fixtures.sh new file mode 100755 index 0000000..7914886 --- /dev/null +++ b/tests/build-fixtures.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# +# build-fixtures.sh — generate synthetic cpmove tarballs for testing. +# +# Two fixtures are built: +# +# cpmove-clean.tar.gz — a minimal cpmove with a benign homedir, one +# wp-style SQL dump with ENGINE=MyISAM tables +# and a clean siteurl, and a user-internal +# relative symlink (must not trigger). +# +# cpmove-alfa.tar.gz — same shape PLUS an ALFA-style symlink: +# `cpmove-testuser/homedir/.../alfasymlink -> /etc` +# — the pre-extract scan MUST refuse this. 
+# +# Run: bash tests/build-fixtures.sh +# Output: tests/fixtures/cpmove-clean.tar.gz, tests/fixtures/cpmove-alfa.tar.gz + +set -euo pipefail + +FIXTURES_DIR="$(cd "$(dirname "$0")" && pwd)/fixtures" +mkdir -p "$FIXTURES_DIR" + +USER=testuser +DOMAIN=example.com + +build_common_tree() { + local root="$1" + mkdir -p "$root/cpmove-$USER"/{homedir/public_html,mysql,userdata,addons,sds,ssl} + + # main userdata + cat > "$root/cpmove-$USER/userdata/main" < "$root/cpmove-$USER/userdata/$DOMAIN" < "$root/cpmove-$USER/homedir/public_html/index.php" + echo "Hello world." > "$root/cpmove-$USER/homedir/public_html/about.txt" + + # benign user-internal relative symlink — must NOT trigger the scan + ln -sf "../public_html/about.txt" "$root/cpmove-$USER/homedir/about-shortcut" + + # one synthetic WordPress mysql dump with ENGINE=MyISAM + a clean siteurl + cat > "$root/cpmove-$USER/mysql/${USER}_wp.sql" </dev/null || true' EXIT +build_common_tree "$CLEAN_TMP" + +tar -C "$CLEAN_TMP" -czf "$FIXTURES_DIR/cpmove-clean.tar.gz" "cpmove-$USER" +echo "wrote $FIXTURES_DIR/cpmove-clean.tar.gz ($(stat -c%s "$FIXTURES_DIR/cpmove-clean.tar.gz") bytes)" + +# ---- cpmove-alfa.tar.gz --------------------------------------------------- +# +# Build the SAME tree, then add an ALFA-shell-style symlink pointing at +# /etc. This is the exact vector that wiped whp02 — the importer's +# recursive walker followed the symlink and unlink()'d every file in +# /etc. Our pre-extract scan MUST refuse to extract this tarball. + +ALFA_TMP="$(mktemp -d)" +build_common_tree "$ALFA_TMP" + +mkdir -p "$ALFA_TMP/cpmove-$USER/homedir/public_html/$DOMAIN/ALFA_DATA" +echo "" \ + > "$ALFA_TMP/cpmove-$USER/homedir/public_html/$DOMAIN/ALFA_DATA/index.php" + +# THE attack: absolute-target symlink to /etc. 
+ln -sf "/etc" "$ALFA_TMP/cpmove-$USER/homedir/public_html/$DOMAIN/ALFA_DATA/root" + +tar -C "$ALFA_TMP" -czf "$FIXTURES_DIR/cpmove-alfa.tar.gz" "cpmove-$USER" +echo "wrote $FIXTURES_DIR/cpmove-alfa.tar.gz ($(stat -c%s "$FIXTURES_DIR/cpmove-alfa.tar.gz") bytes)" + +echo "" +echo "fixtures built:" +ls -la "$FIXTURES_DIR" diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep new file mode 100644 index 0000000..e69de29