Files
cpanel-importer/scripts/extract.sh
Claude (bootstrap) 9652a71816
All checks were successful
cpanel-importer Build and Push / Build-and-Push (push) Successful in 54s
extract: skip cpmove-*/homedir/mail tree
WHP does not import cPanel mailbox data (mail-import is a panel-side
roadmap item, not a sandbox-mode step). Extracting + ClamAV-scanning
the mail tree wastes time and disk: on real customer accounts the mail
dir often dwarfs everything else (10+ GB of historical maildir/mbox),
and clamscan has to walk every message.

Appended to the existing tar --exclude-from list (where we already
strip DANGEROUS-classified symlinks) so the existing plumbing covers
both. tar's fnmatch globs handle nested mail subdirs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-31 11:57:32 -07:00

134 lines
5.3 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# extract.sh — symlink scan + sanitized cpmove untar.
#
# Usage: extract.sh <tarball> <dest> <username> [<actions_out>]
#
# Calls scripts/lib/scan-symlinks.php first, then untars the cpmove with
# every DANGEROUS-classified symlink entry stripped via tar --exclude-from.
# The cpmove-*/homedir/mail tree is skipped through the same exclude list.
# The stripped-symlinks list is written as JSON to <actions_out> (default
# $DEST/.cpanel-importer-stripped-symlinks.json) so the merge step in
# entrypoint.sh can fold the stripped entries into report.json's actions[].
#
# Sandbox-mode posture: never refuse. ALFA-class root symlinks and other
# DANGEROUS entries are silently excluded from extraction; the panel sees
# them as quarantine actions on the results page instead of an import abort.
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Usage text that bash prints when a required positional is missing.
USAGE='usage: extract.sh <tarball> <dest> <username> [<actions_out>]'

# Required positionals: ${N:?msg} aborts the script with msg when the
# argument is unset or empty.
TARBALL="${1:?${USAGE}}"
DEST="${2:?${USAGE}}"
USERNAME="${3:?${USAGE}}"

# Optional fourth argument: where the stripped-symlink actions JSON goes.
ACTIONS_OUT="${4:-${DEST}/.cpanel-importer-stripped-symlinks.json}"
# ts — print the current UTC time as YYYY-MM-DDTHH:MM:SSZ.
ts() {
  # %F expands to %Y-%m-%d and %T to %H:%M:%S.
  date -u '+%FT%TZ'
}
# log — write one line to stdout, prefixed with the ts() timestamp and the
# "extract:" tag. All arguments are joined into a single message.
log() {
  local message="$*"
  printf '[%s] extract: %s\n' "$(ts)" "${message}"
}
# Preflight: the tarball must be an existing regular file (exit status 2
# otherwise); the destination directory is created if it is missing.
if [[ ! -f "$TARBALL" ]]; then
  log "tarball not found: $TARBALL"
  exit 2
fi
mkdir -p "$DEST"
# --- pre-extract symlink scan ---------------------------------------------
# Remove this script's temp files (the scan report here, and the tar exclude
# list created further down) on every exit path, including the early aborts.
# ${VAR:+...} expands to nothing while a file has not been created yet, so
# the handler is safe under `set -u`. An EXIT trap does not alter the
# script's exit status.
cleanup_tmpfiles() {
  rm -f -- ${SYMLINK_REPORT:+"$SYMLINK_REPORT"} ${EXCLUDES_FILE:+"$EXCLUDES_FILE"}
}
trap cleanup_tmpfiles EXIT

log "scanning tarball for dangerous symlinks (cpmove vector check)"
SYMLINK_REPORT=$(mktemp -p /tmp scan-symlinks.XXXXXX.json)

# A non-zero exit from the scanner aborts the whole run with status 3.
# Whatever the scanner managed to write to the report is dumped to stderr
# for diagnosis; `|| true` keeps a missing/unreadable report from masking
# the exit code.
if ! php /scripts/lib/scan-symlinks.php \
  --tarball "$TARBALL" \
  --username "$USERNAME" \
  --report "$SYMLINK_REPORT"; then
  log "scan-symlinks.php exited with usage/IO error; aborting (this is not a sanitize-able state)"
  cat "$SYMLINK_REPORT" >&2 || true
  exit 3
fi
# --- compute exclude list from dangerous findings -------------------------
# Build a newline-delimited list of archive_path strings for tar --exclude-
# from. Also build a JSON actions[] array so entrypoint.sh's merge step can
# fold the strip-actions into report.json without re-parsing scan-symlinks.

# write_symlink_excludes <report_json> <excludes_out> <actions_out>
#   Reads the scan report, writes one tar exclude pattern per DANGEROUS
#   finding to <excludes_out>, writes {"actions": [...], "count": N} to
#   <actions_out>, and prints the number of DANGEROUS findings on stdout.
write_symlink_excludes() {
  python3 - "$1" "$2" "$3" <<'PY'
import json, sys

src, excl_path, actions_path = sys.argv[1], sys.argv[2], sys.argv[3]


def escape_glob(path):
    # tar interprets every --exclude-from line as an fnmatch pattern. A
    # member name such as "a[b]" would become a bracket expression that
    # matches "ab" but NOT the member itself, letting the dangerous symlink
    # through. Backslash-escape the glob metacharacters so the line matches
    # the name literally.
    # NOTE(review): backslashes already present in archive_path are left
    # untouched — confirm how scan-symlinks.php represents them.
    return ''.join('\\' + ch if ch in '*?[' else ch for ch in path)


try:
    with open(src) as fh:
        r = json.load(fh)
except Exception as e:
    # NOTE(review): this path is fail-open — an unreadable report yields a
    # count of 0, no exclude entries and no actions file, and extraction
    # still proceeds. Confirm that is the intended sandbox-mode posture.
    sys.stderr.write(f"failed to parse scan-symlinks report: {e}\n")
    print(0)
    sys.exit(0)

dangerous = [f for f in r.get('findings', []) if f.get('type') == 'DANGEROUS']

# Exclude file: one escaped pattern per finding that carries a path.
with open(excl_path, 'w') as eh:
    for f in dangerous:
        p = f.get('archive_path', '')
        if p:
            eh.write(escape_glob(p) + '\n')

# Actions file: raw (unescaped) paths, one record per DANGEROUS finding.
actions = [
    {
        'action': 'stripped_dangerous_symlink',
        'path': f.get('archive_path', ''),
        'target': f.get('target', ''),
        'reason': f.get('reason', ''),
    }
    for f in dangerous
]
with open(actions_path, 'w') as ah:
    json.dump({'actions': actions, 'count': len(actions)}, ah, indent=2)

print(len(dangerous))
PY
}

EXCLUDES_FILE=$(mktemp -p /tmp tar-excludes.XXXXXX)
DANGEROUS_COUNT=$(write_symlink_excludes "$SYMLINK_REPORT" "$EXCLUDES_FILE" "$ACTIONS_OUT")
# When at least one dangerous symlink was found, echo every exclude-list
# entry into the log, one line per entry. The list is read on fd 3 so the
# loop body never touches stdin.
if [[ "$DANGEROUS_COUNT" -gt 0 ]]; then
  log "stripping $DANGEROUS_COUNT dangerous symlink(s) via tar --exclude-from"
  while IFS= read -r stripped_entry <&3; do
    log " STRIP: $stripped_entry"
  done 3< "$EXCLUDES_FILE"
fi
# The cPanel user's mail tree is skipped as well. WHP doesn't import email
# (mail-import is a panel-side roadmap item, not a sandbox-mode step), so
# extracting + scanning mailboxes only costs time and disk: on real
# customer accounts the mail dir often dwarfs everything else (10+ GB of
# historical mbox/maildir). Two tar exclude patterns are appended to the
# same list that carries the symlink strips: one for the mail directory
# entry itself and one for everything beneath it. The skip is logged so it
# shows up in the panel import log alongside the symlink strips.
printf '%s\n' \
  "cpmove-*/homedir/mail" \
  "cpmove-*/homedir/mail/*" >> "$EXCLUDES_FILE"
log "skipping mail tree (cpmove-*/homedir/mail) — WHP does not import cPanel mail"
# --- extract --------------------------------------------------------------
# Choose the tar mode flags from the tarball's file extension: gzip, bzip2,
# xz, or plain tar. Any other name falls back to plain -xf.
case "$TARBALL" in
  *.tar.gz | *.tgz)
    TAR_FLAGS="-xzf"
    ;;
  *.tar.bz2 | *.tbz2)
    TAR_FLAGS="-xjf"
    ;;
  *.tar.xz | *.txz)
    TAR_FLAGS="-xJf"
    ;;
  *)
    TAR_FLAGS="-xf"
    ;;
esac
# extract_sanitized <tarball> <dest> <excludes_file> <tar_mode_flags>
#   Untar <tarball> into <dest> with the hardening flags below.
#
# Hardening flags (mirrored from CpanelBackupImporter::extractBackup):
#   --no-same-owner / --no-same-permissions: drop archive-recorded
#     uid/perm bits so the cpmove can't drop setuid binaries at us.
#   --no-overwrite-dir: refuse to clobber existing directory metadata,
#     closing one historical tar-symlink-escape vector.
#   --exclude-from=<excludes_file>: skip every listed entry (the
#     DANGEROUS-classified symlinks plus the mail tree). An empty file is
#     a no-op. tar matches each line as an fnmatch pattern.
#     NOTE(review): a member name containing glob metacharacters is
#     therefore matched as a pattern, not literally — confirm the list
#     writer escapes them.
#   -C <dest>: extract into <dest> without changing this shell's working
#     directory, so a relative <tarball> or <dest> path keeps resolving
#     against the directory the script was started from (a prior `cd`
#     into <dest> broke relative tarball paths).
#   --absolute-names is NOT used — leading / in a member name is stripped.
extract_sanitized() {
  local tarball=$1
  local dest=$2
  local excludes_file=$3
  local tar_mode=$4

  tar --no-same-owner --no-same-permissions --no-overwrite-dir \
    --exclude-from="$excludes_file" \
    -C "$dest" \
    "$tar_mode" "$tarball"
}

log "extracting with hardened tar flags into $DEST"
extract_sanitized "$TARBALL" "$DEST" "$EXCLUDES_FILE" "$TAR_FLAGS"
log "extracted OK ($(find "$DEST" -type f | wc -l) files; $DANGEROUS_COUNT symlinks stripped)"
# Every step above succeeded; return status 0 to the caller explicitly.
exit 0