hpr-tools/InternetArchive/repair_assets

645 lines
20 KiB
Plaintext
Raw Permalink Normal View History

#!/bin/bash -
#===============================================================================
#
# FILE: repair_assets
#
# USAGE: ./repair_assets showid
#
# DESCRIPTION: Given a show where there was a directory of asset files on the
# old HPR server which got lost in the migration, rebuild it
# and fill it with assets from the IA. Modify the show notes to
# point to these recovered assets.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.8
# CREATED: 2024-05-10 21:26:31
# REVISION: 2024-08-23 11:55:25
#
#===============================================================================
# set -o nounset # Treat unset variables as an error
#
# Script identity: the release number and the name the script was invoked
# under (leading path removed).
#
VERSION='0.0.8'
SCRIPT="${0##*/}"
# DIR="${0%/*}"
#
# Note that, despite its name, STDOUT refers to file descriptor 2. It is the
# destination used by _usage.
#
STDOUT='/dev/fd/2'
#
# The working directory depends on which machine we are running on. Any host
# other than the two known ones is rejected.
#
case "$(hostname)" in
    borg)
        BASEDIR="$HOME/IA" ;;
    i7-desktop)
        BASEDIR="$HOME/HPR/InternetArchive" ;;
    *)
        echo "Wrong host!"
        exit 1
        ;;
esac
#
# Everything below runs from inside BASEDIR
#
if ! cd "$BASEDIR"; then
    echo "Failed to cd to $BASEDIR"
    exit 1
fi
#
# Load library functions. A missing library is a failure, so leave with
# a non-zero status (a bare 'exit' here would report the status of the
# preceding 'echo', i.e. success).
#
LIB="$HOME/HPR/function_lib.sh"
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"
#
# Enable coloured messages ('define_colours' is not defined in this file;
# presumably it comes from the library just sourced)
#
define_colours
#
# Sanity checks. Calling 'command' on a non-existent script/program will
# return an empty string. This works for built-in stuff and things on the
# PATH. These are all 'true' tests with the stuff in braces being run if they
# are not true.
#
# The resulting variables (IA, Q2T, TUNNEL_IS_OPEN, OPEN_TUNNEL) hold whatever
# 'command -v' printed for each tool. FIXAL is different: it is a fixed path
# below BASEDIR and is only tested for existence.
#
IA=$(command -v ia)
[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; }
Q2T=$(command -v query2tt2)
[ -n "$Q2T" ] || { echo "Program 'query2tt2' was not found"; exit 1; }
FIXAL="$BASEDIR/fix_asset_links"
[ -e "$FIXAL" ] || { echo "Program '$FIXAL' was not found"; exit 1; }
TUNNEL_IS_OPEN=$(command -v tunnel_is_open)
[ -n "$TUNNEL_IS_OPEN" ] || { echo "Program 'tunnel_is_open' was not found"; exit 1; }
OPEN_TUNNEL=$(command -v open_tunnel)
[ -n "$OPEN_TUNNEL" ] || { echo "Program 'open_tunnel' was not found"; exit 1; }
#
# Make temporary files and set traps to delete them. If the second file
# cannot be made the first one is removed before leaving, since the trap is
# not in place yet at that point.
#
TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; }
TMP2=$(mktemp) || {
    rm -f -- "$TMP1"
    echo "$SCRIPT: creation of temporary file failed!"
    exit 1
}
#
# The file names are quoted inside the trap so that a temporary directory
# containing whitespace cannot split them into several arguments.
# NOTE(review): 'cleanup_temp' is not defined in this file; whether the script
# stops after a trapped signal depends on whether that function exits - confirm.
#
trap 'cleanup_temp "$TMP1" "$TMP2"' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
# {{{ -- Functions -- _verbose, _usage, _log, find_missing, make_dir
#=== FUNCTION ================================================================
#         NAME: find_missing
#  DESCRIPTION: Given two arrays containing IA assets and HPR assets,
#               determine which IA assets are missing from the HPR list.
#               Files are compared by base name only (the part after the
#               last '/'), but the full IA path is what gets reported. If
#               several IA paths share a base name only the last one seen
#               is considered. The order of the results is not defined.
#   PARAMETERS: $1 (nameref) IA list
#               $2 (nameref) HPR list
#               $3 (nameref) Name of array to receive list of missing
#                  assets; results are appended to it
#      RETURNS: Nothing
#===============================================================================
find_missing () {
    #
    # All three arguments are array *names*. The locals carry a prefix to
    # keep them from colliding with the caller's array names.
    #
    local -n _fm_ia="${1}"
    local -n _fm_hpr="${2}"
    local -n _fm_out="${3}"
    local -A _fm_ia_by_name=() _fm_hpr_names=()
    local _fm_path _fm_key

    #
    # Make a hash keyed by the IA file base names. Iterating over the values
    # (rather than counting indices) also copes with sparse arrays.
    #
    for _fm_path in "${_fm_ia[@]}"; do
        _fm_key="${_fm_path##*/}"
        # A path ending in '/' has no base name and cannot be a hash key
        [[ -n $_fm_key ]] || continue
        _fm_ia_by_name["$_fm_key"]="$_fm_path"
    done

    #
    # Record the base names that are present on the HPR side
    #
    for _fm_path in "${_fm_hpr[@]}"; do
        _fm_key="${_fm_path##*/}"
        [[ -n $_fm_key ]] || continue
        _fm_hpr_names["$_fm_key"]=1
    done

    #
    # Use the basename keys to check what's missing, but return the full path
    # names. The result is appended through the nameref, so no 'eval' is
    # needed and file names containing quotes are stored unchanged.
    #
    for _fm_key in "${!_fm_ia_by_name[@]}"; do
        if [[ -z ${_fm_hpr_names["$_fm_key"]+set} ]]; then
            _fm_out+=("${_fm_ia_by_name["$_fm_key"]}")
        fi
    done
}
#=== FUNCTION ================================================================
#         NAME: make_dir
#  DESCRIPTION: Ensures that a directory is present, creating it (and any
#               missing parents) when it is not. A failure to create it is
#               reported in red and ends the whole script.
#   PARAMETERS: $1 directory path
#      RETURNS: True if the directory exists or was created; otherwise the
#               calling script is terminated with status 1
#===============================================================================
make_dir () {
    local target="${1}"

    # Already there: nothing to do
    [[ -d $target ]] && return 0

    if ! mkdir -p "$target"; then
        coloured 'red' "Failed to create $target"
        exit 1
    fi
}
#=== FUNCTION ================================================================
#         NAME: _verbose
#  DESCRIPTION: Writes a message in verbose mode. Each argument is written
#               on a line of its own on standard output. Nothing is written
#               when the global VERBOSE is 0; an unset or empty VERBOSE is
#               treated as 0.
#   PARAMETERS: * message strings to write
#      RETURNS: Nothing
#===============================================================================
_verbose () {
    # Keep the loop variable private so no global is created or overwritten
    local line

    [[ ${VERBOSE:-0} -eq 0 ]] && return 0

    for line in "$@"; do
        printf '%s\n' "$line"
    done
}
#=== FUNCTION ================================================================
#         NAME: _log
#  DESCRIPTION: Adds one line to the end of the file named by the global
#               LOGFILE. The line consists of the current date and time
#               (YYYY-MM-DD HH:MM:SS), a space, and the message.
#   PARAMETERS: $1 Message to write
#      RETURNS: Nothing
#===============================================================================
_log () {
    local text="${1}"
    local stamp

    stamp=$(date '+%F %T')
    printf '%s %s\n' "$stamp" "$text" >> "$LOGFILE"
}
#=== FUNCTION ================================================================
#         NAME: _usage
#  DESCRIPTION: Writes the help text to the destination named by the global
#               STDOUT and then terminates the script. The text starts with
#               the script name and version taken from the globals SCRIPT
#               and VERSION.
#   PARAMETERS: 1 - the integer to pass to the 'exit' command (default 0)
#      RETURNS: Nothing (never returns)
#===============================================================================
_usage () {
    local -i code=${1:-0}

    #
    # Only the first two lines contain variable parts; the remainder is
    # fixed text and is emitted from a quoted here-document.
    #
    {
        printf '%s - version: %s\n' "${SCRIPT}" "${VERSION}"
        printf 'Usage: ./%s [-h] [-v] [-d {0|1}] [-D] showid\n' "${SCRIPT}"
        cat <<'endusage'
Attempts to repair an show where the directory of assets was not transferred
from the old HPR server.
Options:
-h Print this help
-v Run in verbose mode where more information is
reported. Default is off. If -v is repeated it
increases the verbosity level (levels 1 and 2 only).
-d 0|1 Dry run: -d 1 (the default) runs the script in dry-run
mode where nothing is changed but the actions that
will be taken are reported; -d 0 turns off dry-run
mode and the actions will be carried out.
-D Run in debug mode where a lot more information is
reported
Arguments:
showid The show id in the form 'hpr1234'
endusage
    } >$STDOUT

    exit "$code"
}
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#-------------------------------------------------------------------------------
# Directories and files
#-------------------------------------------------------------------------------
# The log lives in a 'logs' directory below BASEDIR (made on demand) and is
# named after the script.
#
LOGS="${BASEDIR}/logs"
make_dir "$LOGS"
LOGFILE="${LOGS}/${SCRIPT}.log"
#-------------------------------------------------------------------------------
# Options
#-------------------------------------------------------------------------------
# Default settings
#
VERBOSE=0
#
# Process options. The leading ':' in the option string puts getopts into
# silent mode: a missing option argument arrives as ':' and an unknown option
# as '?', each with the offending option letter in OPTARG.
#
while getopts :d:Dhv opt
do
    case "${opt}" in
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        h) _usage 0;;
        v) ((VERBOSE++));;
        :) echo "** Option -$OPTARG requires an argument"
           _usage 1;;
        *) echo "** Unknown option"
           _usage 1;;
    esac
done
shift $((OPTIND - 1))
#
# Set option defaults and check their values
#
DRYRUN=${DRYRUN:-1}
#
# Dry-run is the safety catch, so the value is matched as text. A numeric
# comparison would evaluate something like '-d foo' arithmetically as 0 and
# so switch dry-run mode off by accident.
#
if [[ ! $DRYRUN =~ ^[01]$ ]]; then
    coloured 'red' "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $VERBOSE -gt 0 && $DRYRUN -eq 1 ]] && echo "Dry run mode"
DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"
#-------------------------------------------------------------------------------
# Argument check
#-------------------------------------------------------------------------------
# Should have one argument
#
if [[ $# != 1 ]]; then
    coloured 'red' "Missing argument"
    _usage 1
fi
show="${1,,}"
#
# Ensure show id is correctly formatted. We want it to be 'hpr1234'
#
# The pattern is anchored so that only 'hpr9999' or '9999' is accepted, as the
# error message promises. The digits are forced to base 10 before formatting
# because printf's %d treats a number with a leading zero as octal, which
# would turn 'hpr0123' into 'hpr0083' and reject 'hpr0099' altogether.
#
if [[ $show =~ ^(hpr)?([0-9]+)$ ]]; then
    printf -v show 'hpr%04d' "$((10#${BASH_REMATCH[2]}))"
else
    coloured 'red' "Incorrect show specification: $show"
    coloured 'yellow' "Use 'hpr9999' or '9999' format"
    exit 1
fi
_DEBUG "Parsed item: $show"
echo "Processing show $show"
_log "Processing show $show; dry-run: $([ "$DRYRUN" -eq 1 ] && echo "on" || echo "off")"
#-------------------------------------------------------------------------------
# Declarations and constants
#-------------------------------------------------------------------------------
declare -a iacache
#
# SHOWURL is the address of the show's page on the webserver
#
SHOWURL="https://hackerpublicradio.org/eps/${show}/index.html"
#
# CACHEDIR is the local store for asset details and files; it is made on
# first use
#
CACHEDIR="${BASEDIR}/assets"
if [ ! -d "$CACHEDIR" ]; then
    coloured 'red' "Creating cache directory"
    make_dir "$CACHEDIR"
fi
#
# Locations inside the cache:
# LOCAL_ASSETDIR  - the cache area belonging to this show
# LOCAL_FILEDIR   - where files fetched from the IA are put
# LOCAL_PARENTDIR - the local counterpart of the show's top directory
#
LOCAL_ASSETDIR="${CACHEDIR}/${show}"
LOCAL_FILEDIR="${LOCAL_ASSETDIR}/files"
LOCAL_PARENTDIR="${LOCAL_FILEDIR}/${show}"
#
# Locations on the HPR server (relative paths):
# REMOTE_ASSETDIR  - destination of the assets
# REMOTE_PARENTDIR - the directory above it
#
REMOTE_PARENTDIR="public_html/eps/${show}"
REMOTE_ASSETDIR="public_html/eps/${show}/${show}"
#
# CMDTPL is a printf format; the %s is replaced by the remote command.
# MANIFEST and DBNOTES are files kept in this show's cache area.
#
CMDTPL='ssh hpr@hackerpublicradio.org %s'
MANIFEST="${CACHEDIR}/${show}/manifest"
DBNOTES="${CACHEDIR}/${show}/notes.html"
#-------------------------------------------------------------------------------
# Check the show exists in the database (or is visible on the website).
#-------------------------------------------------------------------------------
# A HEAD request is made for the show page and only the HTTP status code is
# kept. If curl itself fails (DNS, connection, TLS ...) no real status is
# available, so that is reported as an error rather than being taken to mean
# that the show is present.
#
_verbose "Checking the show exists on the HPR server"
if ! result=$(curl --head --silent --write-out "%{http_code}" --output /dev/null "$SHOWURL"); then
    coloured 'red' "Could not contact the HPR server to check show '$show'"
    _log "Unable to contact the HPR server to check show '$show'"
    exit 1
fi
if [[ $result -eq 404 ]]; then
    coloured 'red' "Could not detect show '$show' on the HPR server"
    _log "Show '$show' not on the HPR server"
    exit 1
fi
#-------------------------------------------------------------------------------
# Check the show exists on the IA
#-------------------------------------------------------------------------------
# Only the exit status of 'ia metadata --exists' matters; anything it prints
# is thrown away.
#
_verbose "Checking the show exists on the IA server"
ia metadata "$show" --exists &> /dev/null || {
    coloured 'red' "Could not detect show '$show' on the IA server"
    coloured 'yellow' "Check that archive.org is available"
    coloured 'yellow' "Try https://downfor.io/internet-archive"
    _log "Show '$show' not on the IA server"
    exit 1
}
#-------------------------------------------------------------------------------
# Check IA, collect contents, classify them
#-------------------------------------------------------------------------------
# Ask the IA for the list of files belonging to the item. The list is written
# to the file 'TMP1', one name per line. If the request fails there is nothing
# to work with and the script stops.
#
_verbose "Collecting filenames from the IA server"
if ! ia list "$show" > "$TMP1"; then
    coloured 'red' "Item $show can't be found on the IA"
    coloured 'red' "Can't continue"
    _log "Files for show '$show' not on the IA server"
    exit 1
fi
#
# Load the list into the 'iacache' array
#
while read -r iafile; do
    iacache+=("$iafile")
done < "$TMP1"
_DEBUG "IA cache" "${iacache[@]}"
#
# Sort the IA file names into categories. Each name is tested against the
# patterns in a fixed order (audio, metadata, transcript, asset) and the
# first match wins. Metadata files are skipped, and a name matching none of
# the patterns is ignored.
#
_verbose "Categorising files held on the IA"
declare -a audio ia_transcript ia_asset
audio_re='^'"${show}"'\.(flac|mp3|ogg|opus|spx|wav)$'
# transcript_re="^${show}/${show}/${show}\.(json|srt|tsv|txt|vtt)\$"
transcript_re='^'"${show}/${show}"'\.(json|srt|tsv|txt|vtt)$'
asset_re='^'"${show}"'/('"${show}"'/)?.*$'
metadata_re='^(__ia_thumb.jpg|'"${show}"'[^/]+\.(afpk|torrent|gz|xml|sqlite|png))$'
for file in "${iacache[@]}"; do
    if [[ $file =~ $audio_re ]]; then
        audio+=("$file")
        continue
    fi
    if [[ $file =~ $metadata_re ]]; then
        _verbose "Skipping $file"
        continue
    fi
    if [[ $file =~ $transcript_re ]]; then
        ia_transcript+=("$file")
        continue
    fi
    if [[ $file =~ $asset_re ]]; then
        ia_asset+=("$file")
    fi
done
#
# At verbosity level 2 show the contents of each category. The asset count is
# written to the log only in this case.
#
if [[ $VERBOSE -gt 1 ]]; then
    coloured 'cyan' "** audio (${#audio[@]}):"
    printf '%s\n' "${audio[@]}"

    coloured 'cyan' "** transcript (${#ia_transcript[@]}):"
    printf '%s\n' "${ia_transcript[@]}"

    coloured 'cyan' "** asset (${#ia_asset[@]}):"
    printf '%s\n' "${ia_asset[@]}"

    _log "IA asset count for show '$show' = ${#ia_asset[@]}"
fi
#
# If the IA holds no assets for this show there is nothing to repair, so
# stop here.
#
[[ ${#ia_asset[@]} -ne 0 ]] || {
    coloured 'green' "No IA assets found for show $show; nothing to do"
    _log "Nothing to do for show $show"
    exit
}
#-------------------------------------------------------------------------------
# Check what's on the HPR server
#-------------------------------------------------------------------------------
# The files already present below public_html/eps/<show> on the server are
# listed by running 'find' there through 'ssh'. The listing is only made when
# dry-run mode is off; in dry-run mode 'hpr_asset' stays empty, so every IA
# asset will be reported as missing further on.
#
# 'rc' is the remote command template
#
# After formatting it holds (for example, with show 'hpr1234'):
#   find public_html/eps/hpr1234 -type f -printf "hpr1234/%P\n"
# The two '%s' receive the show id, '%%P' becomes find's '%P' directive and
# '\\n' becomes the two characters '\n' for find to interpret.
#
printf -v rc 'find public_html/eps/%s -type f -printf "%s/%%P\\n"' "$show" "$show"
#
# 'command' is the local command we'll run to run a remote command on the HPR
# server
#
# CMDTPL is used as the printf format and 'rc' is inserted wrapped in single
# quotes so that it reaches 'ssh' as one argument when the string is eval'ed.
#
# shellcheck disable=SC2059 disable=SC2089
printf -v command "$CMDTPL" "'$rc'"
if [[ $VERBOSE -gt 1 ]]; then
echo "Command: $command"
fi
declare -a hpr_asset
# Paths ending in 'index.html' are not assets. The '.' is not escaped so it
# matches any character.
ignore_re="index.html$"
#
# Run the command and save the output. Save the asset names returned in an
# array. TODO: Handle errors from the command
#
# The output goes to the file 'TMP2' and is then read back line by line.
# A non-zero status from the command ends the script.
#
if [[ $DRYRUN -eq 0 ]]; then
eval "$command" > "$TMP2"
RES=$?
if [[ $RES -eq 0 ]]; then
_verbose "$(coloured 'green' "Remote command successful")"
while read -r hprfile; do
if [[ ! $hprfile =~ $ignore_re ]]; then
hpr_asset+=("${hprfile}")
fi
done < "$TMP2"
_verbose "$(coloured 'green' "Assets found on HPR server = ${#hpr_asset[@]}")"
_verbose "$(printf '%s\n' "${hpr_asset[@]}")"
_log "Assets found on HPR server = ${#hpr_asset[@]}"
else
coloured 'red' "Remote command failed"
_log "Failed while searching for HPR assets"
exit 1
fi
else
coloured 'yellow' "Would have searched for assets on the HPR server"
fi
#-------------------------------------------------------------------------------
# Compare the two asset lists and return what's missing on the HPR server
#-------------------------------------------------------------------------------
# TODO: This algorithm does not handle the instance where there are pictures
# in one directory and a lower directory containing thumbnails, AND THE FILE
# NAMES ARE THE SAME!
#
declare -a missing
find_missing ia_asset hpr_asset missing
_verbose "$(coloured 'cyan' "** missing (${#missing[@]}):")"
_verbose "$(printf '%s\n' "${missing[@]}")"
#
# Carry on only if something is missing
#
if [[ ${#missing[@]} -gt 0 ]]; then
    coloured 'yellow' \
        "Found ${#missing[@]} $(ngettext file files ${#missing[@]}) missing on the HPR server"
else
    coloured 'green' "No missing assets detected; nothing to do"
    _log "No missing assets detected; nothing to do"
    exit
fi
#-------------------------------------------------------------------------------
# Prepare to copy the missing files
#-------------------------------------------------------------------------------
make_dir "$LOCAL_FILEDIR"
declare -a downloads
#
# Check whether files are already downloaded. Anything not yet present below
# LOCAL_FILEDIR/<show> is queued for download.
#
for file in "${missing[@]}"; do
    if [[ ! -e "$LOCAL_FILEDIR/$show/$file" ]]; then
        downloads+=("$file")
    fi
done
_verbose "$(coloured 'cyan' "** downloads (${#downloads[@]}):")"
_verbose "$(printf '%s\n' "${downloads[@]}")"
#
# If we have files to download get them now. A failed download stops the
# script: carrying on would upload an incomplete set of assets and record the
# show as repaired.
#
if [[ ${#downloads[@]} -gt 0 ]]; then
    if [[ $DRYRUN -eq 1 ]]; then
        coloured 'yellow' "Would have downloaded missing files from the IA"
    else
        ia download "$show" --destdir="$LOCAL_FILEDIR" "${downloads[@]}"
        RES=$?
        if [[ $RES -eq 0 ]]; then
            coloured 'green' "Downloads complete"
            _log "Downloaded IA assets for show $show"
        else
            coloured 'red' "Download of IA assets failed"
            _log "Failed to download IA assets for show $show"
            exit 1
        fi
    fi
else
    coloured 'yellow' "IA files are already downloaded"
fi
#
# RSYNCTPL is the printf format for the upload command; its two %s are the
# local source and the remote destination.
#
# shellcheck disable=SC2089
RSYNCTPL="rsync -a -e 'ssh' %s hpr@hpr:%s"
#-------------------------------------------------------------------------------
# Build the 'ssh' command to make a directory
#-------------------------------------------------------------------------------
#
# The remote asset directory is made only if it is not already there.
#
# - $rc is the command to be run on the server
# - $command is the complete 'ssh' command with $rc inside single quotes
#
rc="if [ ! -e \"${REMOTE_ASSETDIR}\" ]; then mkdir -p \"${REMOTE_ASSETDIR}\"; fi"
# shellcheck disable=SC2059 disable=SC2089
printf -v command "$CMDTPL" "'$rc'"
#-------------------------------------------------------------------------------
# Run or report the command that would be run
#-------------------------------------------------------------------------------
# The outcome is reported either way; a failure here does not stop the script.
#
if [[ $DRYRUN -eq 0 ]]; then
    eval "$command"
    RES=$?
    case $RES in
        0) coloured 'green' "Remote directory creation successful" ;;
        *) coloured 'red' "Remote directory creation failed" ;;
    esac
else
    coloured 'yellow' "Would have created the remote directory"
    echo "$command"
fi
#-------------------------------------------------------------------------------
# Synchronise assets to the directory
#-------------------------------------------------------------------------------
# An 'rsync' over 'ssh' copies the files from
# ~/HPR/InternetArchive/assets/hprXXXX/files/hprXXXX to
# public_html/eps/hprXXXX (on the HPR server)
#
# Both paths are given a trailing '/' when the command is built. In dry-run
# mode the command is shown instead of being run.
#
# shellcheck disable=SC2059 disable=SC2089
printf -v command "$RSYNCTPL" "${LOCAL_PARENTDIR}/" "${REMOTE_PARENTDIR}/"
if [[ $DRYRUN -eq 0 ]]; then
    eval "$command"
    RES=$?
    # A failed upload is fatal
    if [[ $RES -ne 0 ]]; then
        coloured 'red' "Remote upload failed"
        exit 1
    fi
    coloured 'green' "Remote upload successful"
    _log "Uploaded assets for show $show"
else
    coloured 'yellow' "Would have synchronised local assets with the remote directory"
    echo "$command"
fi
#-------------------------------------------------------------------------------
# Make a 'manifest' file if necessary
#-------------------------------------------------------------------------------
# The manifest lists every file below LOCAL_PARENTDIR by its path relative to
# that directory. It is written only when dry-run mode is off and no manifest
# exists yet.
#
if [[ $DRYRUN -eq 0 && ! -e $MANIFEST ]]; then
    find "$LOCAL_PARENTDIR" -type f -printf '%P\n' > "$MANIFEST"
    _verbose "$(coloured 'green' "Created manifest file")"
    _log "Created manifest file $MANIFEST"
fi
#-------------------------------------------------------------------------------
# Save the notes from the database if necessary
#-------------------------------------------------------------------------------
# The notes for the show are fetched with 'query2tt2' and written to DBNOTES.
# This happens only when dry-run mode is off and the file does not exist yet.
# The numeric part of the show id (the text after 'hpr') is passed as the
# query argument.
#
if [[ $DRYRUN -eq 0 ]]; then
    if [[ ! -e $DBNOTES ]]; then
        #
        # Open the tunnel first if it is not open already
        #
        if ! "$TUNNEL_IS_OPEN"; then
            "$OPEN_TUNNEL"
        fi
        if "$Q2T" -config="$BASEDIR/.hpr_livedb.cfg" \
            -temp="$BASEDIR/query2tt2_nokey.tpl" \
            -out="$DBNOTES" \
            -dbarg="${show:3}" \
            'select notes from eps where id = ?'
        then
            _verbose "$(coloured 'green' "Created notes file")"
            _log "Created notes file $DBNOTES"
        else
            #
            # A failure is always shown, not just in verbose mode.
            # NOTE(review): if a failed run leaves a partial DBNOTES behind
            # it will not be rebuilt on the next run - confirm.
            #
            coloured 'red' "Creation of notes file failed"
            _log "Creation of notes file $DBNOTES failed"
        fi
    fi
fi
#-------------------------------------------------------------------------------
# Adjust the notes with 'fix_asset_links' (exists but not fully implemented
# yet :-)
#-------------------------------------------------------------------------------
# For the time being the path of the program is only displayed; the call
# itself is commented out. After that the repair is recorded in the log.
# Nothing is done here in dry-run mode.
#
if [[ $DRYRUN -eq 0 ]]; then
    echo "$FIXAL"
    # $FIXAL
    #
    # All done
    #
    _log "Repaired show $show"
fi
#-------------------------------------------------------------------------------
# √ Make a place to hold the files on this machine
# √ Download them from the IA
# √ Make a directory on the HPR server
# √ Copy the assets to the HPR server
# √ Modify the notes to point to the assets on the server
#-------------------------------------------------------------------------------
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker