From 19030fee718c758be6f08b3637b909732d10a061 Mon Sep 17 00:00:00 2001 From: Dave Morriss Date: Thu, 22 Aug 2024 13:13:38 +0100 Subject: [PATCH] Updates for show "repair" processing InternetArchive/future_upload: Added logging and debugging InternetArchive/ia_db.sql: Added new tables InternetArchive/recover_transcripts: New script to run on 'borg' and copy missing files from the backup disk to the IA InternetArchive/repair_assets: More comments, including one about a bug in the design. InternetArchive/repair_item: Fix relating to octal numbers (if there are leading zeroes in a number). '_DEBUG' is now in the function library. Added comments to explain obscure stuff. InternetArchive/snapshot_metadata: New Bash script (to run on my desktop) which collects metadata for a show and stores it in the '~/HPR/IA/assets' directory. Runs 'view_derivatives' on it to find derivative files for deletion. InternetArchive/tidy_uploaded: Moves files and directories containing uploaded files into a holding area for later backup. Added debugging, logging and a 'force' mode. InternetArchive/upload_manager: Manages 'ia.db' (on my workstation). Needs many updates which have just started to be added. InternetArchive/weekly_upload: Old script, now obsolete. 
--- InternetArchive/future_upload | 53 ++- InternetArchive/ia_db.sql | 70 +++- InternetArchive/recover_transcripts | 590 ++++++++++++++++++++++++++++ InternetArchive/repair_assets | 19 +- InternetArchive/repair_item | 46 ++- InternetArchive/snapshot_metadata | 197 ++++++++++ InternetArchive/tidy_uploaded | 83 +++- InternetArchive/upload_manager | 3 +- InternetArchive/weekly_upload | 6 + 9 files changed, 994 insertions(+), 73 deletions(-) create mode 100755 InternetArchive/recover_transcripts create mode 100755 InternetArchive/snapshot_metadata diff --git a/InternetArchive/future_upload b/InternetArchive/future_upload index 689d71e..1330861 100755 --- a/InternetArchive/future_upload +++ b/InternetArchive/future_upload @@ -13,9 +13,9 @@ # NOTES: Contains methods from 'delete_uploaded' and 'weekly_upload' as # well as 'update_state' # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.14 +# VERSION: 0.0.15 # CREATED: 2021-01-07 12:11:02 -# REVISION: 2024-03-03 14:12:30 +# REVISION: 2024-07-29 23:17:45 # #=============================================================================== @@ -26,7 +26,7 @@ SCRIPT=${0##*/} STDOUT="/dev/fd/2" -VERSION="0.0.14" +VERSION="0.0.15" # # Load library functions @@ -36,6 +36,8 @@ LIB="$HOME/bin/function_lib.sh" # shellcheck disable=SC1090 source "$LIB" +# {{{ -- Functions -- check_uploads, _log, _usage + #=== FUNCTION ================================================================ # NAME: check_uploads # DESCRIPTION: Determines if files exist for uploading @@ -59,6 +61,31 @@ check_uploads () { return 0 } +#=== FUNCTION ================================================================ +# NAME: _log +# DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script +# using the predefined $LOGREC, a template for 'printf'. If the +# latter is not defined the function will use a default. +# For some reason 'shellcheck' objects to this function. 
The +# first argument to 'printf' needs to be -1 to make the +# '%(fmt)T' use today's date and time. +# PARAMETERS: 1 - the message to write +# RETURNS: Nothing +#=============================================================================== +# shellcheck disable=SC2317 disable=SC2059 +_log () { + local msg="$1" + +# echo "D> $LOGFILE $LOGREC" + [ -v LOGFILE ] || { echo "${FUNCNAME[0]}: \$LOGFILE is not defined"; exit 1; } + [ -v LOGREC ] || { local LOGREC='%(%F %T)T %s\n'; } + +# echo "D> $LOGFILE $LOGREC" + printf "$LOGREC" -1 "$msg" >> "$LOGFILE" + + return +} + #=== FUNCTION ================================================================ # NAME: _usage # DESCRIPTION: Report usage @@ -108,18 +135,7 @@ endusage exit "$res" } -#=== FUNCTION ================================================================ -# NAME: _DEBUG -# DESCRIPTION: Writes a message if in DEBUG mode -# PARAMETERS: List of messages -# RETURNS: Nothing -#=============================================================================== -_DEBUG () { - [ "$DEBUG" == 0 ] && return - for msg in "$@"; do - printf 'D> %s\n' "$msg" - done -} +# }}} #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -144,6 +160,7 @@ cd "$BASEDIR" || { echo "Can't cd to $BASEDIR"; exit 1; } # LOGS="$BASEDIR/logs" LOGFILE="$LOGS/$SCRIPT.log" +LOGREC='%(%F %T)T %s\n' # # Tools @@ -157,8 +174,8 @@ UPSTATE="$BASEDIR/update_state" # # Fallback URL # -URL_BAK="http://hub.hackerpublicradio.org/cms/status.php" -QUERY2_BAK="${BASECOM} -o - ${URL_BAK}" +# URL_BAK="http://hub.hackerpublicradio.org/cms/status.php" +# QUERY2_BAK="${BASECOM} -o - ${URL_BAK}" # # Prerequisites @@ -575,4 +592,4 @@ fi exit -# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21 +# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker diff --git a/InternetArchive/ia_db.sql b/InternetArchive/ia_db.sql index 45baa34..53ce303 100644 --- a/InternetArchive/ia_db.sql +++ b/InternetArchive/ia_db.sql @@ -3,11 +3,11 @@ * ========= * * Schema 
for SQLite database 'ia.db' used to hold IA upload information - * Last updated: 2022-06-16 + * Last updated: 2024-07-15 * */ -/* +/* ---------------------------------------------------------------------------- * Table: episodes * * id show number from HPR @@ -44,7 +44,7 @@ CREATE TABLE episodes ( notes text ); -/* +/* ---------------------------------------------------------------------------- * Table: assets * * id primary key @@ -62,7 +62,7 @@ CREATE TABLE assets ( uploaded integer default 0 ); -/* +/* ---------------------------------------------------------------------------- * Index: assets_filename_idx * * Attempt to constrain duplicates in the assets table @@ -70,7 +70,7 @@ CREATE TABLE assets ( */ CREATE UNIQUE INDEX assets_filename_idx ON assets (episode_id, filename); -/* +/* ---------------------------------------------------------------------------- * Table: dirlist * * id primary key @@ -82,6 +82,66 @@ CREATE TABLE dirlist ( filename text NOT NULL ); +/* ---------------------------------------------------------------------------- + * Table: hpr_repairs + * + * episode_id Primary key, foreign key for 'episodes' + * repaired Boolean showing whether the show has been repaired + * repair_date Date of repair + * notes Notes about any anomalies + * asset_count Number of assets (after ignoring transcripts, etc) + * + */ +CREATE TABLE hpr_repairs ( + episode_id integer PRIMARY KEY REFERENCES episodes(id), + repaired integer default 0, + repair_date integer default 0, + notes text default null, + asset_count integer default 0 + +); + +/* ---------------------------------------------------------------------------- + * Table: ia_repairs + * + * episode_id Primary key, foreign key for 'episodes' + * repaired Boolean showing whether the show has been repaired + * repair_date Date of repair + * notes Notes about any anomalies + * + */ +CREATE TABLE ia_repairs ( + episode_id integer PRIMARY KEY REFERENCES episodes(id), + repaired integer default 0, + repair_date 
integer default 0, + notes text default null +); + +/* ---------------------------------------------------------------------------- + * Table: show_host_xref + * + * episode_id Foreign key for 'episodes' + * hostid Host number from MySQL database + * hostname Host name from MySQL database + * + */ +CREATE TABLE "show_host_xref" ( + "episode_id" integer, + "hostid" integer, + "hostname" text DEFAULT null, + FOREIGN KEY("episode_id") REFERENCES "episodes"("id") +); + +/* ---------------------------------------------------------------------------- + * Index: show_host_xref_idx + * + * Attempt to constrain duplicates in the show_host_xref table + * + */ +CREATE UNIQUE INDEX "show_host_xref_idx" ON "show_host_xref" ( + "episode_id" ASC +); + /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * View: episodes_view * diff --git a/InternetArchive/recover_transcripts b/InternetArchive/recover_transcripts new file mode 100755 index 0000000..c1d23e5 --- /dev/null +++ b/InternetArchive/recover_transcripts @@ -0,0 +1,590 @@ +#!/bin/bash - +#=============================================================================== +# +# FILE: recover_transcripts +# +# USAGE: ./recover_transcripts item +# +# DESCRIPTION: Intended to be run on `borg`; collects assets from the +# locally-mounted backup disk and places them in a local +# directory (organised to be compatible with the IA), then +# uploads anything that is missing on the IA. +# +# Version 0.1.* looks for assets in the 'eps/' directory and +# copies them to the cache. Also moves the IA copies so all is +# aligned. Many shows earlier than mid 2019 are likely to need +# this addition. 
+# +# OPTIONS: --- +# REQUIREMENTS: --- +# BUGS: --- +# NOTES: --- +# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com +# VERSION: 0.1.4 +# CREATED: 2024-07-14 13:22:58 +# REVISION: 2024-08-20 17:38:19 +# +#=============================================================================== + +# set -o nounset # Treat unset variables as an error + +VERSION="0.1.4" + +SCRIPT=${0##*/} +# DIR=${0%/*} + +STDOUT="/dev/fd/2" + +# +# Select the appropriate working directory for the host +# +case $(hostname) in + i7-desktop) + echo "To be run only on 'borg'" + exit 1 + ;; + borg) + BASEDIR="$HOME/IA" + REPAIRS="$BASEDIR/repairs" + BACKUP="/mnt/backup_disk/HPR/HPR-MIRROR" + ;; + *) + echo "Wrong host!" + exit 1 + ;; +esac + +cd "$BASEDIR" || { echo "Failed to cd to $BASEDIR"; exit 1; } + +# +# Load library functions (a missing library is a failure, so exit non-zero) +# +LIB="$HOME/bin/function_lib.sh" +[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; } +# shellcheck disable=SC1090 +source "$LIB" + +# +# Enable coloured messages +# +define_colours + +# +# Sanity checks +# +JQ=$(command -v jq) +[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; } +IA=$(command -v ia) +[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; } +REPIT="$BASEDIR/repair_item" +[ -e "$REPIT" ] || { echo "Program '$REPIT' was not found"; exit 1; } +IADB="$BASEDIR/ia.db" +[ -e "$IADB" ] || { echo "Database '$IADB' was not found"; exit 1; } + +# {{{ -- Functions -- _IA_move, queued_tasks, make_dir, _ifbool, _log, _verbose, _usage + + +#=== FUNCTION ================================================================ +# NAME: _IA_move +# DESCRIPTION: Performs a file move on the IA (with 'ia move'), with retries +# if it fails. Assumes the existence of functions 'coloured', +# '_log', '_verbose' and '_DEBUG' +# PARAMETERS: $1 The path to move from +# $2 The path to move to +# RETURNS: False if the number of retries is exceeded, otherwise true. 
+#=============================================================================== +_IA_move () { + local from="${1:?Usage: _IA_move from to}" + local to="${2:?Usage: _IA_move from to}" + + local retry_threshold=5 + local retries=0 + local sleeptime=20 + + # + # Printable form of the command; used only for the debug report + # + local command="ia move \"$from\" \"$to\" --no-derive --no-backup > /dev/null 2>&1" + _DEBUG "$command" + + # coloured 'blue' "Moving $from to $to" + + # + # Run 'ia move' directly rather than through 'eval', so that paths + # containing quotes, '$' or backticks are passed to it literally. + # If it succeeds then exit. If it fails enter the 'until' + # loop and report the problem, then sleep and try again. Count the number + # of times this is done, so it doesn't loop forever. If we have reached + # the limit count this as a failure and exit with an error. If we haven't + # retried enough yet, sleep for a while and try again. The intention is to + # catch the case when an upload times out. The 'ia' command is performing + # its own retries per upload when the system is overloaded, but these are + # non-fatal. + # + until ia move "$from" "$to" --no-derive --no-backup > /dev/null 2>&1; do + coloured 'red' "Failure when moving $from to $to" + ((retries++)) + + _log "$(printf 'Failed to move %s to %s [%d]' "$from" "$to" $retries)" + + [ "$retries" -eq "$retry_threshold" ] && { + _verbose \ + "$(coloured 'red' "Retry limit reached; abandoning this move")" + return 1 + } + + _verbose "$(coloured 'blue' "Pausing for $sleeptime seconds and retrying")" + sleep $sleeptime + done # until ia move ... + + coloured 'green' "Moved $from to $to on the IA" + _log "Moved $from to $to on the IA" + + return +} + +#=== FUNCTION ================================================================ +# NAME: queued_tasks +# DESCRIPTION: Queries the IA for any queued or running tasks for an item. +# Writes the number to STDOUT so it can be captured. 
+# PARAMETERS: $1 IA item (like hpr1192) +# RETURNS: Nothing +#=============================================================================== +queued_tasks () { + local item="${1:?Usage: queued_tasks item}" + local -i count=0 + + count="$(ia tasks "$item" |\ + jq -s '[.[] | if .category == "catalog" then .status else empty end] | length')" + + echo "$count" + + return +} + +#=== FUNCTION ================================================================ +# NAME: make_dir +# DESCRIPTION: Make a directory if it doesn't exist, failing gracefully on +# errors. +# PARAMETERS: $1 directory path +# RETURNS: True if success, otherwise exits the caller script +#=============================================================================== +make_dir () { + local dir="${1}" + + if [[ ! -d $dir ]]; then + mkdir -p "$dir" || { + coloured 'red' "Failed to create $dir" + exit 1 + } + fi +} + +#=== FUNCTION ================================================================ +# NAME: _ifbool +# DESCRIPTION: Simplifies conditional expressions when they need to return +# one of two strings. Use as: +# echo "Hello $(_ifbool 1 'World' 'Everyone')" → "Hello World" +# PARAMETERS: $1 Integer being tested. If 1 then it's true, otherwise +# it's false. Non-numeric is treated as 0/false. 
+# $2 String returned for True +# $3 String returned for False +# RETURNS: Nothing +#=============================================================================== +_ifbool () { + local -i _bool="${1:-0}" + local _t="${2:-true}" + local _f="${3:-false}" + + if [ "$_bool" -eq 1 ]; then + echo "$_t" + else + echo "$_f" + fi + + return +} + +#=== FUNCTION ================================================================ +# NAME: _log +# DESCRIPTION: Appends a record to the file "$LOGFILE" +# PARAMETERS: $1 Message to write +# RETURNS: Nothing +#=============================================================================== +_log () { + local message="${1}" + + echo "$(date +%F\ %T) $message" >> "$LOGFILE" +} + +#=== FUNCTION ================================================================ +# NAME: _verbose +# DESCRIPTION: Writes a message in verbose mode +# PARAMETERS: * message strings to write +# RETURNS: Nothing +#=============================================================================== +_verbose () { + [ "$VERBOSE" -eq 0 ] && return + for msg; do + printf '%s\n' "$msg" + done +} + +#=== FUNCTION ================================================================ +# NAME: _usage +# DESCRIPTION: Reports usage; always exits the script after doing so. The +# synopsis lists every option accepted by 'getopts' (including +# '-d'). +# PARAMETERS: 1 - the integer to pass to the 'exit' command +# RETURNS: Nothing +#=============================================================================== +_usage () { + local -i result=${1:-0} + + cat >$STDOUT <<-endusage +${SCRIPT} - version: ${VERSION} + +Usage: ./${SCRIPT} [-h] [-d {0|1}] [-D] [-F] [-v] item + +Attempts to repair an IA item where the upload has failed for some reason. + +Options: + -h Print this help. + -d 0|1 Dry run: -d 1 (the default) runs the script in dry-run + mode where nothing is changed but the actions that + will be taken are reported; -d 0 turns off dry-run + mode and the actions will be carried out. 
+ -D Run in debug mode where a lot more information is + reported. 
+ -F Ignore (some) interlocks that will cause failure, such + as the existence of the local cache directory for the + item being processed. + -v Run in verbose mode where more information is + reported. Default is off. + +Arguments: + item The item in the form 'hpr1234' + +endusage + exit "$result" +} + +# }}} + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#------------------------------------------------------------------------------- +# Directories and files +#------------------------------------------------------------------------------- +LOGS="$BASEDIR/logs" +make_dir "${LOGS}" +LOGFILE="$LOGS/$SCRIPT.log" + +while getopts :d:DFhv opt +do + case "${opt}" in + D) DEBUG=1;; + d) DRYRUN=$OPTARG;; + F) FORCE=1;; + h) _usage 0;; + v) VERBOSE=1;; + *) echo "** Unknown option" + _usage 1;; + esac +done +shift $((OPTIND - 1)) + +# +# Set option defaults and check their values +# +DRYRUN=${DRYRUN:-1} +if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then + echo "** Use '-d 0' or '-d 1'" + _usage 1 +fi +[[ $DRYRUN -eq 1 ]] && echo "Dry run mode" + +DEBUG=${DEBUG:-0} +[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode" + +FORCE=${FORCE:-0} + +VERBOSE=${VERBOSE:-0} + +# +# Should have one argument +# +if [[ $# != 1 ]]; then + coloured 'red' "Missing argument" + _usage 1 +fi +item="${1}" + +# +# Ensure item spec is correctly formatted +# +if [[ $item =~ hpr([0-9]+) ]]; then + printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))" +else + coloured 'red' "Incorrect show specification: $item" + coloured 'yellow' "Use 'hpr9999' format" + exit 1 +fi +_DEBUG "Parsed item: $item" + +_log "$SCRIPT $VERSION ($(_ifbool "$DRYRUN" 'dry-run' 'live'))" + +# +# Having an entry for the show in 'ia.db' is important, so check there is one +# +SQL="select 1 from episodes where id = ${item:3}" +if [[ $(sqlite3 -list "$IADB" "$SQL" 2>/dev/null) -ne 1 ]]; then + coloured 'red' "Unable to find show $item in the local IA database" + coloured 'yellow' "Can't continue" + exit 
1 +fi +_verbose "$(coloured 'yellow' "Show $item is in the local IA database")" +_log "Show $item is in the local IA database" + +# +# It's possible that the show upload failed before anything was uploaded, even +# the metadata. It's never been seen, but it seems wise to cater for it. +# +# NOTE: an earlier TODO here described this check as disabled for speed; it is +# currently enabled. +if ! ia metadata "$item" --exists > /dev/null 2>&1; then + coloured 'red' "This item is not apparently on the IA; can't continue" + exit 1 +fi +_verbose "$(coloured 'yellow' "Show $item is on the IA")" +_log "Show $item is on the IA" + +# +# Directory paths +# +FROMPARENTDIR="$BACKUP/public_html/eps" +FROMDIR="$FROMPARENTDIR/$item" +TOPARENTDIR="$REPAIRS/$item" +TOASSETDIR="$TOPARENTDIR/$item" + +# +# RE to ignore certain files using 'grep -v -E ...' +# +IGNORE="($item\.(flac|mp3|ogg|opus|spx|wav)$)" + +#------------------------------------------------------------------------------- +# Check there are asset files on the backup disk before proceeding. At least +# we need the transcripts. If no files at all we can't continue. +#------------------------------------------------------------------------------- +declare -a BACKUPFILES +mapfile -t BACKUPFILES < \ + <(find "$FROMPARENTDIR" -type f -name "$item*" | grep -v -E "${IGNORE}") +_DEBUG "$(coloured 'purple' "Backup files")" "${BACKUPFILES[@]}" + +if [[ ! -d $FROMDIR || ${#BACKUPFILES[@]} -eq 0 ]]; then + coloured 'red' "No files found in $FROMDIR" + coloured 'red' "Can't continue!" + exit 1 +fi + +_log "Files found on backup disk ${#BACKUPFILES[*]}" + +#------------------------------------------------------------------------------- +# Make the needed local cache directory for later +#------------------------------------------------------------------------------- +if [[ $FORCE -ne 1 && -e $TOPARENTDIR ]]; then + coloured 'red' "Directory $TOPARENTDIR already exists; can't continue". + coloured 'yellow' 'This implies that all files have been copied already.' 
+ coloured 'yellow' "If you're sure, consider running: '$REPIT -X -d0 $item'" + coloured 'yellow' 'Otherwise, consider running again with option -F.' + exit 1 +else + if [[ $DRYRUN -eq 1 ]]; then + coloured 'yellow' "Would have created directory $TOPARENTDIR" + else + mkdir -p "$TOASSETDIR" + _verbose "$(coloured 'yellow' "Created directory $TOASSETDIR")" + _log "Created directory $TOASSETDIR" + fi +fi + +#------------------------------------------------------------------------------- +# Collect asset data from the database +#------------------------------------------------------------------------------- +SQL="select filename from assets where episode_id = ${item:3}" +declare -a IADBASSETS +mapfile -t IADBASSETS < <(sqlite3 -list "$IADB" "$SQL" 2>/dev/null) +_DEBUG "$(coloured 'purple' "SQLite IA DB files")" "${IADBASSETS[@]}" +_log "Files found in ia.db ${#IADBASSETS[*]}" + +#------------------------------------------------------------------------------- +# Collect IA data, only original files generated by HPR. We exclude audio +# files from this set. +#------------------------------------------------------------------------------- +JQPROG='.files[] | select(.source == "original" and .format != "Metadata" and ' +JQPROG+='.format != "Item Tile") | (.name) | @text' +declare -a IAFILES +mapfile -t IAFILES < \ + <(ia metadata "$item" | $JQ -r "$JQPROG" | grep -v -E "${IGNORE}") +_DEBUG "$(coloured 'purple' "IA files (originals)")" "${IAFILES[@]}" +_log "Files found on IA (originals) ${#IAFILES[*]}" + +#------------------------------------------------------------------------------- +# Work out whether to copy assets from the backup disk, or whether to move +# files on the IA. Whatever we decide we also need to copy transcripts from +# the backup disk and upload to the IA +#------------------------------------------------------------------------------- +# +# Check each asset from the $IADB database to see if it's on the IA. 
We'll get back +# a path if it's where we want it, otherwise just a filename. +# +declare -a MOVES + +coloured 'purple' "Checking IA files for moves" + +# +# If we find an asset by looking for its basename in the list of files we got +# from the IA and if they are the same we need to move such files to the +# sub-directory. +# +for asset in "${IAFILES[@]}"; do + # + # Skip IA files with directories + # + if [[ $asset =~ / ]]; then + continue + fi + + # + # Look for an exact, whole-line match ('-F -x'). The file name must not be + # used as a regular expression ('.' would match any character) and must + # not match as a substring of another name, since several matching lines + # would never compare equal to the name and the move would be skipped. + # + IA_match=$( grep -m1 -Fx -- "${asset}" <(printf '%s\n' "${IADBASSETS[@]}") ) + if [[ $IA_match = "$asset" ]]; then + MOVES+=("$IA_match") + fi +done + +# +# If we found any moves then we can move them in the IA item now and copy the +# files from the backup disk to the cache in case we need them. They will +# eventually get deleted by 'cron'. +# +# The move counter is initialised here, outside the test, because it is +# examined later whether or not any moves were found. +# +mcount=0 +if [[ ${#MOVES[@]} -gt 0 ]]; then + _DEBUG "$(coloured 'purple' "Files to be moved")" "${MOVES[@]}" "----" + + for asset in "${MOVES[@]}"; do + # source & destination for IA moves + iafrom="$item/$asset" + iato="$item/$item/$asset" + _DEBUG "\$iafrom: $iafrom" "\$iato: $iato" "" + + # + # If IA source and destination are the same no moves are needed. For + # the local cache the later 'rsync' will be enough. + # + if [[ $iafrom != "$iato" ]]; then + if [[ $DRYRUN -eq 1 ]]; then + coloured 'yellow' "ia move $iafrom $iato --no-derive --no-backup" + coloured 'yellow' "cp $FROMPARENTDIR/$asset $TOASSETDIR/" + else + # + # Perform the move. If the retries are exceeded things get + # complicated, so just abort so we can try again later. 
+ # + _verbose "$(coloured 'blue' "Moving $iafrom → $iato on IA")" + if _IA_move "$iafrom" "$iato"; then + # + # Update the cache (but only if the move occurred) + # + _verbose "$(coloured 'blue' "Copying from backup disk to cache")" + cp "$FROMPARENTDIR/$asset" "$TOASSETDIR/" + + ((mcount++)) + else + coloured 'red' "Retries exhausted. 
Aborting recovery" + exit 1 + fi + fi + fi + + done + # + # Report what was done + # + coloured 'green' "Moved $mcount $(ngettext file files "$mcount")" + _log "Moved $mcount $(ngettext file files "$mcount")" + +else + coloured 'yellow' "No moves needed" + _log "No moves needed" +fi + +# +# Wait for the IA moves to finish +# +if [[ $DRYRUN -eq 0 ]]; then + if [[ $mcount -gt 0 ]]; then + until [[ $(queued_tasks "$item") -eq 0 ]]; do + coloured 'yellow' "Waiting for IA tasks to complete" + sleep 1m + done + fi +else + if [[ $mcount -gt 0 ]]; then + coloured 'yellow' "Would have waited for any IA tasks to complete" + fi +fi + +#------------------------------------------------------------------------------- +# Copy files from the backup disk to the cache +#------------------------------------------------------------------------------- +if [[ $DRYRUN -eq 1 ]]; then + coloured 'yellow' "Would have copied files from backup disk → cache" + rsync -n -vaP --exclude=index.html "$FROMDIR" "$TOPARENTDIR" +else + rsync -vaP --exclude=index.html "$FROMDIR" "$TOPARENTDIR" + _verbose "$(coloured 'yellow' "Copied files from $FROMDIR")" + _log "Copied files from $FROMDIR" +fi + +# TODO: Is this needed? +# +# Put any source audio in the right place. +# +# if [[ $DRYRUN -eq 1 ]]; then +# coloured 'yellow' "Would have moved source files if found" +# else +# # +# # Turn on 'nullglob' to get an empty result if the glob expression doesn't +# # match. +# # +# NG=$(shopt -p nullglob) +# shopt -s nullglob +# +# # +# # Any source files should be in repairs/hpr1234/ and should go to the IA +# # in the comparable place. We will not put it on the HPR server though. +# # +# # TODO: Is this right? 
+# movecount=0 +# for file in "$TOPARENTDIR"/*_source.*; do +# if mv "$file" "$TOPARENTDIR"; then +# ((movecount++)) +# fi +# done +# +# eval "$NG" +# +# # +# # Show the directories after any move +# # +# if [[ $movecount -gt 0 ]]; then +# _verbose "$(coloured 'yellow' "Moved source file(s)")" +# ls -lR "$REPAIRS/$item/" +# fi +# +# fi + +#------------------------------------------------------------------------------- +# Using the cache as the reference upload whatever is missing to the IA +#------------------------------------------------------------------------------- +if [[ $DRYRUN -eq 1 ]]; then + coloured 'yellow' "Would have found and repaired missing files" +else + _verbose "$(coloured 'yellow' "Finding and repairing missing files")" + _log "Finding and repairing missing files (with $REPIT)" + "$REPIT" -X -d0 "$item" +fi + +# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker + diff --git a/InternetArchive/repair_assets b/InternetArchive/repair_assets index c00b131..3371427 100755 --- a/InternetArchive/repair_assets +++ b/InternetArchive/repair_assets @@ -15,15 +15,15 @@ # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.6 +# VERSION: 0.0.7 # CREATED: 2024-05-10 21:26:31 -# REVISION: 2024-07-10 15:12:54 +# REVISION: 2024-08-04 19:40:52 # #=============================================================================== # set -o nounset # Treat unset variables as an error -VERSION="0.0.6" +VERSION="0.0.7" SCRIPT=${0##*/} # DIR=${0%/*} @@ -357,7 +357,7 @@ else exit 1 fi -_DEBUG "$(printf '%s\n' "${iacache[@]}")" +_DEBUG "IA cache" "${iacache[@]}" # # Determine which files are assets @@ -461,6 +461,10 @@ fi #------------------------------------------------------------------------------- # Compare the two asset lists and return what's missing on the HPR server #------------------------------------------------------------------------------- +# TODO: This algorithm does not handle the instance where there are 
pictures +# in one directory and a lower directory containing thumbnails, AND THE FILE +# NAMES ARE THE SAME! +# declare -a missing find_missing ia_asset hpr_asset missing _verbose "$(coloured 'cyan' "** missing (${#missing[@]}):")" @@ -471,7 +475,8 @@ if [[ ${#missing[@]} -eq 0 ]]; then _log "No missing assets detected; nothing to do" exit else - coloured 'yellow' "Found ${#missing[@]} files missing on the HPR server" + coloured 'yellow' \ + "Found ${#missing[@]} $(ngettext file files ${#missing[@]}) missing on the HPR server" fi #------------------------------------------------------------------------------- @@ -548,6 +553,10 @@ fi #------------------------------------------------------------------------------- # Synchronise assets to the directory #------------------------------------------------------------------------------- +# We perform an 'rsync' over 'ssh' to synchronise files from +# ~/HPR/InternetArchive/assets/hprXXXX/files/hprXXXX to +# public_html/eps/hprXXXX (on the HPR server) +# # shellcheck disable=SC2059 disable=SC2089 printf -v command "$RSYNCTPL" "$LOCAL_PARENTDIR/" "$REMOTE_PARENTDIR/" diff --git a/InternetArchive/repair_item b/InternetArchive/repair_item index 7700330..107997e 100755 --- a/InternetArchive/repair_item +++ b/InternetArchive/repair_item @@ -6,19 +6,24 @@ # USAGE: ./repair_item [-h] [-v] [-d {0|1}] [-D] [-l N] [-X] itemname # # DESCRIPTION: Repairs an IA "item" (HPR show) if something has failed during -# the upload. +# the upload (and when recovering deleted files from the +# changeover to the HPR static site). # # The most common failures are caused by the file upload # processes timing out and being aborted (by the 'ia' tool which # performs the item creation and the uploads). This failure # means that a show being processed on 'borg' does not get all -# of the components loaded to the IA. +# of the components loaded to the IA. 
This happens during the +# sequence of running the 'make_metadata' Perl script which +# generates a CSV file of show data, followed by 'ia metadata +# --spreadsheet='. Failures in the second part cause +# it to be aborted # # This script looks at the files belonging to the show (stored # temporarily on 'borg') and determines which have not been # uploaded, then takes steps to perform the uploads. # -# Version 0.0.10 onwards has the capability to repair an IA item +# Version 0.0.11 onwards has the capability to repair an IA item # from the HPR backup disk. This seems to be necessary because # the transcripts were not carried over (although we are # adding them to the IA for new shows now, older ones were never @@ -30,20 +35,24 @@ # source file is in the upper one. This emulates the placement # on the IA itself. # +# This script can be called directly to recover a new show which +# failed during creation/upload, or by 'recover_transcripts' +# which is repairing shows with missing assets. +# # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.10 +# VERSION: 0.0.11 # CREATED: 2020-01-05 22:42:46 -# REVISION: 2024-07-12 14:39:38 +# REVISION: 2024-07-20 17:06:10 # #=============================================================================== #set -o nounset # Treat unset variables as an error -VERSION="0.0.10" +VERSION="0.0.11" SCRIPT=${0##*/} # DIR=${0%/*} @@ -55,6 +64,7 @@ STDOUT="/dev/fd/2" # case $(hostname) in i7-desktop) + # TODO: consider not allowing this to be run anywhere but on 'borg' BASEDIR="$HOME/HPR/InternetArchive" UPLOADS="$HOME/HPR/IA/uploads" REPAIRS="$BASEDIR/repairs" @@ -100,7 +110,7 @@ TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT -# {{{ -- Functions -- Upload, exists_in, queued_tasks, _DEBUG, _usage +# {{{ -- Functions -- Upload, exists_in, queued_tasks, _usage #=== FUNCTION 
================================================================ # NAME: Upload @@ -174,19 +184,6 @@ queued_tasks () { return } -#=== FUNCTION ================================================================ -# NAME: _DEBUG -# DESCRIPTION: Writes a message if in DEBUG mode -# PARAMETERS: List of messages -# RETURNS: Nothing -#=============================================================================== -_DEBUG () { - [ "$DEBUG" == 0 ] && return - for msg in "$@"; do - printf 'D> %s\n' "$msg" - done -} - #=== FUNCTION ================================================================ # NAME: _usage # DESCRIPTION: Reports usage; always exits the script after doing so @@ -297,10 +294,11 @@ fi item="${1}" # -# Ensure item spec is correctly formatted +# Ensure item spec is correctly formatted. Have to cater for leading zeroes +# being interpreted as octal. # if [[ $item =~ hpr([0-9]+) ]]; then - printf -v item 'hpr%04d' "${BASH_REMATCH[1]}" + printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))" else coloured 'red' "Incorrect show specification: $item" coloured 'yellow' "Use 'hpr9999' format" @@ -310,7 +308,7 @@ _DEBUG "Parsed item: $item" # # It's possible that the show upload failed before anything was uploaded, even -# the metadata. It's never been seen, but it seems wise to cater for it. +# the metadata. It's rarely seen, but it seems wise to cater for it. # if ! ia metadata "$item" --exists > /dev/null 2>&1; then coloured 'red' "This item is not apparently on the IA; can't continue" @@ -323,7 +321,7 @@ fi # mysteriously vanished from the IA. The directories here are equivalent to # those used by 'repair_assets'. There is a top-level directory the represents # the IA item, and below that a hierarchy defining placement under the item. -# There is a 'repairs' directory per host in case we need to preair IA stuff +# There is a 'repairs' directory per host in case we need to repair IA stuff # from elsewhere. 
# if [[ $EXTENDED -eq 1 ]]; then diff --git a/InternetArchive/snapshot_metadata b/InternetArchive/snapshot_metadata new file mode 100755 index 0000000..2f34abb --- /dev/null +++ b/InternetArchive/snapshot_metadata @@ -0,0 +1,197 @@ +#!/bin/bash - +#=============================================================================== +# +# FILE: snapshot_metadata +# +# USAGE: ./snapshot_metadata episode_number +# +# DESCRIPTION: Collects metadata from the IA for a given show and stores it +# in the cache. +# +# OPTIONS: --- +# REQUIREMENTS: --- +# BUGS: --- +# NOTES: --- +# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com +# VERSION: 0.0.2 +# CREATED: 2024-08-16 20:36:51 +# REVISION: 2024-08-17 10:31:15 +# +#=============================================================================== + +set -o nounset # Treat unset variables as an error + +VERSION="0.0.2" + +SCRIPT=${0##*/} +# DIR=${0%/*} + +STDOUT="/dev/fd/2" + +# +# Select the appropriate working directory for the host +# +case $(hostname) in + i7-desktop) + BASEDIR="$HOME/HPR/InternetArchive" + ;; + borg) + BASEDIR="$HOME/IA" + ;; + *) + echo "Wrong host!" + exit 1 + ;; +esac + +cd "$BASEDIR" || { echo "Failed to cd to $BASEDIR"; exit 1; } + +# +# Load library functions (exit with a failure status if the library is missing) +# +LIB="$HOME/HPR/function_lib.sh" +[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; } +# shellcheck disable=SC1090 +source "$LIB" + +# +# Enable coloured messages +# +define_colours + +# +# Sanity checks +# +IA=$(command -v ia) +[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; } +VIEWD="$BASEDIR/view_derivatives" +[ -e "$VIEWD" ] || { echo "Program '$VIEWD' was not found"; exit 1; } + +# {{{ -- Functions -- make_dir, _usage + +#=== FUNCTION ================================================================ +# NAME: make_dir +# DESCRIPTION: Make a directory if it doesn't exist, failing gracefully on +# errors. 
+# PARAMETERS: $1 directory path +# RETURNS: True if success, otherwise exits the caller script +#=============================================================================== +make_dir () { + local dir="${1}" + + if [[ ! -d $dir ]]; then + mkdir -p "$dir" || { + coloured 'red' "Failed to create $dir" + exit 1 + } + fi +} + +#=== FUNCTION ================================================================ +# NAME: _usage +# DESCRIPTION: Reports usage; always exits the script after doing so +# PARAMETERS: 1 - the integer to pass to the 'exit' command +# RETURNS: Nothing +#=============================================================================== +_usage () { + local -i result=${1:-0} + + cat >$STDOUT <<-endusage +${SCRIPT} - version: ${VERSION} + +Usage: ./${SCRIPT} showid + +Collects notes for a show and adds them to the cache directory + +Arguments: + showid The show id in the form 'hpr1234' + +endusage + exit "$result" +} + +# }}} + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#------------------------------------------------------------------------------- +# Argument check +#------------------------------------------------------------------------------- +# Should have one argument +# +if [[ $# != 1 ]]; then + coloured 'red' "Missing argument" + _usage 1 +fi +show="${1,,}" + +# +# Ensure show id is 'hpr1234' format; use base 10 as leading zeroes imply octal +# +if [[ $show =~ (hpr)?([0-9]+) ]]; then + printf -v show 'hpr%04d' "$((10#${BASH_REMATCH[2]}))" +else + coloured 'red' "Incorrect show specification: $show" + coloured 'yellow' "Use 'hpr9999' or '9999' format" + exit 1 +fi + + +#------------------------------------------------------------------------------- +# Setting up paths +#------------------------------------------------------------------------------- +# +# CACHEDIR is where we store asset details and files +# +CACHEDIR="$BASEDIR/assets" +[ ! 
-d "$CACHEDIR" ] && { + coloured 'red' "Creating cache directory" + make_dir "$CACHEDIR" +} + +# +# Pointers into the cache: +# LOCAL_ASSETDIR - where the cache for this show lives +# +LOCAL_ASSETDIR="$CACHEDIR/${show}" +[ ! -d "$LOCAL_ASSETDIR" ] && { + coloured 'green' "Creating cache directory for $show" + make_dir "$LOCAL_ASSETDIR" +} + +METADATA="$CACHEDIR/$show/metadata.json" +DERIVED="$CACHEDIR/$show/derived.lis" + +#------------------------------------------------------------------------------- +# Save the IA metadata unless we already have a non-empty copy of the file +#------------------------------------------------------------------------------- +if [[ ! -s $METADATA ]]; then + if ia metadata "$show" > "$METADATA"; then + coloured 'green' "Created metadata file" + if [[ ! -s $METADATA ]]; then + coloured 'red' "Metadata file is empty" + fi + else + coloured 'red' "Creation of metadata file failed" + rm -f "$METADATA"; exit 1 + fi +else + coloured 'yellow' "Metadata already exists, not replacing it" +fi + +#------------------------------------------------------------------------------- +# Use the collected metadata to view the state of the IA, and collect the derived file names +#------------------------------------------------------------------------------- +coloured 'blue' "Viewing IA files" +"$VIEWD" -verb "$METADATA" + +if "$VIEWD" -list "$METADATA" > "$DERIVED"; then + nfiles="$(wc -l < "$DERIVED")" + coloured 'green' "Saved 'derived' files for show $show ($nfiles)" +else + coloured 'red' "Creation of $DERIVED file failed" +fi + +exit + +# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker diff --git a/InternetArchive/tidy_uploaded b/InternetArchive/tidy_uploaded index ebc59f3..fee3a0a 100755 --- a/InternetArchive/tidy_uploaded +++ b/InternetArchive/tidy_uploaded @@ -13,15 +13,15 @@ # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.10 +# VERSION: 0.0.11 # CREATED: 2022-03-30 17:38:01 -# REVISION: 
2024-07-29 18:24:26 # #=============================================================================== set -o nounset # Treat unset variables as an error -VERSION="0.0.10" +VERSION="0.0.11" SCRIPT=${0##*/} # DIR=${0%/*} @@ -55,6 +55,8 @@ case $HOSTNAME in *) echo "Wrong host!"; exit 1 ;; esac +# {{{ -- Functions -- exists_in, queued_tasks, movefile, is_empty, _log, _usage + #=== FUNCTION ================================================================ # NAME: exists_in # DESCRIPTION: Checks the existence of a key in an associative array @@ -99,9 +101,12 @@ queued_tasks () { # RETURNS: True if a move was done, otherwise False #=============================================================================== movefile () { - local fromdir="${1:?Usage: movefile fromdir todir path}" - local todir="${2:?Usage: movefile fromdir todir path}" - local path="${3:?Usage: movefile fromdir todir path}" + local fromdir="${1:?Usage: movefile fromdir todir path [FORCE]}" + local todir="${2:?Usage: movefile fromdir todir path [FORCE]}" + local path="${3:?Usage: movefile fromdir todir path [FORCE]}" + local FORCE="${4:-0}" + + [[ ! -v FORCE ]] && FORCE=0 # # Chop up the path. If it's just a file name then $dir and $file are the @@ -126,8 +131,16 @@ movefile () { # TODO: Compare the two files? 
# if [[ -e $todir/$path ]]; then - echo "File already exists: $todir/$path" - return 1 + if [[ $FORCE -eq 1 ]]; then + echo "File exists: $todir/$path" + echo "FORCE mode is ON so overwriting" + mv --force "$fromdir/$path" "$todir/$path" + echo "Moved $fromdir/$path" + return 0 + else + echo "File already exists: $todir/$path" + return 1 + fi else mv "$fromdir/$path" "$todir/$path" echo "Moved $fromdir/$path" @@ -147,16 +160,28 @@ is_empty() { } #=== FUNCTION ================================================================ -# NAME: _DEBUG -# DESCRIPTION: Writes a message if in DEBUG mode -# PARAMETERS: List of messages +# NAME: _log +# DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script +# using the predefined $LOGREC, a template for 'printf'. If the +# latter is not defined the function will use a default. +# For some reason 'shellcheck' objects to this function. The +# first argument to 'printf' needs to be -1 to make the +# '%(fmt)T' use today's date and time. +# PARAMETERS: 1 - the message to write # RETURNS: Nothing #=============================================================================== -_DEBUG () { - [ "$DEBUG" == 0 ] && return - for msg in "$@"; do - printf 'D> %s\n' "$msg" - done +# shellcheck disable=SC2317 disable=SC2059 +_log () { + local msg="$1" + +# echo "D> $LOGFILE $LOGREC" + [ -v LOGFILE ] || { echo "${FUNCNAME[0]}: \$LOGFILE is not defined"; exit 1; } + [ -v LOGREC ] || { local LOGREC='%(%F %T)T %s\n'; } + +# echo "D> $LOGFILE $LOGREC" + printf "$LOGREC" -1 "$msg" >> "$LOGFILE" + + return } #=== FUNCTION ================================================================ @@ -189,6 +214,11 @@ Options: to stop at. -D Run in debug mode where a lot more information is reported + -F Turn on FORCE mode (normally off). In this mode when + the files being tidied (moved) already exist, they are + overwritten. 
This is for the very rare case when + a show's audio has to be re-uploaded because of bad + audio or the wrong file being sent. Examples ./tidy_uploaded # Run in (default) dry-run mode @@ -196,11 +226,14 @@ Examples ./tidy_uploaded -d0 # Live mode (without verbose messages) ./tidy_uploaded -c1 # Process 1 show in dry-run mode ./tidy_uploaded -D # Run with debugging enabled + ./tidy_uploaded -F # Run with FORCE mode on endusage exit "$res" } +# }}} + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -208,16 +241,18 @@ endusage # LOGS="$BASEDIR/logs" LOGFILE="$LOGS/$SCRIPT.log" +LOGREC='%(%F %T)T %s\n' # # Process options # -while getopts :c:d:Dhv opt +while getopts :c:d:DFhv opt do case "${opt}" in c) COUNT=$OPTARG;; D) DEBUG=1;; d) DRYRUN=$OPTARG;; + F) FORCE=1;; h) _usage 0;; v) VERBOSE=1;; *) echo "** Unknown option" @@ -239,6 +274,9 @@ if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then fi [[ $DRYRUN -eq 1 ]] && echo "Dry run mode" +FORCE=${FORCE:-0} +[[ $FORCE -eq 1 ]] && echo "Force mode - overwriting existing files" + VERBOSE=${VERBOSE:-0} DEBUG=${DEBUG:-0} @@ -361,12 +399,17 @@ while read -r path; do # # A file on the IA exists in the upload area. Move the # local one if we're not in dry-run mode, otherwise just - # report the move we would do. + # report the move we would do. If FORCE mode is on + # overwrite the file. 
# if [[ $DRYRUN -eq 0 ]]; then - movefile "$UPLOADS" "$ARCHIVE" "$file" && ((moves++)) + movefile "$UPLOADS" "$ARCHIVE" "$file" "$FORCE" && ((moves++)) else - printf 'Would move %s\n\tto %s\n' "$frompath" "$topath" + if [[ $FORCE -eq 0 ]]; then + printf 'Would move %s\n\tto %s\n' "$frompath" "$topath" + else + printf 'Would move %s\n\toverwriting %s\n' "$frompath" "$topath" + fi fi fi done < "$TMP1" diff --git a/InternetArchive/upload_manager b/InternetArchive/upload_manager index d6e6d45..dd13530 100755 --- a/InternetArchive/upload_manager +++ b/InternetArchive/upload_manager @@ -33,11 +33,12 @@ # #=============================================================================== -use 5.010; +use v5.36; use strict; use warnings; use utf8; #use experimental 'smartmatch'; +# TODO: use experimental::try; use match::smart; diff --git a/InternetArchive/weekly_upload b/InternetArchive/weekly_upload index 80376b0..085126b 100755 --- a/InternetArchive/weekly_upload +++ b/InternetArchive/weekly_upload @@ -8,6 +8,9 @@ # DESCRIPTION: Run the commands necessary to upload a batch of HPR shows to # archive.org # +# ** NOW OBSOLETE ** +# We do uploads differently now. +# # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- @@ -19,6 +22,9 @@ # #=============================================================================== +echo "Obsolete script. Do not use!" +cmd='exit'; $cmd + set -o nounset # Treat unset variables as an error SCRIPT=${0##*/}