hpr-tools/InternetArchive/recover_transcripts

591 lines
19 KiB
Plaintext
Raw Permalink Normal View History

#!/bin/bash -
#===============================================================================
#
# FILE: recover_transcripts
#
# USAGE: ./recover_transcripts [-h] [-d 0|1] [-D] [-F] [-v] item
#
# DESCRIPTION: Intended to be run on `borg`; collects assets from the
# locally-mounted backup disk and places them in a local
# directory (organised to be compatible with the IA), then
# uploads anything that is missing on the IA.
#
# Version 0.1.* looks for assets in the 'eps/' directory and
# copies them to the cache. Also moves the IA copies so all is
# aligned. Many shows earlier than mid 2019 are likely to need
# this addition.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.1.4
# CREATED: 2024-07-14 13:22:58
# REVISION: 2024-08-20 17:38:19
#
#===============================================================================
# set -o nounset # Treat unset variables as an error
VERSION="0.1.4"
SCRIPT=${0##*/}
# DIR=${0%/*}

# Where the usage text is written. Despite the name this is file descriptor 2
# (standard error).
STDOUT="/dev/fd/2"

#
# Select the appropriate working directory for the host. Only 'borg' defines
# the directories; every other host stops here.
#
case $(hostname) in
    i7-desktop)
        echo "To be run only on 'borg'"
        exit 1
        ;;
    borg)
        BASEDIR="$HOME/IA"
        REPAIRS="$BASEDIR/repairs"
        BACKUP="/mnt/backup_disk/HPR/HPR-MIRROR"
        ;;
    *)
        echo "Wrong host!"
        exit 1
        ;;
esac

cd "$BASEDIR" || { echo "Failed to cd to $BASEDIR"; exit 1; }

#
# Load library functions. A missing library is a failure, so exit with
# a non-zero status (a bare 'exit' would return the status of the 'echo').
#
LIB="$HOME/bin/function_lib.sh"
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"

#
# Enable coloured messages (presumably provided by the library just sourced)
#
define_colours

#
# Sanity checks: required programs, the repair script and the local database
#
JQ=$(command -v jq)
[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }

IA=$(command -v ia)
[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; }

# sqlite3 is run later with its error output discarded, so without this check
# its absence would be misreported as a missing show.
command -v sqlite3 > /dev/null ||
    { echo "Program 'sqlite3' was not found"; exit 1; }

REPIT="$BASEDIR/repair_item"
[ -e "$REPIT" ] || { echo "Program '$REPIT' was not found"; exit 1; }

IADB="$BASEDIR/ia.db"
[ -e "$IADB" ] || { echo "Database '$IADB' was not found"; exit 1; }
# {{{ -- Functions -- _IA_move, queued_tasks, _verbose, _usage
#=== FUNCTION ================================================================
# NAME: _IA_move
# DESCRIPTION: Performs a file move on the IA, with retries if it fails.
# Assumes the existence of functions 'coloured', '_log',
# '_verbose' and '_DEBUG'
# PARAMETERS: $1 The path to move from
# $2 The path to move to
# RETURNS: False if the number of retries is exceeded, otherwise true.
#===============================================================================
_IA_move () {
    local from="${1:?Usage: _IA_move from to}"
    local to="${2:?Usage: _IA_move from to}"
    local retry_threshold=5
    local retries=0
    local sleeptime=20

    _DEBUG "ia move \"$from\" \"$to\" --no-derive --no-backup"

    #
    # Run 'ia move' directly rather than through 'eval' on a constructed
    # string, so paths containing quotes, '$' or other shell metacharacters
    # are passed through untouched.
    #
    # If the move succeeds the loop is never entered. If it fails, report the
    # problem, count the failure and either give up (retry limit reached) or
    # sleep and try again. The intention is to catch the case when the
    # operation times out. The 'ia' command is performing its own retries when
    # the system is overloaded, but these are non-fatal.
    #
    until ia move "$from" "$to" --no-derive --no-backup > /dev/null 2>&1; do
        coloured 'red' "Failure when moving $from to $to"
        ((retries++))
        _log "$(printf 'Failed to move %s to %s [%d]' "$from" "$to" "$retries")"

        if [ "$retries" -ge "$retry_threshold" ]; then
            _verbose \
                "$(coloured 'red' "Retry limit reached; abandoning this move")"
            return 1
        fi

        _verbose "$(coloured 'blue' "Pausing for $sleeptime seconds and retrying")"
        sleep "$sleeptime"
    done

    coloured 'green' "Moved $from to $to on the IA"
    _log "Moved $from to $to on the IA"

    # The move itself succeeded; don't let a logging hiccup report failure
    return 0
}
#=== FUNCTION ================================================================
# NAME: queued_tasks
# DESCRIPTION: Asks the IA for the task list of an item and counts the
# entries whose category is "catalog". The count is written to
# STDOUT so the caller can capture it.
# PARAMETERS: $1 IA item (like hpr1192)
# RETURNS: Nothing
#===============================================================================
queued_tasks () {
    local item="${1:?Usage: queued_tasks item}"
    local filter='[.[] | if .category == "catalog" then .status else empty end] | length'
    # Integer attribute: an empty result from the pipeline is stored as 0
    local -i pending=0

    pending="$(ia tasks "$item" | jq -s "$filter")"

    printf '%s\n' "$pending"
    return
}
#=== FUNCTION ================================================================
# NAME: make_dir
# DESCRIPTION: Ensures a directory exists, creating it (and any missing
# parents) when necessary. A creation failure is reported and
# terminates the calling script.
# PARAMETERS: $1 directory path
# RETURNS: True if success, otherwise exits the caller script
#===============================================================================
make_dir () {
    local target="${1}"

    # Nothing to do when the directory is already there
    [[ -d $target ]] && return

    if ! mkdir -p "$target"; then
        coloured 'red' "Failed to create $target"
        exit 1
    fi
}
#=== FUNCTION ================================================================
# NAME: _ifbool
# DESCRIPTION: Simplifies conditional expressions when they need to return
# one of two strings. Use as:
# echo "Hello $(_ifbool 1 'World' 'Everyone')" → "Hello World"
# PARAMETERS: $1 Integer being tested. If 1 then it's true, otherwise
# it's false. Non-numeric is treated as 0/false.
# $2 String returned for True (default 'true')
# $3 String returned for False (default 'false')
# RETURNS: Nothing
#===============================================================================
_ifbool () {
    local _bool="${1:-0}"
    local _t="${2:-true}"
    local _f="${3:-false}"

    #
    # The value is deliberately NOT given the integer attribute: that would
    # evaluate it as an arithmetic expression, so a variable name (or worse)
    # passed as $1 could be treated as true. Only a literal integer equal to
    # 1 counts as true here.
    #
    if [[ $_bool =~ ^[+-]?[0-9]+$ ]] && [ "$_bool" -eq 1 ]; then
        echo "$_t"
    else
        echo "$_f"
    fi

    return
}
#=== FUNCTION ================================================================
# NAME: _log
# DESCRIPTION: Adds one timestamped line (YYYY-MM-DD HH:MM:SS message) to the
# end of the file named by "$LOGFILE"
# PARAMETERS: $1 Message to write
# RETURNS: Nothing
#===============================================================================
_log () {
    local message="${1}"
    local stamp

    stamp="$(date '+%F %T')"
    printf '%s %s\n' "$stamp" "$message" >> "$LOGFILE"
}
#=== FUNCTION ================================================================
# NAME: _verbose
# DESCRIPTION: Prints each argument on its own line, but only when the global
# VERBOSE is not 0
# PARAMETERS: * message strings to write
# RETURNS: Nothing
#===============================================================================
_verbose () {
    # Quiet mode: say nothing
    if [ "$VERBOSE" -eq 0 ]; then
        return
    fi

    # Consume the arguments one at a time
    while [ "$#" -gt 0 ]; do
        printf '%s\n' "$1"
        shift
    done
}
#=== FUNCTION ================================================================
# NAME: _usage
# DESCRIPTION: Reports usage; always exits the script after doing so. The
# text is written to the file named by the global STDOUT
# (falling back to /dev/fd/2 if that is unset).
# PARAMETERS: 1 - the integer to pass to the 'exit' command (default 0)
# RETURNS: Nothing
#===============================================================================
_usage () {
    local -i result=${1:-0}

    # The synopsis lists every option described below, including '-d 0|1'
    cat > "${STDOUT:-/dev/fd/2}" <<-endusage
${SCRIPT} - version: ${VERSION}
Usage: ./${SCRIPT} [-h] [-d 0|1] [-D] [-F] [-v] item
Attempts to repair an IA item where the upload has failed for some reason.
Options:
-h Print this help.
-d 0|1 Dry run: -d 1 (the default) runs the script in dry-run
mode where nothing is changed but the actions that
will be taken are reported; -d 0 turns off dry-run
mode and the actions will be carried out.
-D Run in debug mode where a lot more information is
reported.
-F Ignore (some) interlocks that will cause failure, such
as the existence of the local cache directory for the
item being processed.
-v Run in verbose mode where more information is
reported. Default is off.
Arguments:
item The item in the form 'hpr1234'
endusage

    exit "$result"
}
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#-------------------------------------------------------------------------------
# Directories and files
#-------------------------------------------------------------------------------
LOGS="$BASEDIR/logs"
make_dir "${LOGS}"
LOGFILE="$LOGS/$SCRIPT.log"

#
# Process options. The leading ':' in the option string selects silent error
# reporting, so a missing option argument arrives as ':' and an unknown option
# as '?'; each gets its own message.
#
while getopts :d:DFhv opt
do
    case "${opt}" in
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        F) FORCE=1;;
        h) _usage 0;;
        v) VERBOSE=1;;
        :) echo "** Option -$OPTARG requires an argument"
            _usage 1;;
        *) echo "** Unknown option"
            _usage 1;;
    esac
done
shift $((OPTIND - 1))

#
# Set option defaults and check their values
#
# DRYRUN is compared as a string. A numeric test would evaluate a value such
# as 'yes' arithmetically (giving 0), which would silently turn dry-run mode
# OFF.
#
DRYRUN=${DRYRUN:-1}
if [[ $DRYRUN != 0 && $DRYRUN != 1 ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $DRYRUN -eq 1 ]] && echo "Dry run mode"

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"

FORCE=${FORCE:-0}

VERBOSE=${VERBOSE:-0}

#
# Should have one argument
#
if [[ $# != 1 ]]; then
    coloured 'red' "Missing argument"
    _usage 1
fi
item="${1}"

#
# Ensure item spec is correctly formatted. The expression is anchored at both
# ends so that only 'hpr' followed by digits is accepted; the number is then
# normalised to at least four digits (forcing base 10 so leading zeroes are
# not taken as octal).
#
if [[ $item =~ ^hpr([0-9]+)$ ]]; then
    printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))"
else
    coloured 'red' "Incorrect show specification: $item"
    coloured 'yellow' "Use 'hpr9999' format"
    exit 1
fi
_DEBUG "Parsed item: $item"

_log "$SCRIPT $VERSION ($(_ifbool "$DRYRUN" 'dry-run' 'live'))"

#
# Having an entry for the show in 'ia.db' is important, so check there is one.
# ${item:3} is the numeric part of the validated item name.
#
SQL="select 1 from episodes where id = ${item:3}"
if [[ $(sqlite3 -list "$IADB" "$SQL" 2>/dev/null) -ne 1 ]]; then
    coloured 'red' "Unable to find show $item in the local IA database"
    coloured 'yellow' "Can't continue"
    exit 1
fi
_verbose "$(coloured 'yellow' "Show $item is in the local IA database")"
_log "Show $item is in the local IA database"

#
# It's possible that the show upload failed before anything was uploaded, even
# the metadata. It's never been seen, but it seems wise to cater for it.
#
# NOTE: this check queries the IA and is active; it costs a network round
# trip on every run.
#
if ! ia metadata "$item" --exists > /dev/null 2>&1; then
    coloured 'red' "This item is not apparently on the IA; can't continue"
    exit 1
fi
_verbose "$(coloured 'yellow' "Show $item is on the IA")"
_log "Show $item is on the IA"
#
# Directory paths
#
FROMPARENTDIR="$BACKUP/public_html/eps"
FROMDIR="$FROMPARENTDIR/$item"
TOPARENTDIR="$REPAIRS/$item"
TOASSETDIR="$TOPARENTDIR/$item"

#
# RE to ignore certain files using 'grep -v -E ...': the show's own audio
# files (e.g. hpr1234.mp3)
#
IGNORE="($item\.(flac|mp3|ogg|opus|spx|wav)$)"

#-------------------------------------------------------------------------------
# Check there are asset files on the backup disk before proceeding. At least
# we need the transcripts. If no files at all we can't continue.
#-------------------------------------------------------------------------------
declare -a BACKUPFILES
mapfile -t BACKUPFILES < \
    <(find "$FROMPARENTDIR" -type f -name "$item*" | grep -v -E "${IGNORE}")

_DEBUG "$(coloured 'purple' "Backup files")" "${BACKUPFILES[@]}"

if [[ ! -d $FROMDIR || ${#BACKUPFILES[@]} -eq 0 ]]; then
    coloured 'red' "No files found in $FROMDIR"
    coloured 'red' "Can't continue!"
    exit 1
fi
_log "Files found on backup disk ${#BACKUPFILES[*]}"

#-------------------------------------------------------------------------------
# Make the needed local cache directory for later
#-------------------------------------------------------------------------------
if [[ $FORCE -ne 1 && -e $TOPARENTDIR ]]; then
    coloured 'red' "Directory $TOPARENTDIR already exists; can't continue".
    coloured 'yellow' 'This implies that all files have been copied already.'
    coloured 'yellow' "If you're sure, consider running: '$REPIT -X -d0 $item'"
    coloured 'yellow' 'Otherwise, consider running again with option -F.'
    exit 1
else
    if [[ $DRYRUN -eq 1 ]]; then
        coloured 'yellow' "Would have created directory $TOPARENTDIR"
    else
        #
        # Use 'make_dir' so that a failure to create the directory stops the
        # script instead of being logged as a success.
        #
        make_dir "$TOASSETDIR"
        _verbose "$(coloured 'yellow' "Created directory $TOASSETDIR")"
        _log "Created directory $TOASSETDIR"
    fi
fi
#-------------------------------------------------------------------------------
# Ask the local database which asset filenames it holds for this show
#-------------------------------------------------------------------------------
declare -a IADBASSETS
SQL="select filename from assets where episode_id = ${item:3}"
readarray -t IADBASSETS < <(sqlite3 -list "$IADB" "$SQL" 2>/dev/null)

_DEBUG "$(coloured 'purple' "SQLite IA DB files")" "${IADBASSETS[@]}"
_log "Files found in ia.db ${#IADBASSETS[@]}"

#-------------------------------------------------------------------------------
# Ask the IA which files it holds for this show. Only files whose source is
# "original" are wanted, leaving out the "Metadata" and "Item Tile" formats,
# and anything matching the IGNORE expression is dropped as well.
#-------------------------------------------------------------------------------
declare -a IAFILES
JQPROG='.files[] | select(.source == "original" and .format != "Metadata" and '\
'.format != "Item Tile") | (.name) | @text'
readarray -t IAFILES < \
    <(ia metadata "$item" | $JQ -r "$JQPROG" | grep -v -E "${IGNORE}")

_DEBUG "$(coloured 'purple' "IA files (originals)")" "${IAFILES[@]}"
_log "Files found on IA (originals) ${#IAFILES[@]}"
#-------------------------------------------------------------------------------
# Work out whether to copy assets from the backup disk, or whether to move
# files on the IA. Whatever we decide we also need to copy transcripts from
# the backup disk and upload to the IA
#-------------------------------------------------------------------------------
declare -a MOVES

#=== FUNCTION ================================================================
# NAME: _find_moves
# DESCRIPTION: Fills the global array MOVES with every entry of IAFILES that
# has no directory component and is exactly equal to an entry of
# IADBASSETS. Such files sit at the top level of the IA item
# and need to be moved to the sub-directory.
# The comparison is a literal, whole-name one: a regular
# expression or substring match would be confused by names
# containing '.', '[' etc., or by a second database entry that
# merely contains the name.
# PARAMETERS: None (reads IAFILES and IADBASSETS, writes MOVES)
# RETURNS: Nothing
#===============================================================================
_find_moves () {
    local iafile dbfile

    MOVES=()
    for iafile in "${IAFILES[@]}"; do
        #
        # Skip IA files with directories
        #
        [[ $iafile == */* ]] && continue

        for dbfile in "${IADBASSETS[@]}"; do
            if [[ $dbfile == "$iafile" ]]; then
                MOVES+=("$iafile")
                break
            fi
        done
    done

    return 0
}

coloured 'purple' "Checking IA files for moves"
_find_moves
#
# If we found any moves then we can move them in the IA item now and copy the
# files from the backup disk to the cache in case we need them. They will
# eventually get deleted by 'cron'.
#
# 'mcount' is initialised unconditionally because it is consulted again after
# this section. It counts the moves made (live) or the moves that would have
# been made (dry-run).
#
mcount=0
if [[ ${#MOVES[@]} -gt 0 ]]; then
    _DEBUG "$(coloured 'purple' "Files to be moved")" "${MOVES[@]}" "----"

    for asset in "${MOVES[@]}"; do
        # source & destination for IA moves
        iafrom="$item/$asset"
        iato="$item/$item/$asset"
        _DEBUG "\$iafrom: $iafrom" "\$iato: $iato" ""

        #
        # If IA source and destination are the same no moves are needed. For
        # the local cache the later 'rsync' will be enough.
        #
        if [[ $iafrom != "$iato" ]]; then
            if [[ $DRYRUN -eq 1 ]]; then
                coloured 'yellow' "ia move $iafrom $iato --no-derive --no-backup"
                coloured 'yellow' "cp $FROMPARENTDIR/$asset $TOASSETDIR/"
                ((mcount++))
            else
                #
                # Perform the move. If the retries are exceeded things get
                # complicated, so just abort so we can try again later.
                #
                _verbose "$(coloured 'blue' "Moving $iafrom → $iato on IA")"
                if _IA_move "$iafrom" "$iato"; then
                    #
                    # Update the cache (but only if the move occurred). A
                    # failed copy is reported but not fatal: the move on the
                    # IA has already happened.
                    #
                    _verbose "$(coloured 'blue' "Copying from backup disk to cache")"
                    if ! cp "$FROMPARENTDIR/$asset" "$TOASSETDIR/"; then
                        coloured 'red' "Failed to copy $FROMPARENTDIR/$asset to $TOASSETDIR/"
                        _log "Failed to copy $FROMPARENTDIR/$asset to $TOASSETDIR/"
                    fi
                    ((mcount++))
                else
                    coloured 'red' "Retries exhausted. Aborting recovery"
                    exit 1
                fi
            fi
        fi
    done

    #
    # Report what was done (or, in dry-run mode, what would have been done)
    #
    if [[ $DRYRUN -eq 1 ]]; then
        moved='Would have moved'
    else
        moved='Moved'
    fi
    coloured 'green' "$moved $mcount $(ngettext file files "$mcount")"
    _log "$moved $mcount $(ngettext file files "$mcount")"
else
    coloured 'yellow' "No moves needed"
    _log "No moves needed"
fi
#
# Nothing to wait for unless some moves were counted. When there were, poll
# the IA once a minute until no tasks are reported for the item (live mode),
# or just say that we would have waited (dry-run mode).
#
if [[ $mcount -gt 0 ]]; then
    if [[ $DRYRUN -eq 0 ]]; then
        until [[ $(queued_tasks "$item") -eq 0 ]]; do
            coloured 'yellow' "Waiting for IA tasks to complete"
            sleep 1m
        done
    else
        coloured 'yellow' "Would have waited for any IA tasks to complete"
    fi
fi
#-------------------------------------------------------------------------------
# Copy files from the backup disk to the cache
#-------------------------------------------------------------------------------
# Options common to the real and the dry-run copy
rsync_opts=(-vaP --exclude=index.html)

if [[ $DRYRUN -eq 1 ]]; then
    coloured 'yellow' "Would have copied files from backup disk → cache"
    rsync -n "${rsync_opts[@]}" "$FROMDIR" "$TOPARENTDIR"
else
    #
    # The cache is the reference for the repair that follows, so a failed
    # copy must not be logged as a success; stop here instead.
    #
    if ! rsync "${rsync_opts[@]}" "$FROMDIR" "$TOPARENTDIR"; then
        coloured 'red' "Failed to copy files from $FROMDIR to $TOPARENTDIR"
        _log "Failed to copy files from $FROMDIR to $TOPARENTDIR"
        exit 1
    fi
    _verbose "$(coloured 'yellow' "Copied files from $FROMDIR")"
    _log "Copied files from $FROMDIR"
fi
# TODO: Is this needed?
#
# Put any source audio in the right place.
#
# if [[ $DRYRUN -eq 1 ]]; then
# coloured 'yellow' "Would have moved source files if found"
# else
# #
# # Turn on 'nullglob' to get an empty result if the glob expression doesn't
# # match.
# #
# NG=$(shopt -p nullglob)
# shopt -s nullglob
#
# #
# # Any source files should be in repairs/hpr1234/ and should go to the IA
# # in the comparable place. We will not put it on the HPR server though.
# #
# # TODO: Is this right?
# movecount=0
# for file in "$TOPARENTDIR"/*_source.*; do
# if mv "$file" "$TOPARENTDIR"; then
# ((movecount++))
# fi
# done
#
# eval "$NG"
#
# #
# # Show the directories after any move
# #
# if [[ $movecount -gt 0 ]]; then
# _verbose "$(coloured 'yellow' "Moved source file(s)")"
# ls -lR "$REPAIRS/$item/"
# fi
#
# fi
#-------------------------------------------------------------------------------
# Hand over to the repair script ("$REPIT"), which is run for this item with
# the options '-X -d0'. In dry-run mode only a notice is printed.
#-------------------------------------------------------------------------------
if ! [[ $DRYRUN -eq 1 ]]; then
    _verbose "$(coloured 'yellow' "Finding and repairing missing files")"
    _log "Finding and repairing missing files (with $REPIT)"
    "$REPIT" -X -d0 "$item"
else
    coloured 'yellow' "Would have found and repaired missing files"
fi
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker