hpr-tools/InternetArchive/recover_transcripts

591 lines
19 KiB
Plaintext
Raw Normal View History

#!/bin/bash -
#===============================================================================
#
# FILE: recover_transcripts
#
# USAGE: ./recover_transcripts [-h] [-d 0|1] [-D] [-F] [-v] item
#
# DESCRIPTION: Intended to be run on `borg`; collects assets from the
# locally-mounted backup disk and places them in a local
# directory (organised to be compatible with the IA), then
# uploads anything that is missing on the IA.
#
# Version 0.1.* looks for assets in the 'eps/' directory and
# copies them to the cache. Also moves the IA copies so all is
# aligned. Many shows earlier than mid 2019 are likely to need
# this addition.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.1.4
# CREATED: 2024-07-14 13:22:58
# REVISION: 2024-08-20 17:38:19
#
#===============================================================================
# set -o nounset # Treat unset variables as an error
VERSION="0.1.4"
SCRIPT=${0##*/}
# DIR=${0%/*}
# NOTE: despite its name this is file descriptor 2 (standard error); it is the
# destination '_usage' writes the help text to.
STDOUT="/dev/fd/2"

#
# Select the appropriate working directory for the host. Only 'borg' is
# allowed; every other host is rejected before anything else happens.
#
case $(hostname) in
    i7-desktop)
        echo "To be run only on 'borg'"
        exit 1
        ;;
    borg)
        BASEDIR="$HOME/IA"
        REPAIRS="$BASEDIR/repairs"
        BACKUP="/mnt/backup_disk/HPR/HPR-MIRROR"
        ;;
    *)
        echo "Wrong host!"
        exit 1
        ;;
esac
cd "$BASEDIR" || { echo "Failed to cd to $BASEDIR"; exit 1; }

#
# Load library functions (presumably the source of 'coloured', '_DEBUG' and
# 'define_colours', which are not defined in this file - confirm).
# A missing library is a failure, so exit with a non-zero status.
#
LIB="$HOME/bin/function_lib.sh"
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"

#
# Enable coloured messages
#
define_colours

#
# Sanity checks: external programs, the helper script and the database that
# the rest of the script relies on.
#
JQ=$(command -v jq)
[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }
IA=$(command -v ia)
[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; }
# 'sqlite3' is run later with its error output discarded, so a missing binary
# would otherwise be misreported as a missing show.
command -v sqlite3 > /dev/null || { echo "Program 'sqlite3' was not found"; exit 1; }
REPIT="$BASEDIR/repair_item"
[ -e "$REPIT" ] || { echo "Program '$REPIT' was not found"; exit 1; }
IADB="$BASEDIR/ia.db"
[ -e "$IADB" ] || { echo "Database '$IADB' was not found"; exit 1; }
# {{{ -- Functions -- _IA_move, queued_tasks, _verbose, _usage
#=== FUNCTION ================================================================
# NAME: _IA_move
# DESCRIPTION: Performs a file move on the IA, with retries if it fails.
#              Assumes the existence of functions 'coloured', '_log',
#              '_verbose' and '_DEBUG'
# PARAMETERS: $1 The path to move from (as 'item/path')
#             $2 The path to move to (as 'item/path')
# RETURNS: False if the number of retries is exceeded, otherwise true.
#===============================================================================
_IA_move () {
    local from="${1:?Usage: _IA_move from to}"
    local to="${2:?Usage: _IA_move from to}"
    local -i retry_threshold=5
    local -i retries=0
    local sleeptime=20

    # For debugging only: a printable rendering of the command run below
    _DEBUG "ia move \"$from\" \"$to\" --no-derive --no-backup > /dev/null 2>&1"

    #
    # Run 'ia move' directly (no 'eval', so quotes, '$' and backticks in file
    # names reach 'ia' verbatim). If it succeeds the loop is skipped. If it
    # fails, report the problem and count the attempt so we never loop
    # forever. Once the retry limit is reached treat it as a failure and
    # return an error; otherwise sleep for a while and try again. The
    # intention is to catch the case when the request times out.
    #
    until ia move "$from" "$to" --no-derive --no-backup > /dev/null 2>&1; do
        coloured 'red' "Failure when moving $from to $to"
        ((retries++))
        _log "$(printf 'Failed to move %s to %s [%d]' "$from" "$to" "$retries")"

        if [ "$retries" -ge "$retry_threshold" ]; then
            _verbose \
                "$(coloured 'red' "Retry limit reached; abandoning this move")"
            return 1
        fi

        _verbose "$(coloured 'blue' "Pausing for $sleeptime seconds and retrying")"
        sleep "$sleeptime"
    done

    coloured 'green' "Moved $from to $to on the IA"
    _log "Moved $from to $to on the IA"

    # Explicit success: a logging hiccup must not be reported as a failed move
    return 0
}
#=== FUNCTION ================================================================
# NAME: queued_tasks
# DESCRIPTION: Asks the IA (via 'ia tasks') for the tasks belonging to an item
#              and counts those whose category is "catalog". The count is
#              written to STDOUT so the caller can capture it.
# PARAMETERS: $1 IA item (like hpr1192)
# RETURNS: Nothing
#===============================================================================
queued_tasks () {
    local item="${1:?Usage: queued_tasks item}"
    # Slurp all task records, keep those in the "catalog" category, count them
    local filter='[.[] | if .category == "catalog" then .status else empty end] | length'
    # Integer attribute: an empty result from the pipeline becomes 0
    local -i pending=0

    pending="$(ia tasks "$item" | jq -s "$filter")"

    echo "$pending"
    return
}
#=== FUNCTION ================================================================
# NAME: make_dir
# DESCRIPTION: Ensures a directory exists, creating it (and any missing
#              parents) when necessary. Reports and gives up if it can't.
# PARAMETERS: $1 directory path
# RETURNS: True if success, otherwise exits the caller script
#===============================================================================
make_dir () {
    local target="${1}"

    # Nothing to do when the directory is already there
    [[ -d $target ]] && return 0

    if ! mkdir -p "$target"; then
        coloured 'red' "Failed to create $target"
        exit 1
    fi
}
#=== FUNCTION ================================================================
# NAME: _ifbool
# DESCRIPTION: Simplifies conditional expressions when they need to return
#              one of two strings. Use as:
#              echo "Hello $(_ifbool 1 'World' 'Everyone')" → "Hello World"
# PARAMETERS: $1 Integer being tested. If 1 then it's true, otherwise
#                it's false. Non-numeric is treated as 0/false.
#             $2 String returned for True (default 'true')
#             $3 String returned for False (default 'false')
# RETURNS: Nothing
#===============================================================================
_ifbool () {
    # The integer attribute normalises the first argument to a number
    local -i _flag="${1:-0}"
    local _yes="${2:-true}" _no="${3:-false}"

    case "$_flag" in
        1) echo "$_yes" ;;
        *) echo "$_no" ;;
    esac
    return
}
#=== FUNCTION ================================================================
# NAME: _log
# DESCRIPTION: Appends a timestamped record ('YYYY-MM-DD HH:MM:SS message')
#              to the file "$LOGFILE"
# PARAMETERS: $1 Message to write
# RETURNS: Nothing
#===============================================================================
_log () {
    local entry="${1}"
    local stamp

    stamp="$(date '+%F %T')"
    printf '%s %s\n' "$stamp" "$entry" >> "$LOGFILE"
}
#=== FUNCTION ================================================================
# NAME: _verbose
# DESCRIPTION: Writes messages, one per line, but only in verbose mode, i.e.
#              when the global "$VERBOSE" is non-zero. An unset or empty
#              "$VERBOSE" counts as verbose mode being off.
# PARAMETERS: * message strings to write
# RETURNS: Nothing
#===============================================================================
_verbose () {
    # Keep the loop variable local so the caller's globals are not clobbered
    local msg

    [ "${VERBOSE:-0}" -eq 0 ] && return 0

    for msg; do
        printf '%s\n' "$msg"
    done
    return 0
}
#=== FUNCTION ================================================================
# NAME: _usage
# DESCRIPTION: Reports usage; always exits the script after doing so. The
#              text is written to the destination named by "$STDOUT".
# PARAMETERS: 1 - the integer to pass to the 'exit' command (default 0)
# RETURNS: Nothing
#===============================================================================
_usage () {
    local -i result=${1:-0}

    # The here-document body and its terminator are kept at column 0 so the
    # text is emitted exactly as written.
    cat >"$STDOUT" <<-endusage
${SCRIPT} - version: ${VERSION}
Usage: ./${SCRIPT} [-h] [-d 0|1] [-D] [-F] [-v] item
Attempts to repair an IA item where the upload has failed for some reason.
Options:
-h Print this help.
-d 0|1 Dry run: -d 1 (the default) runs the script in dry-run
mode where nothing is changed but the actions that
will be taken are reported; -d 0 turns off dry-run
mode and the actions will be carried out.
-D Run in debug mode where a lot more information is
reported.
-F Ignore (some) interlocks that will cause failure, such
as the existence of the local cache directory for the
item being processed.
-v Run in verbose mode where more information is
reported. Default is off.
Arguments:
item The item in the form 'hpr1234'
endusage
    exit "$result"
}
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#-------------------------------------------------------------------------------
# Directories and files
#-------------------------------------------------------------------------------
LOGS="$BASEDIR/logs"
make_dir "${LOGS}"
LOGFILE="$LOGS/$SCRIPT.log"

#
# Option parsing. The leading ':' in the option string selects silent error
# reporting, so a missing option argument arrives as ':' and an unknown
# option as '?'.
#
while getopts :d:DFhv opt
do
    case "${opt}" in
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        F) FORCE=1;;
        h) _usage 0;;
        v) VERBOSE=1;;
        :) echo "** Option -$OPTARG requires an argument"
           _usage 1;;
        *) echo "** Unknown option"
           _usage 1;;
    esac
done
shift $((OPTIND - 1))

#
# Set option defaults and check their values
#
DRYRUN=${DRYRUN:-1}
# Compare as strings: an arithmetic test would evaluate a word such as 'yes'
# to 0 and silently switch dry-run mode off.
if [[ $DRYRUN != 0 && $DRYRUN != 1 ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $DRYRUN -eq 1 ]] && echo "Dry run mode"

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"

FORCE=${FORCE:-0}

VERBOSE=${VERBOSE:-0}
#
# Should have one argument
#
Updates for missing asset "repair" InternetArchive/recover_transcripts: Bash script to be run on 'borg' which collects files missing on the IA ready for upload as part of the missing asset repair process. InternetArchive/repair_assets: Bash script to take assets from the IA (after they had been repaired on 'borg') and copy them to the HPR server for the notes to access. The local machine, where this was run, was used to store files being uploaded. The planned script to modify the notes to reflect the new file locations was never finished. Notes were edited with Vim using a few macros. InternetArchive/repair_item: Bash script which is best run on 'borg', which repairs an IA item by comparing the files on the IA with the files on 'borg' (or a local machine). These files are either in '/data/IA/uploads/' or in the temporary file hierarchy used by 'recover_transcripts' (which calls it). Used after a normal IA upload to check for and make good any missed file uploads (due to timeouts, etc). Also used during asset repairs, but that project is now finished. InternetArchive/snapshot_metadata: Bash script which collects detailed metadata from the IA in JSON format and saves it locally (run on a local PC). Older shows on the IA often contained derivative files which were identified by the script 'view_derivatives'. These files were never needed, they were IA artefacts, so can be deleted (see the script header for how). InternetArchive/view_derivatives: Perl script to interpret a file of JSON metadata from the IA for an HPR show in order to determine the parent-child hierarchy of files where there may be derivatives. We don't want IA-generated derivatives, but this process was hard to turn off in earlier times. Generates a hierarchical report and a list of unwanted derivatives (see 'snapshot_metadata' for more details of how this was used).
2024-11-23 22:28:52 +00:00
# Exactly one positional argument (the item) must remain after the options
(( $# == 1 )) || {
    coloured 'red' "Missing argument"
    _usage 1
}
item="${1}"

#
# Ensure item spec is correctly formatted: pick out the digits following
# 'hpr' and rebuild the name with the number zero-padded to four digits.
# The '10#' prefix forces base 10 so leading zeroes aren't read as octal.
#
if [[ ! $item =~ hpr([0-9]+) ]]; then
    coloured 'red' "Incorrect show specification: $item"
    coloured 'yellow' "Use 'hpr9999' format"
    exit 1
fi
printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))"

_DEBUG "Parsed item: $item"
_log "$SCRIPT $VERSION ($(_ifbool "$DRYRUN" 'dry-run' 'live'))"
#
# The show must have a row in the 'episodes' table of 'ia.db'; without one we
# stop here. The numeric part of the item (after 'hpr') is the episode id.
#
SQL="select 1 from episodes where id = ${item:3}"
[[ $(sqlite3 -list "$IADB" "$SQL" 2>/dev/null) -ne 1 ]] && {
    coloured 'red' "Unable to find show $item in the local IA database"
    coloured 'yellow' "Can't continue"
    exit 1
}
_verbose "$(coloured 'yellow' "Show $item is in the local IA database")"
_log "Show $item is in the local IA database"

#
# It's possible that the show upload failed before anything was uploaded, even
# the metadata. It's never been seen, but it seems wise to cater for it, so
# ask the IA whether the item exists at all.
#
ia metadata "$item" --exists > /dev/null 2>&1 || {
    coloured 'red' "This item is not apparently on the IA; can't continue"
    exit 1
}
_verbose "$(coloured 'yellow' "Show $item is on the IA")"
_log "Show $item is on the IA"
#
# Directory paths: where the files come from on the backup disk and where
# they go in the local cache
#
FROMPARENTDIR="$BACKUP/public_html/eps"
FROMDIR="$FROMPARENTDIR/$item"
TOPARENTDIR="$REPAIRS/$item"
TOASSETDIR="$TOPARENTDIR/$item"

#
# RE to ignore certain files using 'grep -v -E ...' (the show's own audio
# files in the listed formats)
#
IGNORE="($item\.(flac|mp3|ogg|opus|spx|wav)$)"

#-------------------------------------------------------------------------------
# Check there are asset files on the backup disk before proceeding. At least
# we need the transcripts. If no files at all we can't continue.
#-------------------------------------------------------------------------------
declare -a BACKUPFILES
mapfile -t BACKUPFILES < <(
    find "$FROMPARENTDIR" -type f -name "$item*" | grep -v -E "${IGNORE}"
)
_DEBUG "$(coloured 'purple' "Backup files")" "${BACKUPFILES[@]}"

if [[ -d $FROMDIR && ${#BACKUPFILES[@]} -gt 0 ]]; then
    _log "Files found on backup disk ${#BACKUPFILES[*]}"
else
    coloured 'red' "No files found in $FROMDIR"
    coloured 'red' "Can't continue!"
    exit 1
fi
#-------------------------------------------------------------------------------
# Make the needed local cache directory for later. An existing cache directory
# is treated as an interlock unless -F (force) was given.
#-------------------------------------------------------------------------------
if [[ $FORCE -ne 1 && -e $TOPARENTDIR ]]; then
    coloured 'red' "Directory $TOPARENTDIR already exists; can't continue."
    coloured 'yellow' 'This implies that all files have been copied already.'
    coloured 'yellow' "If you're sure, consider running: '$REPIT -X -d0 $item'"
    coloured 'yellow' 'Otherwise, consider running again with option -F.'
    exit 1
fi

if [[ $DRYRUN -eq 1 ]]; then
    coloured 'yellow' "Would have created directory $TOPARENTDIR"
else
    # 'make_dir' reports the problem and exits if the directory can't be
    # created, so success is only logged when it really happened.
    make_dir "$TOASSETDIR"
    _verbose "$(coloured 'yellow' "Created directory $TOASSETDIR")"
    _log "Created directory $TOASSETDIR"
fi
#-------------------------------------------------------------------------------
# Collect asset data from the database: the file names recorded against this
# episode in the 'assets' table
#-------------------------------------------------------------------------------
declare -a IADBASSETS
SQL="select filename from assets where episode_id = ${item:3}"
mapfile -t IADBASSETS < <(sqlite3 -list "$IADB" "$SQL" 2>/dev/null)
_DEBUG "$(coloured 'purple' "SQLite IA DB files")" "${IADBASSETS[@]}"
_log "Files found in ia.db ${#IADBASSETS[*]}"

#-------------------------------------------------------------------------------
# Collect IA data, only original files generated by HPR. The jq filter keeps
# files whose source is "original" and drops the "Metadata" and "Item Tile"
# formats; the audio files are then removed with the $IGNORE expression.
#-------------------------------------------------------------------------------
declare -a IAFILES
JQPROG='.files[] | select(.source == "original" and .format != "Metadata" and .format != "Item Tile") | (.name) | @text'
mapfile -t IAFILES < <(
    ia metadata "$item" | $JQ -r "$JQPROG" | grep -v -E "${IGNORE}"
)
_DEBUG "$(coloured 'purple' "IA files (originals)")" "${IAFILES[@]}"
_log "Files found on IA (originals) ${#IAFILES[*]}"
#-------------------------------------------------------------------------------
# Work out whether to copy assets from the backup disk, or whether to move
# files on the IA. Whatever we decide we also need to copy transcripts from
# the backup disk and upload to the IA
#-------------------------------------------------------------------------------
declare -a MOVES=()
coloured 'purple' "Checking IA files for moves"

#=== FUNCTION ================================================================
# NAME: _asset_in_list
# DESCRIPTION: Tests whether a name occurs, exactly, in a list of names.
#              Plain string comparison is used so regular-expression
#              characters, partial matches and leading dashes in file
#              names cannot confuse the test.
# PARAMETERS: $1 the name to look for
#             $2... the names to search
# RETURNS: True if the name is in the list, otherwise false
#===============================================================================
_asset_in_list () {
    local wanted="${1?Usage: _asset_in_list name [names...]}"
    local candidate
    shift

    for candidate in "$@"; do
        [[ $candidate == "$wanted" ]] && return 0
    done
    return 1
}

#
# Look at every file the IA holds at the top level of the item. If the same
# name is recorded as an asset in the $IADB database then the file needs to be
# moved to the sub-directory, so remember it.
#
for asset in "${IAFILES[@]}"; do
    # Nothing to compare
    [[ -n $asset ]] || continue

    #
    # Skip IA files with directories
    #
    [[ $asset == */* ]] && continue

    if _asset_in_list "$asset" "${IADBASSETS[@]}"; then
        MOVES+=("$asset")
    fi
done
#
# Any files found above are moved within the IA item now, and the matching
# files are copied from the backup disk to the cache in case we need them.
# They will eventually get deleted by 'cron'.
#
if [[ ${#MOVES[@]} -eq 0 ]]; then
    coloured 'yellow' "No moves needed"
    _log "No moves needed"
else
    _DEBUG "$(coloured 'purple' "Files to be moved")" "${MOVES[@]}" "----"

    mcount=0
    for asset in "${MOVES[@]}"; do
        # source & destination for IA moves
        iafrom="$item/$asset"
        iato="$item/$item/$asset"
        _DEBUG "\$iafrom: $iafrom" "\$iato: $iato" ""

        #
        # If IA source and destination are the same no moves are needed. For
        # the local cache the later 'rsync' will be enough.
        #
        [[ $iafrom == "$iato" ]] && continue

        # In dry-run mode just show the commands that would have been run
        if [[ $DRYRUN -eq 1 ]]; then
            coloured 'yellow' "ia move $iafrom $iato --no-derive --no-backup"
            coloured 'yellow' "cp $FROMPARENTDIR/$asset $TOASSETDIR/"
            continue
        fi

        #
        # Perform the move. If the retries are exceeded things get
        # complicated, so just abort so we can try again later.
        #
        _verbose "$(coloured 'blue' "Moving $iafrom → $iato on IA")"
        if ! _IA_move "$iafrom" "$iato"; then
            coloured 'red' "Retries exhausted. Aborting recovery"
            exit 1
        fi

        #
        # Update the cache (we only get here if the move occurred)
        #
        _verbose "$(coloured 'blue' "Copying from backup disk to cache")"
        cp "$FROMPARENTDIR/$asset" "$TOASSETDIR/"
        ((mcount++))
    done

    #
    # Report what was done
    #
    coloured 'green' "Moved $mcount $(ngettext file files "$mcount")"
    _log "Moved $mcount $(ngettext file files "$mcount")"
fi
#
# Wait for the IA moves to finish. Only relevant when at least one move was
# counted; in live mode poll once a minute until 'queued_tasks' reports zero.
#
if [[ $mcount -gt 0 ]]; then
    if [[ $DRYRUN -eq 0 ]]; then
        until [[ $(queued_tasks "$item") -eq 0 ]]; do
            coloured 'yellow' "Waiting for IA tasks to complete"
            sleep 1m
        done
    else
        coloured 'yellow' "Would have waited for any IA tasks to complete"
    fi
fi
#-------------------------------------------------------------------------------
# Copy files from the backup disk to the cache. In dry-run mode 'rsync -n'
# only lists what would be transferred. In live mode a failed copy stops the
# script, since the cache is used afterwards as the reference for what is
# uploaded to the IA.
#-------------------------------------------------------------------------------
if [[ $DRYRUN -eq 1 ]]; then
    coloured 'yellow' "Would have copied files from backup disk → cache"
    rsync -n -vaP --exclude=index.html "$FROMDIR" "$TOPARENTDIR"
else
    if ! rsync -vaP --exclude=index.html "$FROMDIR" "$TOPARENTDIR"; then
        coloured 'red' "Failed to copy files from $FROMDIR to $TOPARENTDIR"
        _log "Failed to copy files from $FROMDIR to $TOPARENTDIR"
        exit 1
    fi
    _verbose "$(coloured 'yellow' "Copied files from $FROMDIR")"
    _log "Copied files from $FROMDIR"
fi
# TODO: Is this needed?
#
# Put any source audio in the right place.
#
# if [[ $DRYRUN -eq 1 ]]; then
# coloured 'yellow' "Would have moved source files if found"
# else
# #
# # Turn on 'nullglob' to get an empty result if the glob expression doesn't
# # match.
# #
# NG=$(shopt -p nullglob)
# shopt -s nullglob
#
# #
# # Any source files should be in repairs/hpr1234/ and should go to the IA
# # in the comparable place. We will not put it on the HPR server though.
# #
# # TODO: Is this right?
# movecount=0
# for file in "$TOPARENTDIR"/*_source.*; do
# if mv "$file" "$TOPARENTDIR"; then
# ((movecount++))
# fi
# done
#
# eval "$NG"
#
# #
# # Show the directories after any move
# #
# if [[ $movecount -gt 0 ]]; then
# _verbose "$(coloured 'yellow' "Moved source file(s)")"
# ls -lR "$REPAIRS/$item/"
# fi
#
# fi
#-------------------------------------------------------------------------------
# Using the cache as the reference upload whatever is missing to the IA. The
# work is handed to the 'repair_item' script ($REPIT), run live (-d0).
#-------------------------------------------------------------------------------
if [[ $DRYRUN -ne 1 ]]; then
    _verbose "$(coloured 'yellow' "Finding and repairing missing files")"
    _log "Finding and repairing missing files (with $REPIT)"
    "$REPIT" -X -d0 "$item"
else
    coloured 'yellow' "Would have found and repaired missing files"
fi
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker