dc0f29e957
Database/query2tt2: comment and documentation updates; use of Perl's try/catch. InternetArchive/.make_metadata.cfg: added comments for readability InternetArchive/make_metadata: bug fix needed now that all shows on the HPR server have a directory with assets under it. InternetArchive/repair_assets: new Bash script in development. Collects assets from the IA and uploads them to a new directory on the HPR server. Will run 'fix_asset_links' (to repair asset links for their new directories) once it is ready. InternetArchive/repair_item: Bash script which was originally written to run on 'borg' and upload files to a new IA item when the uploads timed out. Now enhanced to upload missing files recovered from the HPR backup disk, such as transcripts.
518 lines
16 KiB
Bash
Executable File
518 lines
16 KiB
Bash
Executable File
#!/bin/bash -
|
|
#===============================================================================
|
|
#
|
|
# FILE: repair_item
|
|
#
|
|
# USAGE: ./repair_item [-h] [-v] [-d {0|1}] [-D] [-l N] [-X] itemname
|
|
#
|
|
# DESCRIPTION: Repairs an IA "item" (HPR show) if something has failed during
|
|
# the upload.
|
|
#
|
|
# The most common failures are caused by the file upload
|
|
# processes timing out and being aborted (by the 'ia' tool which
|
|
# performs the item creation and the uploads). This failure
|
|
# means that a show being processed on 'borg' does not get all
|
|
# of the components loaded to the IA.
|
|
#
|
|
# This script looks at the files belonging to the show (stored
|
|
# temporarily on 'borg') and determines which have not been
|
|
# uploaded, then takes steps to perform the uploads.
|
|
#
|
|
# Version 0.0.10 onwards has the capability to repair an IA item
|
|
# from the HPR backup disk. This seems to be necessary because
|
|
# the transcripts were not carried over (although we are
|
|
# adding them to the IA for new shows now, older ones were never
|
|
# copied), and there has been a case where none of the assets
|
|
# were on the IA. The method used is to place the backup files
|
|
# in the directory 'repairs' under the local IA or
|
|
# InternetArchive directory. The files are held in the hierarchy
|
|
# '$item/$item/'. The assets are in the lower directory and the
|
|
# source file is in the upper one. This emulates the placement
|
|
# on the IA itself.
|
|
#
|
|
# OPTIONS: ---
|
|
# REQUIREMENTS: ---
|
|
# BUGS: ---
|
|
# NOTES: ---
|
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
|
# VERSION: 0.0.10
|
|
# CREATED: 2020-01-05 22:42:46
|
|
# REVISION: 2024-07-12 14:39:38
|
|
#
|
|
#===============================================================================
|
|
|
|
#set -o nounset # Treat unset variables as an error
|
|
|
|
VERSION="0.0.10"

SCRIPT=${0##*/}
# DIR=${0%/*}

# Usage text is written to this path (file descriptor 2)
STDOUT="/dev/fd/2"

#
# Choose the working directories that belong to the machine we are running
# on. Only two hosts are catered for; anything else stops the script.
#
host_name=$(hostname)
if [[ $host_name == 'i7-desktop' ]]; then
    BASEDIR="$HOME/HPR/InternetArchive"
    UPLOADS="$HOME/HPR/IA/uploads"
elif [[ $host_name == 'borg' ]]; then
    BASEDIR="$HOME/IA"
    UPLOADS="/data/IA/uploads"
else
    echo "Wrong host!"
    exit 1
fi

# The 'repairs' directory sits under BASEDIR on both hosts
REPAIRS="$BASEDIR/repairs"
|
|
|
|
cd "$BASEDIR" || { echo "Failed to cd to $BASEDIR"; exit 1; }

#
# Load library functions. Helpers used in this script but not defined in it
# (e.g. define_colours, coloured, cleanup_temp, ngettext) are presumably
# provided by this library - confirm against function_lib.sh.
#
LIB="$HOME/bin/function_lib.sh"
# Exit with a failure status; a bare 'exit' would return the status of the
# preceding 'echo' (0) and hide the problem from callers.
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"

#
# Enable coloured messages
#
define_colours

#
# Sanity checks: the external tools we depend on must be on the PATH
#
JQ=$(command -v jq)
[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }
IA=$(command -v ia)
[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; }

#
# Make temporary files and set traps to delete them
#
TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; }
trap 'cleanup_temp "$TMP1"' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
|
|
|
|
|
|
# {{{ -- Functions -- Upload, exists_in, queued_tasks, _DEBUG, _usage
|
|
|
|
#===  FUNCTION  ================================================================
#          NAME: Upload
#   DESCRIPTION: Uploads a file to the Internet Archive with various options.
#                Any output from the 'ia' command invocation is discarded;
#                only its exit status is passed back to the caller.
#    PARAMETERS: 1 - the item id (e.g. 'hpr1234')
#                2 - the path to the file for upload
#                3 - (optional) the path to the file on the IA
#                4 - (optional) list of options for 'ia upload' enclosed as
#                    a string; it is split on whitespace into separate words
#       RETURNS: Exit code of the 'ia' command, or 1 if the local file does
#                not exist
#===============================================================================
Upload () {
    local id=${1}
    local file=${2}
    local remote=${3:-}
    local options=${4:-}

    local -a args extra

    if [[ ! -e $file ]]; then
        echo "File missing: $file"
        return 1
    fi

    #
    # Build the argument list as an array so that the id, the local path and
    # the remote name each stay a single word even if they contain spaces or
    # glob characters.
    #
    args=(upload "$id" "$file")
    if [[ -n $remote ]]; then
        args+=("--remote-name=$remote")
    fi

    #
    # The options arrive as one string and are deliberately split into
    # words. 'read -d ""' returns non-zero at end of input, which is expected
    # and ignored; the array is still filled.
    #
    if [[ -n $options ]]; then
        read -r -d '' -a extra <<< "$options"
        args+=("${extra[@]}")
    fi

    ia "${args[@]}" > /dev/null 2>&1
}
|
|
|
|
#===  FUNCTION  ================================================================
#          NAME: exists_in
#   DESCRIPTION: Reports whether a given key is present in an array. The
#                array is named by the caller, so the lookup is built with
#                'eval'. A key that is set to an empty value still counts as
#                present.
#    PARAMETERS: $1     array name
#                $2     key value
#       RETURNS: True if the key exists, False otherwise
#
# Modified from
# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
#===============================================================================
exists_in () {
    local _exists_in_probe

    #
    # The '+' expansion yields the marker word only when the element is set
    #
    # shellcheck disable=SC2086
    eval '_exists_in_probe=${'$1'[$2]+muahaha}'

    if [ -n "$_exists_in_probe" ]; then
        return 0
    fi
    return 1
}
|
|
|
|
#===  FUNCTION  ================================================================
#          NAME: queued_tasks
#   DESCRIPTION: Asks the IA (via 'ia tasks') about the tasks for an item and
#                uses 'jq' to count the entries whose category is "catalog".
#                The count is written to STDOUT so it can be captured.
#    PARAMETERS: $1     IA item (like hpr1192)
#       RETURNS: Nothing
#===============================================================================
queued_tasks () {
    local ia_item="${1:?Usage: queued_tasks item}"
    local jq_filter='[.[] | if .category == "catalog" then .status else empty end] | length'
    local -i total=0

    # 'jq -s' slurps the stream of task records into one array for counting
    total="$(ia tasks "$ia_item" | jq -s "$jq_filter")"

    printf '%s\n' "$total"
}
|
|
|
|
#===  FUNCTION  ================================================================
#          NAME: _DEBUG
#   DESCRIPTION: Writes each argument on its own line, prefixed with 'D> ',
#                unless the global DEBUG is 0. No global variables are
#                modified.
#    PARAMETERS: List of messages
#       RETURNS: Nothing
#===============================================================================
_DEBUG () {
    [ "$DEBUG" == 0 ] && return

    # With no arguments printf would still emit one empty 'D> ' line
    (( $# > 0 )) || return 0

    # printf repeats the format for every argument, so no loop (and no
    # loop variable leaking into the caller's scope) is needed
    printf 'D> %s\n' "$@"
}
|
|
|
|
#===  FUNCTION  ================================================================
#          NAME: _usage
#   DESCRIPTION: Reports usage; always exits the script after doing so. The
#                text is written to the path held in the global STDOUT and
#                uses the globals SCRIPT, VERSION and DEFLIMIT.
#    PARAMETERS: 1 - the integer to pass to the 'exit' command (default 0)
#       RETURNS: Nothing
#===============================================================================
_usage () {
    local -i result=${1:-0}

    cat >"$STDOUT" <<-endusage
${SCRIPT} - version: ${VERSION}

Usage: ./${SCRIPT} [-h] [-v] [-d {0|1}] [-D] [-l N] [-X] item

Attempts to repair an IA item where the upload has failed for some reason.

Options:
  -h                    Print this help
  -v                    Run in verbose mode where more information is
                        reported. Default is off.
  -d 0|1                Dry run: -d 1 (the default) runs the script in dry-run
                        mode where nothing is changed but the actions that
                        will be taken are reported; -d 0 turns off dry-run
                        mode and the actions will be carried out.
  -D                    Run in debug mode where a lot more information is
                        reported
  -l N                  Control the number of files that can be uploaded
                        during one run of the script. The range is 1 to
                        $DEFLIMIT. This can be helpful when there are upload
                        problems.
  -X                    Run in "extended" mode. In this mode the directory
                        holding files to be added to the IA is '~/IA/repairs'
                        and the files have most likely come from the HPR
                        backup disk and aren't on the IA due to some error. We
                        want to use the capabilities of ${SCRIPT} to repair
                        things and deal with the IA upload problems.

Arguments:
    item                The item in the form 'hpr1234'

endusage
    exit "$result"
}
|
|
|
|
# }}}
|
|
|
|
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
|
|
#
# Directories and files
#
LOGS="$BASEDIR/logs"
LOGFILE="$LOGS/$SCRIPT.log"

#
# Constants
#
DEFLIMIT=20

#
# Process options. The leading ':' in the option string selects silent error
# reporting: a missing argument arrives as ':' and an unknown option as '?',
# with the offending option letter in OPTARG in both cases.
#
while getopts :d:Dhl:vX opt
do
    case "${opt}" in
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        h) _usage 0;;
        l) LIMIT=$OPTARG;;
        v) VERBOSE=1;;
        X) EXTENDED=1;;
        :) echo "** Option -$OPTARG requires an argument"
            _usage 1;;
        *) echo "** Unknown option -$OPTARG"
            _usage 1;;
    esac
done
shift $((OPTIND - 1))
|
|
|
|
#
# Set option defaults and check their values
#
VERBOSE=${VERBOSE:-0}

#
# Dry-run is on unless explicitly switched off. The value is checked as
# a string: an arithmetic test would treat any non-numeric word (e.g.
# '-d foo') as 0 and silently turn dry-run mode off.
#
DRYRUN=${DRYRUN:-1}
if [[ ! $DRYRUN =~ ^[01]$ ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $VERBOSE -eq 1 && $DRYRUN -eq 1 ]] && echo "Dry run mode"

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"

#
# The limit must be a plain decimal number in the range 1..DEFLIMIT. The
# '10#' prefix stops values with leading zeroes being read as octal.
#
LIMIT=${LIMIT:-$DEFLIMIT}
if [[ ! $LIMIT =~ ^[0-9]+$ ]] || (( 10#$LIMIT < 1 || 10#$LIMIT > DEFLIMIT )); then
    echo "** Use '-l 1' up to '-l $DEFLIMIT' or omit the option"
    _usage 1
fi
LIMIT=$((10#$LIMIT))

EXTENDED=${EXTENDED:-0}
|
|
|
|
#
# Should have one argument
#
if [[ $# != 1 ]]; then
    coloured 'red' "Missing argument"
    _usage 1
fi
item="${1}"

#
# Ensure item spec is correctly formatted: 'hpr' followed by a number which
# is zero-padded to four digits. The '10#' prefix forces decimal
# interpretation; without it 'printf %d' reads a zero-padded number as octal,
# so 'hpr0012' would become 'hpr0010' and 'hpr0089' would be an error.
#
if [[ $item =~ hpr([0-9]+) ]]; then
    printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))"
else
    coloured 'red' "Incorrect show specification: $item"
    coloured 'yellow' "Use 'hpr9999' format"
    exit 1
fi
_DEBUG "Parsed item: $item"
|
|
|
|
#
# The upload could have failed before anything at all reached the IA, the
# metadata included. That has never been seen, but we check for it anyway
# and stop if the item is not there.
#
ia metadata "$item" --exists > /dev/null 2>&1 || {
    coloured 'red' "This item is not apparently on the IA; can't continue"
    coloured 'yellow' "Try running the entire upload again from the start"
    exit 1
}

#
# The -X (EXTENDED) mode is for when we have to upload files that have
# mysteriously vanished from the IA. The directories here are equivalent to
# those used by 'repair_assets'. There is a top-level directory that
# represents the IA item, and below that a hierarchy defining placement under
# the item. There is a 'repairs' directory per host in case we need to repair
# IA stuff from elsewhere.
#
if [[ $EXTENDED -eq 1 ]]; then
    coloured 'cyan' "Using 'Extended' mode"

    # Create the 'repairs' directory on first use
    [[ -e $REPAIRS ]] || mkdir -p "$REPAIRS"

    # From here on files are collected from the item's repair directory
    UPLOADS="$REPAIRS/$item"
fi
|
|
|
|
#
# Declarations
#
declare -A fcache       # files found locally, keyed by path relative to UPLOADS
declare -A iacache      # files the IA reports for the item
declare -a missed       # local files that are not on the IA

#
# Scan the directory 'UPLOADS' where files for upload to the IA are stored and
# collect everything for this item (show).
#
# See the `find' pipeline at the end of the loop which selects only files, not
# directories. It outputs the last change time and the full file path, sorts
# on the time, then removes it. This ensures we process the files in time
# order rather than alphabetic order of their names. 'cut -f2-' keeps the
# whole of the path after the timestamp, so paths containing spaces survive.
#
# TODO: This algorithm is from another script and is not needed here. The
# order of processing is irrelevant here so simplify the 'find' and the loop.
# We are only looking for the 'item' specified by the argument, not other
# ones.
#
# The global 'item' is left alone in the loop: it holds the validated show id
# that the rest of the script depends on. The first seven characters of the
# relative path are shown for debugging only.
#
while IFS= read -r path; do
    relpath="${path#"$UPLOADS"/}"

    [[ $VERBOSE -eq 1 ]] && echo "Found $path"

    _DEBUG "Path: $path"
    _DEBUG "Relative path: $relpath"
    _DEBUG "IA item: ${relpath:0:7}"

    # Assigning to an existing key is harmless, so no existence test is needed
    # shellcheck disable=SC2034
    fcache[$relpath]=1
done < <(find "$UPLOADS" -type f -regextype posix-extended \
    -regex ".*$item.*" -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2- -d' ')

#
# Did we find anything?
#
if [[ ${#fcache[@]} -eq 0 ]]; then
    coloured 'red' "No files found for item $item in $UPLOADS"
    coloured 'red' "Can't continue"
    exit 1
fi
|
|
|
|
#
# Ask the IA whether any tasks are still outstanding for this show. While
# there are some we must not go any further.
#
# TODO: This could be a loop waiting for tasks to complete rather than
# aborting and asking to be rerun.
#
tasks=$(queued_tasks "$item")
[[ $tasks -gt 0 ]] && {
    coloured 'red' \
        "Item $item still has $tasks unfinished $(ngettext task tasks "$tasks")"
    coloured 'red' "Allow time for task(s) to finish and try again later"
    exit 1
}

#
# Fetch the list of files the IA holds for the item into the file 'TMP1'. If
# the listing fails we can't proceed; otherwise every name in the list is
# recorded as a key in 'iacache'.
#
if ! ia list "$item" > "$TMP1"; then
    coloured 'red' "Item $item is not in the IA"
    coloured 'red' "Can't continue"
    exit 1
fi

while read -r iafile; do
    # shellcheck disable=SC2034
    iacache[$iafile]=1
done < "$TMP1"
|
|
|
|
#
# Look through the list of files we found and detect any not on the IA
#
for path in "${!fcache[@]}"; do
    if ! exists_in iacache "$path"; then
        missed+=("$path")
    fi
done

#
# Counters and defaults for the loop
#
retry_threshold=5
sleeptime=20
failures=0
upload_count=0

#
# Options handed to 'ia upload' (through the Upload function) for every file
#
upload_opts='--retries=5 --no-derive -H x-archive-keep-old-version:0'

#
# If there are missed files we can report what we'd be doing or do it,
# otherwise we have nothing to do.
#
if [[ ${#missed[@]} -eq 0 ]]; then
    coloured 'green' "All expected files for item $item are on the IA"
else
    mcount="${#missed[@]}"
    coloured 'red' "There $(ngettext "is 1 missing file" "are $mcount missing files" "$mcount"):"

    [[ $DRYRUN -eq 1 ]] && {
        coloured 'blue' "Dry run: Would have run the following command(s):"
    }

    for file in "${missed[@]}"; do
        if [[ $DRYRUN -eq 1 ]]; then
            #
            # Only show what would be run
            #
            coloured 'yellow' "Upload $item $UPLOADS/$file '$file' '$upload_opts'"
        else
            retries=0

            coloured 'blue' "Uploading $file"

            #
            # Call Upload directly with properly quoted arguments (no 'eval'
            # of a command string, which breaks on file names containing
            # spaces or quotes). If it succeeds then write to the log and
            # loop for the next missing file. If it fails enter the 'until'
            # loop and report the problem. Count the number of times this is
            # done, so it doesn't loop forever. If we have reached the limit
            # count this as a failure and continue the parent loop (with the
            # next missing file). If we haven't retried enough yet, sleep for
            # a while and try again. The intention is to catch the case when
            # an upload times out. The 'ia' command is performing its own
            # retries per upload when the system is overloaded, but these are
            # non-fatal.
            #
            until Upload "$item" "$UPLOADS/$file" "$file" "$upload_opts"; do
                coloured 'red' "Failure when uploading $file"
                ((retries++))

                printf '%s Failed to upload %s to the IA [%d]\n' \
                    "$(date +%Y%m%d%H%M%S)" "$file" "$retries" >> "$LOGFILE"

                [ "$retries" -eq "$retry_threshold" ] && {
                    ((failures++))
                    [[ $VERBOSE -eq 1 ]] && \
                        coloured 'blue' "Retry limit reached; abandoning this file"
                    continue 2
                }

                [[ $VERBOSE -eq 1 ]] && coloured 'blue' "Pausing for $sleeptime and retrying"
                sleep $sleeptime
            done # until Upload ...

            coloured 'green' "Uploaded $file to the IA"
            echo "$(date +%Y%m%d%H%M%S) Uploaded $file to the IA" >> "$LOGFILE"
        fi

        #
        # Count actual uploads and dry-run ones the same
        #
        ((upload_count++))

        #
        # Stop the missed file loop if we have reached the limiting number, in
        # dry-run and live mode
        #
        [[ $upload_count -eq $LIMIT ]] && {
            coloured 'blue' "Upload limit ($LIMIT) reached"
            break
        }

    done # for file in ...

fi

#
# Summarise how many upload failures were detected
#
if [[ $failures -gt 0 ]]; then
    coloured 'red' \
        "There $(ngettext "was $failures upload failure" "were $failures upload failures" "$failures")"
    coloured 'yellow' 'Run this script again to repeat the repair attempt'
fi
|
|
|
|
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker
|