hpr-tools/InternetArchive/repair_item

454 lines
13 KiB
Plaintext
Raw Normal View History

#!/bin/bash -
#===============================================================================
#
# FILE: repair_item
#
# USAGE: ./repair_item [-h] [-v] [-d {0|1}] [-D] [-l N] itemname
#
# DESCRIPTION: Repairs an IA "item" (HPR show) if something has failed during
# the upload.
#
# The most common failures are caused by the file upload
# processes timing out and being aborted (by the 'ia' tool which
# performs the item creation and the uploads). This failure
# means that a show being processed on 'borg' does not get all
# of the components loaded to the IA.
#
# This script looks at the files belonging to the show (stored
# temporarily on 'borg') and determines which have not been
# uploaded, then takes steps to perform the uploads.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.6
# CREATED: 2020-01-05 22:42:46
# REVISION: 2024-05-10 12:39:52
#
#===============================================================================
#set -o nounset                              # Treat unset variables as an error

VERSION="0.0.6"

SCRIPT=${0##*/}
# DIR=${0%/*}

# NOTE: despite its name this refers to file descriptor 2 (standard error);
# it is the destination used by '_usage' for the help text.
STDOUT="/dev/fd/2"

#
# Select the appropriate working directory for the host. The script refuses
# to run on any host it does not know about.
#
case $(hostname) in
    i7-desktop)
        BASEDIR="$HOME/HPR/InternetArchive"
        UPLOADS="$HOME/HPR/IA/uploads"
        ;;
    borg)
        BASEDIR="$HOME/IA"
        UPLOADS="/data/IA/uploads"
        ;;
    *)
        echo "Wrong host!"
        exit 1
        ;;
esac

cd "$BASEDIR" || { echo "Failed to cd to $BASEDIR"; exit 1; }

#
# Load library functions. A bare 'exit' here would return the status of the
# preceding 'echo' (0), hiding the failure from callers, so exit with 1.
#
LIB="$HOME/bin/function_lib.sh"
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"

#
# Enable coloured messages
#
define_colours

#
# Sanity checks: both external tools must be on the PATH
#
JQ=$(command -v jq)
[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }
IA=$(command -v ia)
[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; }

#
# Make temporary files and set traps to delete them. The file name is quoted
# inside the trap so it reaches 'cleanup_temp' as a single argument.
#
TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; }
trap 'cleanup_temp "$TMP1"' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
# {{{ -- Functions -- Upload, exists_in, queued_tasks, _DEBUG, _usage
#===  FUNCTION  ================================================================
#         NAME: Upload
#  DESCRIPTION: Upload a file to the Internet Archive with various options.
#               The item id, local path and remote name are passed to 'ia'
#               as single words, so names containing spaces or glob
#               characters are handled correctly.
#   PARAMETERS: 1 - the item id (e.g. 'hpr1234')
#               2 - the path to the file for upload
#               3 - (optional) the path to the file on the IA
#               4 - (optional) list of options for 'ia upload' enclosed as
#                   a string; it is split on whitespace into separate words
#      RETURNS: The exit status of 'ia upload'; 0 (after printing a message)
#               if the local file does not exist
#===============================================================================
Upload () {
    local id=${1}
    local file=${2}
    local remote=${3:-}
    local options=${4:-}
    local -a extra=()
    local -a args=()

    if [[ ! -e $file ]]; then
        echo "File missing: $file"
        return 0
    fi

    #
    # Only the option string is meant to be word-split. 'read -d ""' returns
    # non-zero at end of input even though the array is filled, hence '|| true'
    #
    IFS=$' \t\n' read -r -d '' -a extra <<< "$options" || true

    args=("$id" "$file")
    if [[ -n $remote ]]; then
        args+=("--remote-name=$remote")
    fi

    ia upload "${args[@]}" "${extra[@]}"
}
#===  FUNCTION  ================================================================
#         NAME: exists_in
#  DESCRIPTION: Checks the existence of a key in an array (associative or
#               indexed). A key whose value is the empty string still counts
#               as existing.
#   PARAMETERS: $1      array name
#               $2      key value
#      RETURNS: True if the key exists, False otherwise
#
# Based on the '${array[key]+word}' technique described at
# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
#===============================================================================
exists_in () {
    # Name reference to the caller's array; the long name makes a clash with
    # the caller's own variable names unlikely
    local -n _exists_in_array=$1

    # '+muahaha' expands to the word only when the element is set
    [[ -n ${_exists_in_array[$2]+muahaha} ]]
}
#===  FUNCTION  ================================================================
#         NAME: queued_tasks
#  DESCRIPTION: Asks the IA (via 'ia tasks') for the tasks belonging to an
#               item and uses 'jq' to count those whose category is
#               "catalog". The count is written to STDOUT for capture.
#   PARAMETERS: $1      IA item (like hpr1192)
#      RETURNS: Nothing
#===============================================================================
queued_tasks () {
    local item="${1:?Usage: queued_tasks item}"
    local -i count=0
    # Slurp all task objects and count the ones in the 'catalog' category
    local filter='[.[] | if .category == "catalog" then .status else empty end] | length'

    # The integer attribute turns empty output from the pipeline into 0
    count=$(ia tasks "$item" | jq -s "$filter")

    printf '%s\n' "$count"
}
#===  FUNCTION  ================================================================
#         NAME: _DEBUG
#  DESCRIPTION: Writes each argument on its own line, prefixed with 'D> ',
#               but only in DEBUG mode. Debugging is off when the global
#               DEBUG is 0, empty or unset.
#   PARAMETERS: List of messages
#      RETURNS: Nothing (status is always 0)
#===============================================================================
_DEBUG () {
    if [[ ${DEBUG:-0} == 0 ]]; then
        return 0
    fi

    #
    # 'printf' repeats its format for every argument, so no loop variable is
    # needed (and none leaks into the caller's scope). Guard against an empty
    # argument list, for which printf would still emit one 'D> ' line.
    #
    if (( $# > 0 )); then
        printf 'D> %s\n' "$@"
    fi

    return 0
}
#=== FUNCTION ================================================================
# NAME: _usage
# DESCRIPTION: Reports usage; always exits the script after doing so. The
# text is written to the file named by $STDOUT, which this script
# sets to /dev/fd/2, so the help appears on standard error.
# PARAMETERS: 1 - the integer to pass to the 'exit' command (default 0)
# RETURNS: Nothing (the function never returns; the script exits)
#===============================================================================
_usage () {
# Exit status to use; falls back to 0 when no argument is given
local -i result=${1:-0}
# The here-document delimiter is unquoted, so ${SCRIPT}, ${VERSION} and
# $DEFLIMIT are expanded when the function runs. '<<-' strips leading tab
# characters from the text and from the terminating 'endusage' line.
cat >$STDOUT <<-endusage
${SCRIPT} - version: ${VERSION}
Usage: ./${SCRIPT} [-h] [-v] [-d {0|1}] [-D] [-l N] item
Attempts to repair an IA item where the upload has failed for some reason.
Options:
-h Print this help
-v Run in verbose mode where more information is
reported. Default is off.
-d 0|1 Dry run: -d 1 (the default) runs the script in dry-run
mode where nothing is changed but the actions that
will be taken are reported; -d 0 turns off dry-run
mode and the actions will be carried out.
-D Run in debug mode where a lot more information is
reported
-l N Control the number of shows that can be uploaded at
once. The range is 1 to $DEFLIMIT.
Arguments:
item The item in the form 'hpr1234'
endusage
exit "$result"
}
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#
# Directories and files
#
LOGS="$BASEDIR/logs"
LOGFILE="$LOGS/$SCRIPT.log"

#
# Constants
#
DEFLIMIT=20

#
# Process options. The leading ':' selects getopts' silent mode, in which
# a missing option argument is reported as ':' and an unknown option as '?'.
#
while getopts :d:Dhl:v opt
do
    case "${opt}" in
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        h) _usage 0;;
        l) LIMIT=$OPTARG;;
        v) VERBOSE=1;;
        :) echo "** Option -$OPTARG requires an argument"
            _usage 1;;
        *) echo "** Unknown option"
            _usage 1;;
    esac
done
shift $((OPTIND - 1))

#
# Set option defaults and check their values
#
VERBOSE=${VERBOSE:-0}

#
# Validate as a string. An arithmetic test would evaluate a word such as 'no'
# as an unset variable (0), silently turning dry-run mode OFF.
#
DRYRUN=${DRYRUN:-1}
if [[ ! $DRYRUN =~ ^[01]$ ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $VERBOSE -eq 1 && $DRYRUN -eq 1 ]] && echo "Dry run mode"

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"

#
# The limit must be all digits before it is used arithmetically; '10#' forces
# decimal so that values like '08' are not rejected as invalid octal.
#
LIMIT=${LIMIT:-$DEFLIMIT}
if [[ ! $LIMIT =~ ^[0-9]+$ ]] || (( 10#$LIMIT < 1 || 10#$LIMIT > DEFLIMIT )); then
    echo "** Use '-l 1' up to '-l $DEFLIMIT' or omit the option"
    _usage 1
fi
LIMIT=$((10#$LIMIT))

#
# Should have one argument
#
if [[ $# != 1 ]]; then
    coloured 'red' "Missing argument"
    _usage 1
fi
item="${1}"

#
# Ensure item spec is correctly formatted. The digits are forced to base 10
# with '10#' because printf's %d treats a leading zero as octal: without it
# 'hpr0123' would become 'hpr0083' and 'hpr0089' would be rejected as an
# invalid octal number.
#
if [[ $item =~ hpr([0-9]+) ]]; then
    printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))"
else
    coloured 'red' "Incorrect show specification: $item"
    coloured 'yellow' "Use 'hpr9999' format"
    exit 1
fi

_DEBUG "Parsed item: $item"
#
# Declarations
#
declare -A fcache
declare -A iacache
declare -a missed

#
# Scan the directory 'UPLOADS' where files for upload to the IA are stored and
# collect everything for this item (show).
#
# See the `find' pipeline at the end of the loop which selects only files, not
# directories. It outputs the last change time and the full file path, sorts
# on the time, then removes it. This ensures we process the files in time
# order rather than alphabetic order of their names.
#
# The time stamp is removed with 'cut -f2-' (field 2 to end of line) so that
# paths containing spaces are kept whole.
#
# The item name derived from each path is held in 'found_item' for debugging
# only; the validated 'item' from the command line must not be overwritten
# because the rest of the script uses it to query the IA.
#
# TODO: This algorithm is from another script and is not needed here. The
# order of processing is irrelevant here so simplify the 'find' and the loop.
# We are only looking for the 'item' specified by the argument, not other
# ones.
#
while IFS= read -r path; do
    relpath="${path#"$UPLOADS"/}"
    found_item="${relpath:0:7}"

    [[ $VERBOSE -eq 1 ]] && echo "Found $path"

    _DEBUG "Path: $path"
    _DEBUG "Relative path: $relpath"
    _DEBUG "IA item: $found_item"

    if ! exists_in fcache "$relpath"; then
        # shellcheck disable=SC2034
        fcache[$relpath]=1
    fi
done < <(find "$UPLOADS" -type f -regextype posix-extended \
    -regex ".*$item.*" -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2- -d' ')

#
# Did we find anything?
#
if [[ ${#fcache[@]} -eq 0 ]]; then
    coloured 'red' "No files found for item $item in $UPLOADS"
    coloured 'red' "Can't continue"
    exit 1
fi
#
# Look to see if there are any tasks queued for this show on the IA servers.
# If there are we can't continue.
#
# TODO: This could be a loop waiting for tasks to complete rather than
# aborting and asking to be rerun.
#
tasks=$(queued_tasks "$item")
if [[ $tasks -gt 0 ]]; then
    coloured 'red' \
        "Item $item still has $tasks unfinished $(ngettext task tasks "$tasks")"
    coloured 'red' "Allow time for task(s) to finish and try again later"
    exit 1
fi

#
# Interrogate the IA for the required item contents. If it returns True we can
# collect its contents, otherwise we can't proceed. The file 'TMP1' contains
# just a simple list of the files on the IA relating to this item.
#
# Each name is read verbatim ('IFS=' stops whitespace trimming), a final line
# without a newline is still processed, and empty lines are skipped because an
# empty string is not a valid associative array subscript.
#
if ia list "$item" > "$TMP1"; then
    while IFS= read -r iafile || [[ -n $iafile ]]; do
        [[ -n $iafile ]] || continue
        # shellcheck disable=SC2034
        iacache[$iafile]=1
    done < "$TMP1"
else
    coloured 'red' "Item $item is not in the IA"
    coloured 'red' "Can't continue"
    exit 1
fi
#
# Look through the list of files we found and detect any not on the IA
#
for path in "${!fcache[@]}"; do
    if ! exists_in iacache "$path"; then
        missed+=("$path")
    fi
done

#
# Counters and defaults for the loop
#
retry_threshold=5
sleeptime=20
failures=0
upload_count=0

#
# Options handed to 'ia upload' (through the Upload function) for every file
#
upload_opts='--retries=5 --no-derive -H x-archive-keep-old-version:0'

#
# If there are missed files we can report what we'd be doing or do it,
# otherwise we have nothing to do.
#
if [[ ${#missed[@]} -eq 0 ]]; then
    coloured 'green' "All expected files for item $item are on the IA"
else
    mcount="${#missed[@]}"
    coloured 'red' "There $(ngettext "is 1 missing file" "are $mcount missing files" "$mcount"):"
    [[ $DRYRUN -eq 1 ]] && {
        coloured 'blue' "Dry run: Would have run the following command(s):"
    }

    for file in "${missed[@]}"; do
        if [[ $DRYRUN -eq 1 ]]; then
            coloured 'yellow' \
                "Upload $item $UPLOADS/$file '$file' '$upload_opts'"
        else
            retries=0

            printf 'Uploading %s\n' "$file"

            #
            # Call Upload directly with properly separated arguments rather
            # than building a string for 'eval': file names containing
            # spaces, quotes or '$(...)' are then passed through untouched
            # instead of being re-parsed by the shell.
            #
            # If the upload succeeds then write to the log and loop for the
            # next missing file. If it fails enter the 'until' loop and report
            # the problem. Count the number of times this is done, so it
            # doesn't loop forever. If we have reached the limit count this as
            # a failure and continue the parent loop (with the next missing
            # file). If we haven't retried enough yet, sleep for a while and
            # try again. The intention is to catch the case when an upload
            # times out. The 'ia' command is performing its own retries per
            # upload when the system is overloaded, but these are non-fatal.
            #
            until Upload "$item" "$UPLOADS/$file" "$file" "$upload_opts"; do
                coloured 'red' "Failure when invoking the Upload command!"
                ((retries++))
                printf '%s Failed to upload %s to the IA [%d]\n' \
                    "$(date +%Y%m%d%H%M%S)" "$file" "$retries" >> "$LOGFILE"
                if [[ $retries -eq $retry_threshold ]]; then
                    ((failures++))
                    # Give up on this file; skips the counting below
                    continue 2
                fi
                sleep "$sleeptime"
            done # until Upload ...

            echo "$(date +%Y%m%d%H%M%S) Uploaded $file to the IA" >> "$LOGFILE"
        fi

        #
        # Count actual uploads and dry-run ones the same
        #
        ((upload_count++))

        #
        # Stop the missed file loop if we have reached the limiting number, in
        # dry-run and live mode
        #
        [[ $upload_count -eq $LIMIT ]] && {
            coloured 'blue' "Upload limit ($LIMIT) reached"
            break
        }
    done # for file in ...
fi
#
# Summarise how many upload failures were detected ('failures' counts the
# files that were abandoned after reaching the retry threshold)
#
if [[ $failures -gt 0 ]]; then
    coloured 'red' \
        "There $(ngettext "was $failures upload failure" "were $failures upload failures" "$failures")"
    coloured 'yellow' 'Run this script again to repeat the repair attempt'
fi
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker