hpr-tools/InternetArchive/repair_item

#!/bin/bash -
#===============================================================================
#
#         FILE: repair_item
#
#        USAGE: ./repair_item [-h] [-v] [-d {0|1}] [-D] [-l N] [-X] itemname
#
#  DESCRIPTION: Repairs an IA "item" (HPR show) if something has failed during
#               the upload (and when recovering deleted files from the
#               changeover to the HPR static site).
#
#               The most common failures are caused by the file upload
#               processes timing out and being aborted (by the 'ia' tool which
#               performs the item creation and the uploads). This failure
#               means that a show being processed on 'borg' does not get all
#               of the components loaded to the IA. This happens during the
#               sequence of running the 'make_metadata' Perl script which
#               generates a CSV file of show data, followed by 'ia metadata
#               --spreadsheet=<CSV file>'. Failures in the second part cause
#               it to be aborted.
#
#               This script looks at the files belonging to the show (stored
#               temporarily on 'borg') and determines which have not been
#               uploaded, then takes steps to perform the uploads.
#
#               Version 0.0.12 onwards has the capability to repair an IA item
#               from the HPR backup disk. This seems to be necessary because
#               the transcripts were not carried over (although we are
#               adding them to the IA for new shows now, older ones were never
#               copied), and there has been a case where none of the assets
#               were on the IA. The method used is to place the backup files
#               in the directory 'repairs' under the local IA or
#               InternetArchive directory. The files are held in the hierarchy
#               '$item/$item/'. The assets are in the lower directory and the
#               source file is in the upper one. This emulates the placement
#               on the IA itself.
#
#               This script can be called directly to recover a new show which
#               failed during creation/upload, or by 'recover_transcripts'
#               which is repairing shows with missing assets.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.12
#      CREATED: 2020-01-05 22:42:46
#     REVISION: 2024-09-13 18:19:59
#
#===============================================================================

#set -o nounset                              # Treat unset variables as an error

VERSION="0.0.12"

# Name of this script with any leading directory removed
SCRIPT="${0##*/}"
# DIR=${0%/*}

# Where '_usage' writes its text; despite the name this is file descriptor 2
STDOUT="/dev/fd/2"

#
# Pick the working directories that belong to the host we are running on.
# Any host not listed here is rejected.
#
case "$(hostname)" in
    borg)
        BASEDIR="$HOME/IA"
        UPLOADS="/data/IA/uploads"
        ;;
    i7-desktop)
        # TODO: consider not allowing this to be run anywhere but on 'borg'
        BASEDIR="$HOME/HPR/InternetArchive"
        UPLOADS="$HOME/HPR/IA/uploads"
        ;;
    *)
        echo "Wrong host!"
        exit 1
        ;;
esac

# On every accepted host the 'repairs' directory lives below the base directory
REPAIRS="$BASEDIR/repairs"

if ! cd "$BASEDIR"; then
    echo "Failed to cd to $BASEDIR"
    exit 1
fi

#
# Load library functions. The script calls helpers that are not defined in
# this file (presumably they come from this library), so a missing library is
# fatal. Exit with an explicit non-zero status: a bare 'exit' would return the
# status of the preceding 'echo' (0) and make the failure look like success.
#
LIB="$HOME/bin/function_lib.sh"
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"

#
# Enable coloured messages
#
define_colours

#
# Sanity checks: the external programs used below must be on the PATH
#
JQ=$(command -v jq)
if [[ -z $JQ ]]; then
    echo "Program 'jq' was not found"
    exit 1
fi

IA=$(command -v ia)
if [[ -z $IA ]]; then
    echo "Program 'ia' was not found"
    exit 1
fi

#
# Make a temporary file and set traps so it is removed however the script ends
#
if ! TMP1=$(mktemp); then
    echo "$SCRIPT: creation of temporary file failed!"
    exit 1
fi
trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT


# {{{ -- Functions -- Upload, exists_in, queued_tasks, _usage

#===  FUNCTION  ================================================================
#         NAME: Upload
#  DESCRIPTION: Uploads a file to the Internet Archive with various options.
#               All output from the 'ia' command (STDOUT and STDERR) is
#               discarded; only its exit status is handed back to the caller.
#               Every argument is passed to 'ia' as a single word, so ids,
#               paths and remote names containing whitespace or glob
#               characters are transmitted intact.
#   PARAMETERS: 1 - the item id (e.g. 'hpr1234')
#               2 - the path to the file for upload
#               3 - (optional) the path to the file on the IA; passed to 'ia'
#                   as '--remote-name=...'
#               4 - (optional) list of options for 'ia upload' enclosed as
#                   a string; the string is split on whitespace into
#                   separate arguments (no filename expansion is performed)
#      RETURNS: Exit code of 'ia upload', or 1 if the local file is missing
#===============================================================================
Upload () {
    local id=${1}
    local file=${2}
    local remote=${3:-}
    local options=${4:-}

    local IFS=$' \t\n'
    local -a opts=()
    local -a args=()

    if [[ ! -e $file ]]; then
        echo "File missing: $file"
        return 1
    fi

    #
    # Break the option string into words. With an empty delimiter 'read'
    # consumes the whole string (including any newlines) and reports
    # end-of-file with a non-zero status, which is expected here.
    #
    if [[ -n $options ]]; then
        read -r -d '' -a opts <<< "$options" || true
    fi

    args=("$id" "$file")
    if [[ -n $remote ]]; then
        args+=("--remote-name=$remote")
    fi
    args+=("${opts[@]}")

    ia upload "${args[@]}" > /dev/null 2>&1
}

#===  FUNCTION  ================================================================
#         NAME: exists_in
#  DESCRIPTION: Checks the existence of a key in an associative array. A key
#               whose value is the empty string still counts as existing.
#               The array is reached through a name reference rather than
#               'eval', so the array name is never executed as shell code.
#   PARAMETERS: $1      array name (must be a plain shell identifier)
#               $2      key value
#      RETURNS: True if the key exists, False otherwise (also False if the
#               array name is not a valid identifier)
#
# Modified from
# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
#===============================================================================
exists_in () {
    # Refuse anything that is not a simple variable name
    [[ $1 =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 1

    # The long name makes a clash with a caller's array name unlikely
    local -n __exists_in_ref=$1

    # The '+' expansion yields the marker word only when the element is set
    [[ -n ${__exists_in_ref[$2]+muahaha} ]]
}

#===  FUNCTION  ================================================================
#         NAME: queued_tasks
#  DESCRIPTION: Asks the IA (via 'ia tasks') for the task records of an item
#               and counts those whose category is "catalog" (understood to
#               be the queued or running ones). The count is written to
#               STDOUT so the caller can capture it.
#   PARAMETERS: $1      IA item (like hpr1192)
#      RETURNS: Nothing
#===============================================================================
queued_tasks () {
    local show="${1:?Usage: queued_tasks item}"
    local filter='[.[] | if .category == "catalog" then .status else empty end] | length'
    local -i pending=0

    # The integer attribute turns empty output from the pipeline into 0
    pending="$(ia tasks "$show" | jq -s "$filter")"

    printf '%s\n' "$pending"
}

#===  FUNCTION  ================================================================
#         NAME: _usage
#  DESCRIPTION: Reports usage; always exits the script after doing so. The
#               text is written to the file named by the global STDOUT and
#               interpolates the globals SCRIPT, VERSION and DEFLIMIT, which
#               must therefore be set before this function is called.
#   PARAMETERS: 1 - the integer to pass to the 'exit' command (default 0)
#      RETURNS: Nothing
#===============================================================================
_usage () {
    local -i result=${1:-0}

    cat > "$STDOUT" <<-endusage
${SCRIPT} - version: ${VERSION}

Usage: ./${SCRIPT} [-h] [-v] [-d {0|1}] [-D] [-l N] [-X] item

Attempts to repair an IA item where the upload has failed for some reason.

Options:
  -h                    Print this help
  -v                    Run in verbose mode where more information is
                        reported. Default is off.
  -d 0|1                Dry run: -d 1 (the default) runs the script in dry-run
                        mode where nothing is changed but the actions that
                        will be taken are reported; -d 0 turns off dry-run
                        mode and the actions will be carried out.
  -D                    Run in debug mode where a lot more information is
                        reported
  -l N                  Control the number of files that can be uploaded
                        during one run of the script. The range is 1 to
                        $DEFLIMIT. This can be helpful when there are upload
                        problems.
  -X                    Run in "extended" mode. In this mode the directory
                        holding files to be added to the IA is '~/IA/repairs'
                        and the files have most likely come from the HPR
                        backup disk and aren't on the IA due to some error. We
                        want to use the capabilities of ${SCRIPT} to repair
                        things and deal with the IA upload problems.

Arguments:
    item                The item in the form 'hpr1234'

endusage
    exit "$result"
}

# }}}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#
# Directories and files
#
LOGS="$BASEDIR/logs"
LOGFILE="$LOGS/$SCRIPT.log"

#
# Constants
#
DEFLIMIT=20

#
# Process options. The leading ':' in the option string selects silent error
# reporting: a missing option argument is delivered as ':' and an unknown
# option as '?', each with the offending option letter in OPTARG.
#
while getopts :d:Dhl:vX opt
do
    case "${opt}" in
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        h) _usage 0;;
        l) LIMIT=$OPTARG;;
        v) VERBOSE=1;;
        X) EXTENDED=1;;
        :) echo "** Option -$OPTARG requires an argument"
           _usage 1;;
        *) echo "** Unknown option"
           _usage 1;;
    esac
done
shift $((OPTIND - 1))

#
# Set option defaults and check their values
#
VERBOSE=${VERBOSE:-0}

#
# The dry-run setting is matched as text. An arithmetic test would evaluate
# a word such as 'foo' as 0 (or fail outright on '1x'), letting it through
# the check and then silently switching dry-run mode off.
#
DRYRUN=${DRYRUN:-1}
if [[ ! $DRYRUN =~ ^[01]$ ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $VERBOSE -eq 1 && $DRYRUN -eq 1 ]] && echo "Dry run mode"

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"

#
# The limit must consist of digits only; '10#' forces base 10 so that leading
# zeroes are not taken as octal. The value is normalised once validated.
#
LIMIT=${LIMIT:-$DEFLIMIT}
if [[ ! $LIMIT =~ ^[0-9]+$ ]] || (( 10#$LIMIT < 1 || 10#$LIMIT > DEFLIMIT )); then
    echo "** Use '-l 1' up to '-l $DEFLIMIT' or omit the option"
    _usage 1
fi
LIMIT=$((10#$LIMIT))

EXTENDED=${EXTENDED:-0}

#
# Should have one argument
#
if [[ $# != 1 ]]; then
    coloured 'red' "Missing argument"
    _usage 1
fi
item="${1}"

#
# Ensure item spec is correctly formatted. Have to cater for leading zeroes
# being interpreted as octal. The pattern is anchored at both ends so that
# only 'hpr' followed by digits is accepted; without the anchors any string
# merely containing such a sequence would be taken as a valid item.
#
if [[ $item =~ ^hpr([0-9]+)$ ]]; then
    printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))"
else
    coloured 'red' "Incorrect show specification: $item"
    coloured 'yellow' "Use 'hpr9999' format"
    exit 1
fi
_DEBUG "Parsed item: $item"

#
# The show upload may have failed before anything at all, even the metadata,
# reached the IA. This is rarely seen, but it seems wise to cater for it.
#
ia metadata "$item" --exists > /dev/null 2>&1 || {
    coloured 'red' "This item is not apparently on the IA; can't continue"
    coloured 'yellow' "Try running the entire upload again from the start"
    exit 1
}

#
# The -X (EXTENDED) mode is for when we have to upload files that have
# mysteriously vanished from the IA. The directories here are equivalent to
# those used by 'repair_assets'. There is a top-level directory that
# represents the IA item, and below that a hierarchy defining placement under
# the item. There is a 'repairs' directory per host in case we need to repair
# IA stuff from elsewhere.
#
if [[ $EXTENDED -eq 1 ]]; then
    coloured 'cyan' "Using 'Extended' mode"

    # Create the 'repairs' directory on first use
    [[ -e $REPAIRS ]] || mkdir -p "$REPAIRS"

    # From here on files are collected from the item's own repair directory
    UPLOADS="$REPAIRS/$item"
fi

#
# Declarations
#
declare -A fcache
declare -A iacache
declare -a missed

#
# Scan the directory 'UPLOADS' where files for upload to the IA are stored and
# collect everything for this item (show). Each file is recorded in 'fcache'
# under its path relative to 'UPLOADS'.
#
# See the `find' pipeline at the end of the loop which selects only files, not
# directories. It outputs the last change time and the full file path, sorts
# on the time, then removes it. This ensures we process the files in time
# order rather than alphabetic order of their names. The 'cut' keeps
# everything after the first space ('-f2-') so that paths which themselves
# contain spaces are not truncated.
#
# The validated 'item' is deliberately left untouched inside the loop:
# replacing it with the first seven characters of the relative path would
# corrupt any item name longer than that.
#
# TODO: This algorithm is from another script and is not needed here. The
# order of processing is irrelevant here so simplify the 'find' and the loop.
# We are only looking for the 'item' specified by the argument, not other
# ones.
#
while IFS= read -r path; do
    relpath="${path#"$UPLOADS"/}"

    [[ $VERBOSE -eq 1 ]] && echo "Found $path"

    _DEBUG "Path:          $path"
    _DEBUG "Relative path: $relpath"
    _DEBUG "IA item:       ${relpath:0:7}"

    # Re-assigning an existing key is harmless, so no existence test is needed
    # shellcheck disable=SC2034
    fcache[$relpath]=1
done < <(find "$UPLOADS" -type f -regextype posix-extended \
    -regex ".*$item.*" -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2- -d' ')

#
# Did we find anything?
#
if [[ ${#fcache[@]} -eq 0 ]]; then
    coloured 'red' "No files found for item $item in $UPLOADS"
    coloured 'red' "Can't continue"
    exit 1
fi

#
# Ask the IA servers whether any tasks are still queued for this show. While
# there are, nothing further can be done in this run.
#
# TODO: This could be a loop waiting for tasks to complete rather than
# aborting and asking to be rerun.
#
tasks=$(queued_tasks "$item")
if (( tasks > 0 )); then
    coloured 'red' \
        "Item $item still has $tasks unfinished $(ngettext task tasks "$tasks")"
    coloured 'red' "Allow time for task(s) to finish and try again later"
    exit 1
fi

#
# Interrogate the IA for the required item contents. The listing is written
# to the file 'TMP1', one IA file name per line. If the listing command fails
# we can't proceed.
#
if ! ia list "$item" > "$TMP1"; then
    coloured 'red' "Item $item is not in the IA"
    coloured 'red' "Can't continue"
    exit 1
fi

#
# Remember every file name the IA reported
#
while read -r iafile; do
    # shellcheck disable=SC2034
    iacache[$iafile]=1
done < "$TMP1"

#
# Any local file whose relative path is not among the IA names has been
# missed and needs uploading
#
for path in "${!fcache[@]}"; do
    exists_in iacache "$path" || missed+=("$path")
done

#
# Counters and defaults for the loop
#
retry_threshold=5
sleeptime=20
failures=0
upload_count=0

#
# Options handed to 'ia upload' (through the 'Upload' function) for every file
#
upload_opts='--retries=5 --no-derive -H x-archive-keep-old-version:0'

#
# If there are missed files we can report what we'd be doing or do it,
# otherwise we have nothing to do.
#
if [[ ${#missed[@]} -eq 0 ]]; then
    coloured 'green' "All expected files for item $item are on the IA"
else
    mcount="${#missed[@]}"
    coloured 'red' "There $(ngettext "is 1 missing file" "are $mcount missing files" "$mcount"):"

    [[ $DRYRUN -eq 1 ]] && {
        coloured 'blue' "Dry run: Would have run the following command(s):"
    }

    for file in "${missed[@]}"; do
        #
        # The command is held as an array of separate arguments and is run
        # directly rather than through 'eval', so file names containing
        # spaces, quotes or other shell metacharacters are passed on
        # unchanged and are never interpreted as shell code.
        #
        upload_cmd=(Upload "$item" "$UPLOADS/$file" "$file" "$upload_opts")

        if [[ $DRYRUN -eq 1 ]]; then
            coloured 'yellow' "Upload $item $UPLOADS/$file '$file' '$upload_opts'"
        else
            retries=0

            coloured 'blue' "Uploading $file"

            #
            # Run the upload. If it succeeds then write to the log and loop
            # for the next missing file. If it fails enter the 'until' loop
            # and report the problem. Count the number of times this is done,
            # so it doesn't loop forever. If we have reached the limit count
            # this as a failure and continue the parent loop (with the next
            # missing file). If we haven't retried enough yet, sleep for
            # a while and try again. The intention is to catch the case when
            # an upload times out. The 'ia' command is performing its own
            # retries per upload when the system is overloaded, but these are
            # non-fatal.
            #
            until "${upload_cmd[@]}"; do
                coloured 'red' "Failure when uploading $file"
                ((retries++))

                printf '%s Failed to upload %s to the IA [%d]\n' \
                    "$(date +%Y%m%d%H%M%S)" "$file" "$retries" >> "$LOGFILE"

                if [[ $retries -eq $retry_threshold ]]; then
                    ((failures++))
                    [[ $VERBOSE -eq 1 ]] && \
                        coloured 'blue' "Retry limit reached; abandoning this file"
                    # Abandon this file and move to the next missing one
                    continue 2
                fi

                [[ $VERBOSE -eq 1 ]] && coloured 'blue' "Pausing for $sleeptime and retrying"
                sleep "$sleeptime"
            done # until upload succeeds

            coloured 'green' "Uploaded $file to the IA"
            echo "$(date +%Y%m%d%H%M%S) Uploaded $file to the IA" >> "$LOGFILE"
        fi

        #
        # Count actual uploads and dry-run ones the same
        #
        ((upload_count++))

        #
        # Stop the missed file loop if we have reached the limiting number, in
        # dry-run and live mode, but not extended mode
        #
        [[ $EXTENDED -eq 0 && $upload_count -eq $LIMIT ]] && {
            coloured 'blue' "Upload limit ($LIMIT) reached"
            break
        }

    done # for file in ...

fi

#
# Report the number of files that could not be uploaded, if there were any
#
if (( failures > 0 )); then
    coloured 'red' \
        "There $(ngettext "was $failures upload failure" "were $failures upload failures" "$failures")"
    coloured 'yellow' 'Run this script again to repeat the repair attempt'
fi

# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker