Moved project directories and files to an empty local repo

2024-06-04 16:35:44 +01:00
parent 2d2b937a9b
commit 38abbcdd39
271 changed files with 55348 additions and 0 deletions
--- a/InternetArchive/repair_item
+++ b/InternetArchive/repair_item
@@ -0,0 +1,453 @@
+#!/bin/bash -
+#===============================================================================
+#
+#         FILE: repair_item
+#
+#        USAGE: ./repair_item [-h] [-v] [-d {0|1}] [-D] [-l N] itemname
+#
+#  DESCRIPTION: Repairs an IA "item" (HPR show) if something has failed during
+#               the upload.
+#
+#               The most common failures are caused by the file upload
+#               processes timing out and being aborted (by the 'ia' tool which
+#               performs the item creation and the uploads). This failure
+#               means that a show being processed on 'borg' does not get all
+#               of the components loaded to the IA.
+#
+#               This script looks at the files belonging to the show (stored
+#               temporarily on 'borg') and determines which have not been
+#               uploaded, then takes steps to perform the uploads.
+#
+#      OPTIONS: ---
+# REQUIREMENTS: ---
+#         BUGS: ---
+#        NOTES: ---
+#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
+#      VERSION: 0.0.6
+#      CREATED: 2020-01-05 22:42:46
+#     REVISION: 2024-05-10 12:39:52
+#
+#===============================================================================
+
+#set -o nounset                              # Treat unset variables as an error
+
+VERSION="0.0.6"
+
+SCRIPT=${0##*/}
+# DIR=${0%/*}
+
+STDOUT="/dev/fd/2"
+
+#
+# Select the appropriate working directory for the host
+#
+case $(hostname) in
+    i7-desktop)
+        BASEDIR="$HOME/HPR/InternetArchive"
+        UPLOADS="$HOME/HPR/IA/uploads"
+        ;;
+    borg)
+        BASEDIR="$HOME/IA"
+        UPLOADS="/data/IA/uploads"
+        ;;
+    *)
+        echo "Wrong host!"
+        exit 1
+        ;;
+esac
+
+cd "$BASEDIR" || { echo "Failed to cd to $BASEDIR"; exit 1; }
+
+#
+# Load library functions
+#
+LIB="$HOME/bin/function_lib.sh"
+[ -e "$LIB" ] || { echo "Unable to source functions"; exit; }
+# shellcheck disable=SC1090
+source "$LIB"
+
+#
+# Enable coloured messages
+#
+define_colours
+
+#
+# Sanity checks
+#
+JQ=$(command -v jq)
+[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }
+IA=$(command -v ia)
+[ -n "$IA" ] || { echo "Program 'ia' was not found"; exit 1; }
+
+#
+# Make temporary files and set traps to delete them
+#
+TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; }
+trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
+
+
+# {{{ -- Functions -- Upload, exists_in, queued_tasks, _DEBUG, _usage
+
+#===  FUNCTION  ================================================================
+#         NAME: Upload
+#  DESCRIPTION: Upload a file to the Internet Archive with various options
+#   PARAMETERS: 1 - the item id (e.g. 'hpr1234')
+#               2 - the path to the file for upload
+#               3 - (optional) the path to the file on the IA
+#               4 - (optional) list of options for 'ia upload' enclosed as
+#                   a string; it is split into words before being passed on
+#      RETURNS: The exit status of 'ia upload', or 1 if the file to upload
+#               does not exist
+#===============================================================================
+Upload () {
+    local id=${1}
+    local file=${2}
+    local remote=${3:-}
+    local options=${4:-}
+
+    if [[ ! -e $file ]]; then
+        echo "File missing: $file"
+        # Report failure so that a caller does not mistake a missing file
+        # for a completed upload
+        return 1
+    fi
+
+    #
+    # The id, file and remote name are quoted so that any whitespace in them
+    # survives. 'options' is deliberately left unquoted so that the string
+    # splits into separate arguments for 'ia upload'.
+    #
+    if [[ -z $remote ]]; then
+        # shellcheck disable=SC2086
+        ia upload "${id}" "${file}" ${options}
+    else
+        # shellcheck disable=SC2086
+        ia upload "${id}" "${file}" --remote-name="${remote}" ${options}
+    fi
+}
+
+#===  FUNCTION  ================================================================
+#         NAME: exists_in
+#  DESCRIPTION: Reports whether an associative array holds a given key. The
+#               array is passed by name, so a test expression naming it is
+#               assembled and then evaluated. The key is only expanded when
+#               the expression is evaluated, never pasted into it.
+#   PARAMETERS: $1      array name
+#               $2      key value
+#      RETURNS: True if the key exists, False otherwise
+#
+# Modified from
+# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
+#===============================================================================
+exists_in () {
+    local __exists_in_test
+
+    # '${name[key]+word}' expands to 'word' only when the element is set
+    printf -v __exists_in_test '[[ -n ${%s[$2]+present} ]]' "$1"
+    eval "$__exists_in_test"
+}
+
+#===  FUNCTION  ================================================================
+#         NAME: queued_tasks
+#  DESCRIPTION: Asks the IA (via 'ia tasks') about an item and counts the
+#               entries whose category is "catalog". The count is written to
+#               STDOUT so the caller can capture it.
+#   PARAMETERS: $1      IA item (like hpr1192)
+#      RETURNS: Nothing
+#===============================================================================
+queued_tasks () {
+    local item="${1:?Usage: queued_tasks item}"
+    local -i count=0
+    local jq_filter
+
+    # Slurp all task records into one array and count the "catalog" ones
+    jq_filter='[.[] | if .category == "catalog" then .status else empty end] | length'
+
+    count="$(ia tasks "$item" | jq -s "$jq_filter")"
+
+    printf '%s\n' "$count"
+}
+
+#===  FUNCTION  ================================================================
+#         NAME: _DEBUG
+#  DESCRIPTION: Writes a message if in DEBUG mode
+#   PARAMETERS: List of messages
+#      RETURNS: Nothing
+#===============================================================================
+_DEBUG () {
+    [ "$DEBUG" == 0 ] && return
+    for msg in "$@"; do
+        printf 'D> %s\n' "$msg"
+    done
+}
+
+#===  FUNCTION  ================================================================
+#         NAME: _usage
+#  DESCRIPTION: Reports usage; always exits the script after doing so
+#   PARAMETERS: 1 - the integer to pass to the 'exit' command
+#      RETURNS: Nothing
+#===============================================================================
+_usage () {
+    local -i result=${1:-0}
+
+    cat >$STDOUT <<-endusage
+${SCRIPT} - version: ${VERSION}
+
+Usage: ./${SCRIPT} [-h] [-v] [-d {0|1}] [-D] [-l N] item
+
+Attempts to repair an IA item where the upload has failed for some reason.
+
+Options:
+  -h                    Print this help
+  -v                    Run in verbose mode where more information is
+                        reported. Default is off.
+  -d 0|1                Dry run: -d 1 (the default) runs the script in dry-run
+                        mode where nothing is changed but the actions that
+                        will be taken are reported; -d 0 turns off dry-run
+                        mode and the actions will be carried out.
+  -D                    Run in debug mode where a lot more information is
+                        reported
+  -l N                  Control the number of shows that can be uploaded at
+                        once. The range is 1 to $DEFLIMIT.
+
+Arguments:
+    item                The item in the form 'hpr1234'
+
+endusage
+    exit "$result"
+}
+
+# }}}
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#
+# Directories and files
+#
+LOGS="$BASEDIR/logs"
+LOGFILE="$LOGS/$SCRIPT.log"
+
+#
+# Constants
+#
+DEFLIMIT=20
+
+#
+# Process options. The leading ':' in the option string puts 'getopts' into
+# silent mode, where a missing option argument is reported as ':' and an
+# unknown option as '?'.
+#
+while getopts :d:Dhl:v opt
+do
+    case "${opt}" in
+        D) DEBUG=1;;
+        d) DRYRUN=$OPTARG;;
+        h) _usage 0;;
+        l) LIMIT=$OPTARG;;
+        v) VERBOSE=1;;
+        :) echo "** Option -$OPTARG requires an argument"
+           _usage 1;;
+        *) echo "** Unknown option"
+           _usage 1;;
+    esac
+done
+shift $((OPTIND - 1))
+
+#
+# Set option defaults and check their values
+#
+VERBOSE=${VERBOSE:-0}
+
+#
+# The dry-run setting is checked as text. A numeric test would evaluate
+# a word such as 'abc' as an (unset) variable with the value 0, which would
+# silently turn dry-run mode off.
+#
+DRYRUN=${DRYRUN:-1}
+if [[ ! $DRYRUN =~ ^[01]$ ]]; then
+    echo "** Use '-d 0' or '-d 1'"
+    _usage 1
+fi
+[[ $VERBOSE -eq 1 && $DRYRUN -eq 1 ]] && echo "Dry run mode"
+
+DEBUG=${DEBUG:-0}
+[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"
+
+#
+# The limit must be made of digits only. It is read as base 10 so that
+# a leading zero (e.g. '08') is not taken to be octal.
+#
+LIMIT=${LIMIT:-$DEFLIMIT}
+if [[ ! $LIMIT =~ ^[0-9]+$ ]] || (( 10#$LIMIT < 1 || 10#$LIMIT > DEFLIMIT )); then
+    echo "** Use '-l 1' up to '-l $DEFLIMIT' or omit the option"
+    _usage 1
+fi
+LIMIT=$((10#$LIMIT))
+
+#
+# Should have one argument
+#
+if [[ $# != 1 ]]; then
+    coloured 'red' "Missing argument"
+    _usage 1
+fi
+item="${1}"
+
+#
+# Ensure item spec is correctly formatted. The pattern is anchored so that
+# only 'hpr' followed by digits is accepted. The number is forced to base 10
+# before being zero-padded, otherwise a leading zero makes 'printf' read it
+# as octal ('hpr0123' would become 'hpr0083' and 'hpr0089' is an error).
+#
+if [[ $item =~ ^hpr([0-9]+)$ ]]; then
+    printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))"
+else
+    coloured 'red' "Incorrect show specification: $item"
+    coloured 'yellow' "Use 'hpr9999' format"
+    exit 1
+fi
+_DEBUG "Parsed item: $item"
+
+#
+# Declarations
+#
+declare -A fcache
+declare -A iacache
+declare -a missed
+
+#
+# Scan the directory 'UPLOADS' where files for upload to the IA are stored and
+# collect everything for this item (show). Each file found is recorded in
+# 'fcache', keyed by its path relative to 'UPLOADS'.
+#
+# See the `find' pipeline at the end of the loop which selects only files, not
+# directories. It outputs the last change time and the full file path, sorts
+# on the time, then removes it. This ensures we process the files in time
+# order rather than alphabetic order of their names.
+#
+# The item implied by each path goes into its own variable, 'found_item', and
+# is only used for debug output. The global 'item' (the show requested on the
+# command line) must not be changed here because the rest of the script uses
+# it to query the IA.
+#
+# 'cut -f2-' keeps everything after the timestamp so a path which contains
+# spaces is not truncated, and 'IFS=' stops 'read' trimming the path.
+#
+# TODO: This algorithm is from another script and is not needed here. The
+# order of processing is irrelevant here so simplify the 'find' and the loop.
+# We are only looking for the 'item' specified by the argument, not other
+# ones.
+#
+while IFS= read -r path; do
+    relpath="${path#"$UPLOADS"/}"
+    found_item="${relpath:0:7}"
+
+    [[ $VERBOSE -eq 1 ]] && echo "Found $path"
+
+    _DEBUG "Path:          $path"
+    _DEBUG "Relative path: $relpath"
+    _DEBUG "IA item:       $found_item"
+
+    # shellcheck disable=SC2034
+    fcache["$relpath"]=1
+done < <(find "$UPLOADS" -type f -regextype posix-extended \
+    -regex ".*$item.*" -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort  | cut -f2- -d' ')
+
+#
+# Did we find anything?
+#
+if [[ ${#fcache[@]} -eq 0 ]]; then
+    coloured 'red' "No files found for item $item in $UPLOADS"
+    coloured 'red' "Can't continue"
+    exit 1
+fi
+
+#
+# Look to see if there are any tasks queued for this show on the IA servers.
+# If there are we can't continue.
+#
+# TODO: This could be a loop waiting for tasks to complete rather than
+# aborting and asking to be rerun.
+#
+tasks=$(queued_tasks "$item")
+if [[ $tasks -gt 0 ]]; then
+    coloured 'red' \
+        "Item $item still has $tasks unfinished $(ngettext task tasks "$tasks")"
+    coloured 'red' "Allow time for task(s) to finish and try again later"
+    exit 1
+fi
+
+#
+# Interrogate the IA for the required item contents. If it returns True we can
+# collect its contents, otherwise we can't proceed. The file 'TMP1' contains
+# just a simple list of the files on the IA relating to this item.
+#
+if ia list "$item" > "$TMP1"; then
+    while read -r iafile; do
+        # shellcheck disable=SC2034
+        iacache[$iafile]=1
+    done < "$TMP1"
+else
+    coloured 'red' "Item $item is not in the IA"
+    coloured 'red' "Can't continue"
+    exit 1
+fi
+
+#
+# Any local file whose relative path is not a key in 'iacache' has not
+# reached the IA, so add it to the 'missed' list
+#
+for path in "${!fcache[@]}"; do
+    exists_in iacache "$path" || missed+=("$path")
+done
+
+#
+# Starting values for the upload loop: the running totals first, then the
+# retry settings
+#
+failures=0
+upload_count=0
+retry_threshold=5
+sleeptime=20
+
+#
+# If there are missed files we can report what we'd be doing or do it,
+# otherwise we have nothing to do.
+#
+if [[ ${#missed[@]} -eq 0 ]]; then
+    coloured 'green' "All expected files for item $item are on the IA"
+else
+    mcount="${#missed[@]}"
+    coloured 'red' "There $(ngettext "is 1 missing file" "are $mcount missing files" "$mcount"):"
+
+    [[ $DRYRUN -eq 1 ]] && {
+        coloured 'blue' "Dry run: Would have run the following command(s):"
+    }
+
+    #
+    # Options handed to 'ia upload' through the fourth parameter of 'Upload'
+    #
+    upload_opts='--retries=5 --no-derive -H x-archive-keep-old-version:0'
+
+    for file in "${missed[@]}"; do
+        if [[ $DRYRUN -eq 1 ]]; then
+            #
+            # Only show what would be run
+            #
+            coloured 'yellow' \
+                "Upload $item $UPLOADS/$file '$file' '$upload_opts'"
+        else
+            retries=0
+
+            printf 'Uploading %s\n' "$file"
+
+            #
+            # Call 'Upload' directly with separate arguments rather than
+            # building a command string for 'eval', so that the file name is
+            # never re-parsed by the shell. If it succeeds then write to the
+            # log and loop for the next missing file. If it fails enter the
+            # 'until' loop and report the problem. Count the number of times
+            # this is done, so it doesn't loop forever. If we have reached
+            # the limit count this as a failure and continue the parent loop
+            # (with the next missing file). If we haven't retried enough yet,
+            # sleep for a while and try again. The intention is to catch the
+            # case when an upload times out. The 'ia' command is performing
+            # its own retries per upload when the system is overloaded, but
+            # these are non-fatal.
+            #
+            until Upload "$item" "$UPLOADS/$file" "$file" "$upload_opts"; do
+                coloured 'red' "Failure when invoking the Upload command!"
+                ((retries++))
+
+                printf '%s Failed to upload %s to the IA [%d]\n' \
+                    "$(date +%Y%m%d%H%M%S)" "$file" "$retries" >> "$LOGFILE"
+
+                [ "$retries" -eq "$retry_threshold" ] && {
+                    ((failures++))
+                    continue 2
+                }
+
+                sleep "$sleeptime"
+            done # until Upload ...
+
+            echo "$(date +%Y%m%d%H%M%S) Uploaded $file to the IA" >> "$LOGFILE"
+        fi
+
+        #
+        # Count actual uploads and dry-run ones the same
+        #
+        ((upload_count++))
+
+        #
+        # Stop the missed file loop if we have reached the limiting number, in
+        # dry-run and live mode
+        #
+        [[ $upload_count -eq $LIMIT ]] && {
+            coloured 'blue' "Upload limit ($LIMIT) reached"
+            break
+        }
+
+    done # for file in ...
+
+fi
+
+#
+# Summarise how many upload failures were detected. 'failures' is only
+# incremented when a file has used up all of its retries.
+#
+if [[ $failures -gt 0 ]]; then
+    coloured 'red' \
+        "There $(ngettext "was $failures upload failure" "were $failures upload failures" "$failures")"
+    coloured 'yellow' 'Run this script again to repeat the repair attempt'
+fi
+
+# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker