forked from HPR/hpr-tools
		
	InternetArchive/future_upload: Added logging and debugging
InternetArchive/ia_db.sql: Added new tables
InternetArchive/recover_transcripts: New script to run on 'borg' and
    copy missing files from the backup disk to the IA
InternetArchive/repair_assets: More comments, including one about a bug in the design.
InternetArchive/repair_item: Fix relating to octal numbers (if there are
    leading zeroes in a number). '_DEBUG' is now in the function
    library. Added comments to explain obscure stuff.
InternetArchive/snapshot_metadata: New Bash script (to run on my
    desktop) which collects metadata for a show and stores it in the
    '~/HPR/IA/assets' directory. Runs 'view_derivatives' on it to find
    derivative files for deletion.
InternetArchive/tidy_uploaded: Moves files and directories containing
    uploaded files into a holding area for later backup. Added
    debugging, logging and a 'force' mode.
InternetArchive/upload_manager: Manages 'ia.db' (on my workstation).
    Needs many updates which have just started to be added.
InternetArchive/weekly_upload: Old script, now obsolete.
		
	
		
			
				
	
	
		
			516 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			516 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
| #!/bin/bash -
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: repair_item
 | |
| #
 | |
| #        USAGE: ./repair_item [-h] [-v] [-d {0|1}] [-D] [-l N] [-X] itemname
 | |
| #
 | |
| #  DESCRIPTION: Repairs an IA "item" (HPR show) if something has failed during
 | |
| #               the upload (and when recovering deleted files from the
 | |
| #               changeover to the HPR static site).
 | |
| #
 | |
| #               The most common failures are caused by the file upload
 | |
| #               processes timing out and being aborted (by the 'ia' tool which
 | |
| #               performs the item creation and the uploads). This failure
 | |
| #               means that a show being processed on 'borg' does not get all
 | |
| #               of the components loaded to the IA. This happens during the
 | |
| #               sequence of running the 'make_metadata' Perl script which
 | |
| #               generates a CSV file of show data, followed by 'ia metadata
 | |
| #               --spreadsheet=<CSV file>'. Failures in the second part cause
 | |
| #               it to be aborted.
 | |
| #
 | |
| #               This script looks at the files belonging to the show (stored
 | |
| #               temporarily on 'borg') and determines which have not been
 | |
| #               uploaded, then takes steps to perform the uploads.
 | |
| #
 | |
| #               Version 0.0.11 onwards has the capability to repair an IA item
 | |
| #               from the HPR backup disk. This seems to be necessary because
 | |
| #               the transcripts were not carried over (although we are
 | |
| #               adding them to the IA for new shows now, older ones were never
 | |
| #               copied), and there has been a case where none of the assets
 | |
| #               were on the IA. The method used is to place the backup files
 | |
| #               in the directory 'repairs' under the local IA or
 | |
| #               InternetArchive directory. The files are held in the hierarchy
 | |
| #               '$item/$item/'. The assets are in the lower directory and the
 | |
| #               source file is in the upper one. This emulates the placement
 | |
| #               on the IA itself.
 | |
| #
 | |
| #               This script can be called directly to recover a new show which
 | |
| #               failed during creation/upload, or by 'recover_transcripts'
 | |
| #               which is repairing shows with missing assets.
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.0.11
 | |
| #      CREATED: 2020-01-05 22:42:46
 | |
| #     REVISION: 2024-07-20 17:06:10
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| #set -o nounset                              # Treat unset variables as an error
 | |
| 
 | |
# Version of this script, shown by the usage message
VERSION="0.0.11"

# Name of this script with any leading directory stripped
SCRIPT=${0##*/}
# DIR=${0%/*}

# Note: despite the name this is the path of file descriptor 2 (standard
# error); the usage message is written to it
STDOUT="/dev/fd/2"

#
# Pick the working and upload directories to suit the host we are running
# on, and refuse to run anywhere else. The 'repairs' directory always lives
# directly below the working directory.
#
this_host=$(hostname)
if [[ $this_host == 'i7-desktop' ]]; then
    # TODO: consider not allowing this to be run anywhere but on 'borg'
    BASEDIR="$HOME/HPR/InternetArchive"
    UPLOADS="$HOME/HPR/IA/uploads"
elif [[ $this_host == 'borg' ]]; then
    BASEDIR="$HOME/IA"
    UPLOADS="/data/IA/uploads"
else
    echo "Wrong host!"
    exit 1
fi
REPAIRS="$BASEDIR/repairs"

#
# Everything else is done relative to the working directory
#
if ! cd "$BASEDIR"; then
    echo "Failed to cd to $BASEDIR"
    exit 1
fi
 | |
| 
 | |
#
# Load library functions. Without the library none of the helper functions
# used below ('define_colours', 'coloured', '_DEBUG', 'cleanup_temp') exist,
# so this is a fatal error and must be reported with a non-zero exit status
# (a bare 'exit' would return the status of the preceding 'echo', i.e. 0).
#
LIB="$HOME/bin/function_lib.sh"
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"

#
# Enable coloured messages (function provided by the library sourced above)
#
define_colours
 | |
| 
 | |
#
# Sanity checks: both external tools we depend on must be on the PATH
#
JQ=$(command -v jq)
if [[ -z $JQ ]]; then
    echo "Program 'jq' was not found"
    exit 1
fi

IA=$(command -v ia)
if [[ -z $IA ]]; then
    echo "Program 'ia' was not found"
    exit 1
fi

#
# Create the temporary file and arrange for it to be removed (by the library
# function 'cleanup_temp') when the script exits or receives a signal
#
if ! TMP1=$(mktemp); then
    echo "$SCRIPT: creation of temporary file failed!"
    exit 1
fi
trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
 | |
| 
 | |
| 
 | |
| # {{{ -- Functions -- Upload, exists_in, queued_tasks, _usage
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: Upload
#  DESCRIPTION: Uploads a file to the Internet Archive using 'ia upload'.
#               All output from the 'ia' command (stdout and stderr) is
#               discarded; only its exit status is passed back.
#   PARAMETERS: 1 - the item id (e.g. 'hpr1234')
#               2 - the path to the file for upload
#               3 - (optional) the path to the file on the IA
#               4 - (optional) list of options for 'ia upload' enclosed as
#                   a single whitespace-separated string
#      RETURNS: Exit status of 'ia upload', or 1 (after writing a "File
#               missing" message to STDOUT) if the local file doesn't exist
#===============================================================================
Upload () {
    local id=${1}
    local file=${2}
    local remote=${3:-}
    local options=${4:-}

    local -a opts=()
    local -a args=()

    if [[ ! -e $file ]]; then
        echo "File missing: $file"
        return 1
    fi

    #
    # The options arrive as one string and must become separate words. Using
    # 'read -a' splits on whitespace without subjecting the words to filename
    # expansion.
    #
    read -r -a opts <<< "$options"

    #
    # Build the argument list as an array so that file names and remote names
    # containing spaces or glob characters are passed through intact.
    #
    args=("$id" "$file")
    if [[ -n $remote ]]; then
        args+=("--remote-name=$remote")
    fi
    args+=("${opts[@]}")

    # The exit status of 'ia upload' becomes the function's return value
    ia upload "${args[@]}" > /dev/null 2>&1
}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: exists_in
#  DESCRIPTION: Checks the existence of a key in an array (associative or
#               indexed). A key whose value is the empty string still counts
#               as existing.
#   PARAMETERS: $1      array name
#               $2      key value
#      RETURNS: True if the key exists, False otherwise
#
# Modified from
# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
#===============================================================================
exists_in () {
    #
    # A nameref (Bash 4.3 onwards) refers to the caller's array without
    # needing 'eval'. The long local name is to avoid clashing with the name
    # of the array being examined.
    #
    local -n __exists_in_ref="$1"

    # The '+' expansion yields the marker word only when the element is set
    [[ -n ${__exists_in_ref[$2]+muahaha} ]]
}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: queued_tasks
#  DESCRIPTION: Queries the IA for tasks belonging to an item using
#               'ia tasks' and counts those in the "catalog" category.
#               Writes the number to STDOUT so it can be captured. If the
#               query produces no usable number then 0 is written.
#   PARAMETERS: $1      IA item (like hpr1192)
#      RETURNS: 0 if the 'ia tasks | jq' pipeline succeeded, otherwise the
#               failing status of that pipeline. The count is written in
#               both cases so callers that only capture the output behave as
#               before.
#===============================================================================
queued_tasks () {
    local item="${1:?Usage: queued_tasks item}"
    local -i count=0
    local -i rc=0
    local reply

    #
    # 'pipefail' is enabled only inside the command substitution's subshell
    # so that a failure of 'ia' isn't hidden by a successful 'jq'
    #
    reply="$(
        set -o pipefail
        ia tasks "$item" |
            jq -s '[.[] | if .category == "catalog" then .status else empty end] | length'
    )" || rc=$?

    # Only accept a plain decimal number; force base 10 for safety
    if [[ $reply =~ ^[0-9]+$ ]]; then
        count=$((10#$reply))
    fi

    echo "$count"

    return "$rc"
}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: _usage
#  DESCRIPTION: Writes the help text to the file named in the global variable
#               'STDOUT', then terminates the script. The text interpolates
#               the globals 'SCRIPT', 'VERSION' and 'DEFLIMIT'.
#   PARAMETERS: 1 - the integer to pass to the 'exit' command (default 0)
#      RETURNS: Nothing (never returns to the caller)
#===============================================================================
_usage () {
    local -i exit_code=${1:-0}

    {
        cat <<endusage
${SCRIPT} - version: ${VERSION}

Usage: ./${SCRIPT} [-h] [-v] [-d {0|1}] [-D] [-l N] [-X] item

Attempts to repair an IA item where the upload has failed for some reason.

Options:
  -h                    Print this help
  -v                    Run in verbose mode where more information is
                        reported. Default is off.
  -d 0|1                Dry run: -d 1 (the default) runs the script in dry-run
                        mode where nothing is changed but the actions that
                        will be taken are reported; -d 0 turns off dry-run
                        mode and the actions will be carried out.
  -D                    Run in debug mode where a lot more information is
                        reported
  -l N                  Control the number of files that can be uploaded
                        during one run of the script. The range is 1 to
                        $DEFLIMIT. This can be helpful when there are upload
                        problems.
  -X                    Run in "extended" mode. In this mode the directory
                        holding files to be added to the IA is '~/IA/repairs'
                        and the files have most likely come from the HPR
                        backup disk and aren't on the IA due some error. We
                        want to use the capabilities of ${SCRIPT} to repair
                        things and deal with the IA upload problems.

Arguments:
    item                The item in the form 'hpr1234'

endusage
    } >"$STDOUT"

    exit "$exit_code"
}
 | |
| 
 | |
| # }}}
 | |
| 
 | |
| #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 | |
| 
 | |
#
# Directories and files
#
LOGS="$BASEDIR/logs"
LOGFILE="$LOGS/$SCRIPT.log"

#
# Constants
#
DEFLIMIT=20

#
# Process options. The leading ':' in the option string selects silent error
# reporting, so a missing option argument arrives as ':' and an unrecognised
# option as '?'.
#
while getopts :d:Dhl:vX opt
do
    case "${opt}" in
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        h) _usage 0;;
        l) LIMIT=$OPTARG;;
        v) VERBOSE=1;;
        X) EXTENDED=1;;
        :) echo "** Option -$OPTARG requires an argument"
           _usage 1;;
        *) echo "** Unknown option"
           _usage 1;;
    esac
done
shift $((OPTIND - 1))

#
# Set option defaults and check their values
#
VERBOSE=${VERBOSE:-0}

#
# Compare as strings: a numeric test would evaluate something like '-d foo'
# as zero and silently switch dry-run mode off.
#
DRYRUN=${DRYRUN:-1}
if [[ $DRYRUN != 0 && $DRYRUN != 1 ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $VERBOSE -eq 1 && $DRYRUN -eq 1 ]] && echo "Dry run mode"

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && coloured 'yellow' "Debug mode"

#
# The limit must be a plain decimal number in the permitted range. Base 10 is
# forced so that leading zeroes aren't taken to mean octal.
#
LIMIT=${LIMIT:-$DEFLIMIT}
if [[ ! $LIMIT =~ ^[0-9]+$ ]] || ((10#$LIMIT < 1 || 10#$LIMIT > DEFLIMIT)); then
    echo "** Use '-l 1' up to '-l $DEFLIMIT' or omit the option"
    _usage 1
fi
LIMIT=$((10#$LIMIT))

EXTENDED=${EXTENDED:-0}
 | |
| 
 | |
#
# Should have one argument
#
if [[ $# != 1 ]]; then
    coloured 'red' "Missing argument"
    _usage 1
fi
item="${1}"

#
# Ensure item spec is correctly formatted. Have to cater for leading zeroes
# being interpreted as octal, hence the explicit base 10. The expression is
# anchored at both ends so that only 'hpr' followed by digits is accepted;
# anything with extra leading or trailing text is rejected rather than being
# silently rewritten.
#
if [[ $item =~ ^hpr([0-9]+)$ ]]; then
    printf -v item 'hpr%04d' "$((10#${BASH_REMATCH[1]}))"
else
    coloured 'red' "Incorrect show specification: $item"
    coloured 'yellow' "Use 'hpr9999' format"
    exit 1
fi
_DEBUG "Parsed item: $item"

#
# It's possible that the show upload failed before anything was uploaded, even
# the metadata. It's rarely seen, but it seems wise to cater for it.
#
if ! ia metadata "$item" --exists > /dev/null 2>&1; then
    coloured 'red' "This item is not apparently on the IA; can't continue"
    coloured 'yellow' "Try running the entire upload again from the start"
    exit 1
fi
 | |
| 
 | |
#
# The -X (EXTENDED) mode is for when we have to upload files that have
# mysteriously vanished from the IA. The directories here are equivalent to
# those used by 'repair_assets'. There is a top-level directory that
# represents the IA item, and below that a hierarchy defining placement under
# the item. There is a 'repairs' directory per host in case we need to repair
# IA stuff from elsewhere.
#
# In this mode the files to upload are taken from the item's directory under
# 'repairs' instead of the usual uploads directory.
#
if ((EXTENDED == 1)); then
    coloured 'cyan' "Using 'Extended' mode"

    # Make sure the 'repairs' directory exists
    [[ -e $REPAIRS ]] || mkdir -p "$REPAIRS"

    UPLOADS="$REPAIRS/$item"
fi
 | |
| 
 | |
#
# Declarations
#   fcache  - keys are the paths (relative to 'UPLOADS') of local files
#   iacache - keys are the names of the files already on the IA
#   missed  - local files which were not found on the IA
#
declare -A fcache
declare -A iacache
declare -a missed

#
# Scan the directory 'UPLOADS' where files for upload to the IA are stored and
# collect everything for this item (show).
#
# See the `find' pipeline at the end of the loop which selects only files, not
# directories. It outputs the last change time and the full file path, sorts
# on the time, then removes it. This ensures we process the files in time
# order rather than alphabetic order of their names. Everything after the
# first space is kept ('cut -f2-') so that paths containing spaces are not
# truncated.
#
# The global 'item' (validated from the command line argument) is left alone
# here. The first seven characters of each relative path are shown in debug
# mode only; they are not a reliable item name in all cases (for example if
# the show number ever needs more than four digits).
#
# TODO: This algorithm is from another script and is not needed here. The
# order of processing is irrelevant here so simplify the 'find' and the loop.
# We are only looking for the 'item' specified by the argument, not other
# ones.
#
while read -r path; do
    relpath="${path#"$UPLOADS"/}"

    [[ $VERBOSE -eq 1 ]] && echo "Found $path"

    _DEBUG "Path:          $path"
    _DEBUG "Relative path: $relpath"
    _DEBUG "IA item:       ${relpath:0:7}"

    # Setting the same key twice is harmless so there's no need to test first
    # shellcheck disable=SC2034
    fcache[$relpath]=1
done < <(find "$UPLOADS" -type f -regextype posix-extended \
    -regex ".*$item.*" -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2- -d' ')

#
# Did we find anything?
#
if [[ ${#fcache[@]} -eq 0 ]]; then
    coloured 'red' "No files found for item $item in $UPLOADS"
    coloured 'red' "Can't continue"
    exit 1
fi
 | |
| 
 | |
#
# Ask how many tasks the IA servers still have for this show. Any that are
# outstanding mean we have to stop and let them finish.
#
# TODO: This could be a loop waiting for tasks to complete rather than
# aborting and asking to be rerun.
#
tasks=$(queued_tasks "$item")
if ((tasks > 0)); then
    # Choose 'task' or 'tasks' to suit the number
    task_word=$(ngettext task tasks "$tasks")
    coloured 'red' "Item $item still has $tasks unfinished $task_word"
    coloured 'red' "Allow time for task(s) to finish and try again later"
    exit 1
fi
 | |
| 
 | |
#
# Ask the IA for the list of files held for this item, saving it in the file
# 'TMP1' (one name per line). If the command fails we can't proceed.
#
if ! ia list "$item" > "$TMP1"; then
    coloured 'red' "Item $item is not in the IA"
    coloured 'red' "Can't continue"
    exit 1
fi

#
# Remember every file name the IA reported
#
while read -r iafile; do
    # shellcheck disable=SC2034
    iacache[$iafile]=1
done < "$TMP1"

#
# Any local file that the IA doesn't know about is one we have to upload
#
for path in "${!fcache[@]}"; do
    exists_in iacache "$path" || missed+=("$path")
done
 | |
| 
 | |
#
# Counters and defaults for the loop
#
retry_threshold=5
sleeptime=20
failures=0
upload_count=0

#
# Options given to every 'ia upload' (passed to 'Upload' as a single string)
#
upload_opts='--retries=5 --no-derive -H x-archive-keep-old-version:0'

#
# If there are missed files we can report what we'd be doing or do it,
# otherwise we have nothing to do.
#
if [[ ${#missed[@]} -eq 0 ]]; then
    coloured 'green' "All expected files for item $item are on the IA"
else
    mcount="${#missed[@]}"
    coloured 'red' "There $(ngettext "is 1 missing file" "are $mcount missing files" "$mcount"):"

    [[ $DRYRUN -eq 1 ]] && {
        coloured 'blue' "Dry run: Would have run the following command(s):"
    }

    for file in "${missed[@]}"; do
        #
        # This string is only for display in dry-run mode. The real upload
        # calls 'Upload' directly with properly quoted arguments rather than
        # running the string through 'eval', so file names containing spaces,
        # quotes or other shell metacharacters are handled safely.
        #
        cmd="Upload $item $UPLOADS/$file "
        cmd+="'$file' '$upload_opts'"

        if [[ $DRYRUN -eq 1 ]]; then
            coloured 'yellow' "$cmd"
        else
            retries=0

            coloured 'blue' "Uploading $file"

            #
            # Run the upload. If it succeeds then write to the log and loop
            # for the next missing file. If it fails enter the 'until' loop
            # and report the problem. Count the number of times this is done,
            # so it doesn't loop forever. If we have reached the limit count
            # this as a failure and continue the parent loop (with the next
            # missing file). If we haven't retried enough yet, sleep for
            # a while and try again. The intention is to catch the case when
            # an upload times out. The 'ia' command is performing its own
            # retries per upload when the system is overloaded, but these are
            # non-fatal.
            #
            until Upload "$item" "$UPLOADS/$file" "$file" "$upload_opts"; do
                coloured 'red' "Failure when uploading $file"
                ((retries++))

                printf '%s Failed to upload %s to the IA [%d]\n' \
                    "$(date +%Y%m%d%H%M%S)" "$file" "$retries" >> "$LOGFILE"

                if [[ $retries -eq $retry_threshold ]]; then
                    ((failures++))
                    [[ $VERBOSE -eq 1 ]] && \
                        coloured 'blue' "Retry limit reached; abandoning this file"
                    # Give up on this file; move to the next missing one
                    continue 2
                fi

                [[ $VERBOSE -eq 1 ]] && coloured 'blue' "Pausing for $sleeptime and retrying"
                sleep "$sleeptime"
            done # until Upload ...

            coloured 'green' "Uploaded $file to the IA"
            echo "$(date +%Y%m%d%H%M%S) Uploaded $file to the IA" >> "$LOGFILE"
        fi

        #
        # Count actual uploads and dry-run ones the same
        #
        ((upload_count++))

        #
        # Stop the missed file loop if we have reached the limiting number, in
        # dry-run and live mode
        #
        [[ $upload_count -eq $LIMIT ]] && {
            coloured 'blue' "Upload limit ($LIMIT) reached"
            break
        }

    done # for file in ...

fi

#
# Summarise how many upload failures were detected
#
if [[ $failures -gt 0 ]]; then
    coloured 'red' \
        "There $(ngettext "was $failures upload failure" "were $failures upload failures" "$failures")"
    coloured 'yellow' 'Run this script again to repeat the repair attempt'
fi
 | |
| 
 | |
| # vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker
 |