hpr-tools/InternetArchive/tidy_uploaded
Dave Morriss 19030fee71 Updates for show "repair" processing
InternetArchive/future_upload: Added logging and debugging

InternetArchive/ia_db.sql: Added new tables

InternetArchive/recover_transcripts: New script to run on 'borg' and
    copy missing files from the backup disk to the IA

InternetArchive/repair_assets: More comments, including one about a bug in the design.

InternetArchive/repair_item: Fix relating to octal numbers (if there are
    leading zeroes in a number). '_DEBUG' is now in the function
    library. Added comments to explain obscure stuff.

InternetArchive/snapshot_metadata: New Bash script (to run on my
    desktop) which collects metadata for a show and stores it in the
    '~/HPR/IA/assets' directory. Runs 'view_derivatives' on it to find
    derivative files for deletion.

InternetArchive/tidy_uploaded: Moves files and directories containing
    uploaded files into a holding area for later backup. Added
    debugging, logging and a 'force' mode.

InternetArchive/upload_manager: Manages 'ia.db' (on my workstation).
    Needs many updates which have just started to be added.

InternetArchive/weekly_upload: Old script, now obsolete.
2024-08-22 13:13:38 +01:00

498 lines
16 KiB
Bash
Executable File

#!/bin/bash -
#===============================================================================
#
# FILE: tidy_uploaded
#
# USAGE: ./tidy_uploaded [-h] [-v] [-c COUNT] [-d {0|1}] [-D] [-F]
#
# DESCRIPTION: Relocates HPR audio and other show-related files on 'borg'
# after their shows have been uploaded to the Internet Archive
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.11
# CREATED: 2022-03-30 17:38:01
# REVISION: 2024-07-29 18:24:26
#
#===============================================================================
# Global initialisation. The order matters: 'nounset' is enabled first, the
# function library is sourced before anything from it is used, and the trap is
# only installed once the temporary file exists.
set -o nounset # Treat unset variables as an error
# Version string; reported by the usage message
VERSION="0.0.11"
# Name of this script with any leading directory stripped from $0
SCRIPT=${0##*/}
# DIR=${0%/*}
# Where the usage message is written. Despite the variable's name this is
# file descriptor 2 (standard error).
STDOUT="/dev/fd/2"
#
# Load library functions. The script cannot run without them, so a missing
# library is fatal.
#
LIB="$HOME/bin/function_lib.sh"
[ -e "$LIB" ] || { echo "Unable to source functions"; exit 1; }
# shellcheck disable=SC1090
source "$LIB"
#
# Make temporary files and set traps to delete them. The trap command is
# single-quoted so $TMP1 is expanded when the trap fires, not when it is set.
# NOTE(review): 'cleanup_temp' is not defined in this file; presumably it is
# provided by the function library sourced above - confirm.
#
TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; }
trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
#
# Configure depending on whether we are running locally or on the VPS. Each
# known host sets:
#   BASEDIR - base directory (the log directory is derived from it later)
#   UPLOADS - directory holding the files that were uploaded to the IA
#   ARCHIVE - holding area the uploaded files are moved into
# Any other host name is rejected and the script exits with status 1.
#
case $HOSTNAME in
borg) BASEDIR="$HOME/InternetArchive"
UPLOADS="/data/IA/uploads"
ARCHIVE="/data/IA/done" ;;
i7-desktop) BASEDIR="$HOME/HPR/InternetArchive"
UPLOADS="$HOME/HPR/IA/uploads"
ARCHIVE="$HOME/HPR/IA/done";;
*) echo "Wrong host!"; exit 1 ;;
esac
# {{{ -- Functions -- exists_in, queued_tasks, movefile, is_empty, _log, _usage
#=== FUNCTION ================================================================
# NAME: exists_in
# DESCRIPTION: Reports whether a key is present in an associative array. The
#              array is passed by name, so the test is assembled as a string
#              and run through 'eval'. The '+' form of parameter expansion
#              yields a non-empty word whenever the element is set, even if
#              it is set to the empty string.
# PARAMETERS: $1 array name
#             $2 key value
# RETURNS: True if the key exists, False otherwise
#
# Modified from
# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
#===============================================================================
exists_in () {
    #
    # Only the array name ($1) is spliced into the string; the key ($2) is
    # left for 'eval' to expand so that it is never re-parsed as code.
    #
    # shellcheck disable=SC2086
    if eval 'test -n "${'$1'[$2]+muahaha}"'; then
        return 0
    fi
    return 1
}
#=== FUNCTION ================================================================
# NAME: queued_tasks
# DESCRIPTION: Asks the IA (through the 'ia tasks' command) for the tasks
#              belonging to an item and uses 'jq' to count those whose
#              category is "catalog". The count is written to STDOUT so the
#              caller can capture it.
# PARAMETERS: $1 IA item (like hpr1192)
# RETURNS: Nothing
#===============================================================================
queued_tasks () {
    local ident="${1:?Usage: queued_tasks item}"
    local -i pending=0
    #
    # Slurp every task record, keep one entry per "catalog" task and report
    # how many there are
    #
    local filter='[.[] | if .category == "catalog" then .status else empty end] | length'

    pending="$(ia tasks "$ident" | jq -s "$filter")"

    printf '%s\n' "$pending"
    return
}
#=== FUNCTION ================================================================
# NAME: movefile
# DESCRIPTION: Moves a file to a new place, catering for any directories in
#              the path. Any directory part of the path is created below the
#              'to' directory before the move. An existing destination file
#              is only overwritten in FORCE mode.
# PARAMETERS: $1 directory to move from
#             $2 directory to move to
#             $3 file (or sub-path) to move, relative to both directories
#             $4 [optional] FORCE flag: 1 overwrites an existing destination
#                file; the default (0) leaves it alone
# RETURNS: True if a move was done, otherwise False (destination exists and
#          FORCE is off, the directory could not be created, or 'mv' failed)
#===============================================================================
movefile () {
    local fromdir="${1:?Usage: movefile fromdir todir path [FORCE]}"
    local todir="${2:?Usage: movefile fromdir todir path [FORCE]}"
    local path="${3:?Usage: movefile fromdir todir path [FORCE]}"
    local FORCE="${4:-0}"

    #
    # Isolate the directory part of the path. When there is no '/' the
    # expansion returns the path unchanged, meaning there is no directory
    # part. Compare with the whole path (not the file name) so that a path
    # like 'x/x' keeps its directory.
    #
    local dir="${path%/*}"
    [[ $dir = "$path" ]] && dir=''

    #
    # Make sure the directory exists below the 'to' directory. 'mkdir -p' is
    # a no-op if it is already there.
    #
    if [[ -n $dir ]]; then
        if ! mkdir -p -- "$todir/$dir"; then
            echo "Failed to create directory: $todir/$dir" >&2
            return 1
        fi
    fi

    #
    # Does the file exist already?
    # TODO: Compare the two files?
    #
    if [[ -e $todir/$path ]]; then
        if [[ $FORCE -ne 1 ]]; then
            echo "File already exists: $todir/$path"
            return 1
        fi

        echo "File exists: $todir/$path"
        echo "FORCE mode is ON so overwriting"
        if ! mv --force -- "$fromdir/$path" "$todir/$path"; then
            echo "Failed to move $fromdir/$path" >&2
            return 1
        fi
    else
        if ! mv -- "$fromdir/$path" "$todir/$path"; then
            echo "Failed to move $fromdir/$path" >&2
            return 1
        fi
    fi

    #
    # Only report (and return) success when 'mv' really worked
    #
    echo "Moved $fromdir/$path"
    return 0
}
#=== FUNCTION ================================================================
# NAME: is_empty
# DESCRIPTION: Checks whether a directory tree contains any regular files.
#              Sub-directories do not count, so a directory holding only
#              (file-less) directories is regarded as empty.
# PARAMETERS: $1 Directory to test
# RETURNS: True if empty (of files), otherwise false
#===============================================================================
is_empty() {
    local hit
    #
    # 'find' prints a single 'X' and stops at the first regular file it
    # meets, so $hit is non-empty exactly when such a file exists
    #
    hit="$(find "$1" -mindepth 1 -type f -printf X -quit)"
    [[ -z $hit ]]
}
#=== FUNCTION ================================================================
# NAME: _log
# DESCRIPTION: Appends one record to the file named by $LOGFILE. The record
#              layout is the 'printf' template in $LOGREC; if that variable
#              is not defined a default of timestamp plus message is used.
#              The '-1' passed as the first 'printf' argument makes the
#              '%(fmt)T' directive use the current date and time.
#              'shellcheck' objects to this function, hence the directives
#              placed before it.
# PARAMETERS: 1 - the message to write
# RETURNS: Nothing (exits the script with 1 if $LOGFILE is not defined)
#===============================================================================
# shellcheck disable=SC2317 disable=SC2059
_log () {
    local text="$1"

    #
    # Without a log file there is nowhere to write, which is fatal
    #
    if [[ ! -v LOGFILE ]]; then
        echo "${FUNCNAME[0]}: \$LOGFILE is not defined"
        exit 1
    fi

    #
    # Fall back to a default template, local to this call
    #
    if [[ ! -v LOGREC ]]; then
        local LOGREC='%(%F %T)T %s\n'
    fi

    printf "$LOGREC" -1 "$text" >> "$LOGFILE"
    return
}
#=== FUNCTION ================================================================
# NAME: _usage
# DESCRIPTION: Report usage. The text is written to the file named by the
#              global $STDOUT and includes the globals $SCRIPT, $VERSION,
#              $UPLOADS and $ARCHIVE.
# PARAMETERS: 1 [optional] exit value (default 0)
# RETURNS: Nothing; the function does not return because it exits the script
#          with the requested value
#===============================================================================
_usage () {
    local -i res="${1:-0}"

    #
    # The here-document text starts in column 0 because '<<-' only strips
    # leading TAB characters, not spaces.
    #
    cat >"$STDOUT" <<-endusage
${SCRIPT} - version: ${VERSION}
Usage: ./${SCRIPT} [-h] [-v] [-c COUNT] [-d {0|1}] [-D] [-F]
Moves HPR audio and other show-related files on 'borg' after their shows
have been uploaded to the Internet Archive. Files to be uploaded are in the
directory ${UPLOADS} and they are moved to the directory ${ARCHIVE}.
Options:
-h Print this help
-v Run in verbose mode where more information is reported
-d 0|1 Dry run: -d 1 (the default) runs the script in dry-run
mode where nothing is moved but the actions that
will be taken are reported; -d 0 turns off dry-run
mode and the actions will be carried out.
-c COUNT Count of shows to process. If omitted or zero then all
shows will be processed, otherwise this is the number
to stop at.
-D Run in debug mode where a lot more information is
reported
-F Turn on FORCE mode (normally off). In this mode when
the files being tidied (moved) already exist, they are
overwritten. This is for the very rare case when
a show's audio has to be re-uploaded because of bad
audio or the wrong file being sent.
Examples
./tidy_uploaded # Run in (default) dry-run mode
./tidy_uploaded -v # Dry-run mode with verbose messages
./tidy_uploaded -d0 # Live mode (without verbose messages)
./tidy_uploaded -c1 # Process 1 show in dry-run mode
./tidy_uploaded -D # Run with debugging enabled
./tidy_uploaded -F # Run with FORCE mode on
endusage
    exit "$res"
}
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Directories and files
#
#
# Log directory, the log file for this script and the 'printf' template used
# for log records ('%(fmt)T' is the timestamp, '%s' the message)
#
LOGS="$BASEDIR/logs"
LOGFILE="$LOGS/$SCRIPT.log"
LOGREC='%(%F %T)T %s\n'

#
# Process options. The leading ':' in the option string selects silent error
# reporting: a missing option argument arrives as ':' and an unknown option
# as '?', in both cases with the offending letter in $OPTARG.
#
while getopts :c:d:DFhv opt
do
    case "${opt}" in
        c) COUNT=$OPTARG;;
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        F) FORCE=1;;
        h) _usage 0;;
        v) VERBOSE=1;;
        :) echo "** Option -$OPTARG requires an argument"
           _usage 1;;
        *) echo "** Unknown option"
           _usage 1;;
    esac
done
shift $((OPTIND - 1))

#
# COUNT must be a non-negative integer. Force base 10 once validated so that
# a value with leading zeroes (such as '08') is not later rejected as an
# invalid octal number in arithmetic comparisons.
#
COUNT=${COUNT:-0}
if [[ ! $COUNT =~ ^[0-9]+$ ]]; then
    echo "** Use a numeric argument with -c"
    _usage 1
fi
COUNT=$((10#$COUNT))

#
# DRYRUN must be exactly 0 or 1. A pattern match is used rather than '-ne'
# because an arithmetic comparison of a non-numeric value would abort the
# script (unset variable) instead of showing the usage message.
#
DRYRUN=${DRYRUN:-1}
if [[ ! $DRYRUN =~ ^[01]$ ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $DRYRUN -eq 1 ]] && echo "Dry run mode"

FORCE=${FORCE:-0}
[[ $FORCE -eq 1 ]] && echo "Force mode - overwriting existing files"

VERBOSE=${VERBOSE:-0}

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && echo "Debug mode"

#
# Should have no arguments
#
if [[ $# != 0 ]]; then
    echo "** ${SCRIPT} takes no arguments"
    _usage 1
fi
#
# Declarations
#
#
# 'seen' records the IA identifiers already handled, 'dirs' collects the show
# directories found for later clean-up and 'ind' counts the shows processed
#
declare -A seen
declare -a dirs
ind=0

#
# Scan the directory 'UPLOADS' where files for upload to the IA are stored.
#
# See the `find' pipeline at the end of the loop which outputs the last change
# time and the full file path, sorts on the time, then removes it. This
# ensures we process the files in time order rather than alphabetic order of
# their names.
#
# 'IFS=' stops 'read' trimming leading or trailing white space from names.
#
while IFS= read -r path; do
    #
    # Extract the path relative to $UPLOADS and the IA item name from the
    # returned path. Here $relpath will be the filename or a sub-directory and
    # filename, and $item will be the IA identifier like 'hpr1192'.
    #
    relpath="${path#"$UPLOADS"/}"
    item="${relpath:0:7}"

    [[ $VERBOSE -eq 1 ]] && echo "Found $path"
    _DEBUG "Path: $path"
    _DEBUG "Relative path: $relpath"
    _DEBUG "IA item: $item"

    #
    # If we have seen this item before we don't need to process it, so just
    # skip this loop iteration. Otherwise it has never been seen before, so
    # process it.
    #
    if ! exists_in seen "$item"; then
        # shellcheck disable=SC2034
        seen[$item]=1

        #
        # Count this item and stop the loop if we've reached the requested
        # count. We want the value of $ind to be the number of shows
        # processed, so adjust it if we stopped after incrementing it.
        #
        ((ind++))
        if [[ $COUNT -gt 0 ]]; then
            if [[ $ind -gt $COUNT ]]; then
                ((ind--))
                break
            fi
            echo "[ Show #$ind ]"
        fi

        #
        # Look to see if there are any tasks queued for this show. If there
        # are we'll skip it just now.
        #
        tasks=$(queued_tasks "$item")
        if [[ $tasks -gt 0 ]]; then
            echo "** Item $item still has $tasks unfinished " \
                "$(ngettext task tasks "$tasks")"
            echo "** Skipping to the next item"
            continue
        fi

        [[ $VERBOSE -eq 1 ]] && echo "Checking IA for $item"

        #
        # Interrogate the IA for the item we're working on. If it returns True
        # we can proceed with tidying. The file 'TMP1' contains just a simple
        # list of the files on the IA relating to this item.
        #
        if ia list "$item" > "$TMP1"; then
            #
            # Save any directory associated with this item. This means that
            # directories with names that don't conform to the "^hpr[0-9]{4}"
            # pattern will be ignored, but this is *not* expected to happen.
            # Note that directories without corresponding audio will not be
            # cleaned up by this method, but again this is not expected to
            # happen.
            # TODO: be alert to such issues!
            #
            dirpath="$UPLOADS/$item"
            if [[ -d "$dirpath" ]]; then
                echo "Storing directory: $item"
                dirs+=("$item")
            fi

            moves=0

            #
            # Scan the returned list to see if any files we have are online.
            # Move to the ARCHIVE directory when there's a match.
            #
            while IFS= read -r file; do
                #
                # A blank line would make $frompath the upload directory
                # itself (which exists) and 'movefile' would then be called
                # with an empty path, so ignore it.
                #
                [[ -n $file ]] || continue

                frompath="$UPLOADS/$file"
                topath="$ARCHIVE/$file"

                if [[ -e "$frompath" ]]; then
                    #
                    # A file on the IA exists in the upload area. Move the
                    # local one if we're not in dry-run mode, otherwise just
                    # report the move we would do. If FORCE mode is on
                    # overwrite the file.
                    #
                    if [[ $DRYRUN -eq 0 ]]; then
                        movefile "$UPLOADS" "$ARCHIVE" "$file" "$FORCE" && ((moves++))
                    else
                        if [[ $FORCE -eq 0 ]]; then
                            printf 'Would move %s\n\tto %s\n' "$frompath" "$topath"
                        else
                            printf 'Would move %s\n\toverwriting %s\n' "$frompath" "$topath"
                        fi
                    fi
                fi
            done < "$TMP1"

            #
            # Log this item
            #
            [[ $DRYRUN -eq 0 ]] && \
                printf '%s moved %d %s for %s\n' "$(date +%Y%m%d%H%M%S)" \
                    "$moves" "$(ngettext file files "$moves")" "$item" >> "$LOGFILE"
        else
            printf 'Skipping %s; not in the IA\n' "$item"
        fi
    else
        #
        # Ignore all but the first file belonging to an IA identifier
        #
        _DEBUG "Skipped $path - repeated show number"
        continue
    fi
    #
    # The time stamp contains no spaces, so everything after the first space
    # is the path; '-f2-' keeps all of it even when the path itself contains
    # spaces.
    #
done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' \
    -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2- -d' ')
# Old 'find' used:
# done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort)
#
# No shows processed? There was nothing to do
#
#
# No shows processed? There was nothing to do, which is not an error, so exit
# with an explicit zero status (a bare 'exit' would pass on the status of the
# preceding test).
# NOTE(review): the message is only shown in live mode; confirm that saying
# nothing in dry-run mode is intended.
#
if [[ $ind -eq 0 ]]; then
    [[ $DRYRUN -eq 0 ]] && echo "Nothing to do"
    exit 0
fi

_DEBUG "Number of shows scanned: $ind"

#
# If there are no directories just exit. This is a normal outcome so the exit
# status is zero. The test also protects the array expansions below, which
# would otherwise refer to an array with no elements.
#
[[ -v dirs ]] || exit 0

#
# By an (as yet) unknown process we might get duplicates, so remove them here
# by passing the names through the keys of an associative array.
#
declare -A unique
for e in "${dirs[@]}"; do unique[$e]=1; done
dirs=( "${!unique[@]}" )

#
# Expand with [@] so the 'printf' format is applied to each directory in turn
#
_DEBUG "Directories to process (${#dirs[*]}): $(printf '>%s< ' "${dirs[@]}")"

#
# Clean up any empty directories. These may exist because we moved their
# contents one file at a time. We only deal with the directories we've visited
# though.
#
for dir in "${dirs[@]}"; do
    path="$UPLOADS/$dir"

    if [[ $DRYRUN -eq 0 ]]; then
        if is_empty "$path"; then
            #
            # No regular files are left below the directory, so remove it
            # and record the deletion in the log
            #
            rm -rf "$path"
            RES=$?
            if [[ $RES -eq 0 ]]; then
                echo "Deleted $path"
                echo "$(date +%Y%m%d%H%M%S) deleted empty directory $path" >> "$LOGFILE"
            else
                echo "Failed to delete: $path"
            fi
        else
            echo "Directory is not empty: $path"
            echo "Not deleted!"
        fi
    else
        echo "Would delete directory $path"
    fi
done

exit 0
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker