hpr-tools/InternetArchive/tidy_uploaded
Dave Morriss 0f1e727487 New 'reformat_html', plus some cleaning
InternetArchive/future_upload: now updates the state of shows

InternetArchive/reformat_html: new Perl script to reformat the HTML
    originally found in the HPR database in the 'notes' field to the format
    required in the 'description' field of an item on the IA. It reads
    from STDIN and writes to STDOUT.
2025-02-13 11:24:27 +00:00

495 lines
15 KiB
Bash
Executable File

#!/bin/bash -
#===============================================================================
#
# FILE: tidy_uploaded
#
# USAGE: ./tidy_uploaded [-h] [-v] [-c COUNT] [-d {0|1}] [-D] [-F]
#
# DESCRIPTION: Relocates HPR audio and other show-related files on 'borg'
# after their shows have been uploaded to the Internet Archive.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.11
# CREATED: 2022-03-30 17:38:01
# REVISION: 2024-07-29 18:24:26
#
#===============================================================================
set -u      # Referencing an unset variable is a fatal error

VERSION="0.0.11"
SCRIPT="${0##*/}"       # Script name with any leading directories removed
# DIR=${0%/*}
STDOUT="/dev/fd/2"      # Where _usage writes its text (file descriptor 2)

#
# The shared function library must exist before it can be sourced
#
LIB="$HOME/bin/function_lib.sh"
if [[ ! -e $LIB ]]; then
    echo "Unable to source functions"
    exit 1
fi
# shellcheck disable=SC1090
. "$LIB"

#
# Create the scratch file and arrange for it to be removed on exit or on the
# usual terminating signals ('cleanup_temp' is not defined in this file;
# presumably it comes from the function library)
#
if ! TMP1=$(mktemp); then
    echo "$SCRIPT: creation of temporary file failed!"
    exit 1
fi
trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT

#
# Choose the directory layout according to the host we are running on; any
# other host is refused
#
if [[ $HOSTNAME == borg ]]; then
    BASEDIR="$HOME/InternetArchive"
    UPLOADS="/data/IA/uploads"
    ARCHIVE="/data/IA/done"
elif [[ $HOSTNAME == i7-desktop ]]; then
    BASEDIR="$HOME/HPR/InternetArchive"
    UPLOADS="$HOME/HPR/IA/uploads"
    ARCHIVE="$HOME/HPR/IA/done"
else
    echo "Wrong host!"
    exit 1
fi
# {{{ -- Functions -- exists_in, queued_tasks, movefile, is_empty, _log, _usage
#=== FUNCTION ================================================================
# NAME: exists_in
# DESCRIPTION: Checks the existence of a key in an associative array. The
# array is reached through a name reference rather than 'eval',
# so neither the array name nor the key is ever re-parsed as
# shell code.
# PARAMETERS: $1 array name
# $2 key value
# RETURNS: True if the key exists, False otherwise (including when $1 is
# not a valid variable name)
#
# Modified from
# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
#===============================================================================
exists_in () {
    # Only a plain identifier is acceptable as an array name; anything else
    # cannot name an array and is reported as "key not present"
    [[ ${1-} =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 1

    # The nameref has a deliberately unusual name so that it does not collide
    # with (and therefore shadow) the caller's array
    local -n __exists_in_ref="$1"

    # ${array[key]+word} expands to 'word' only when the element is set, even
    # if its value is the empty string
    [[ -n ${__exists_in_ref[$2]+set} ]]
}
#=== FUNCTION ================================================================
# NAME: queued_tasks
# DESCRIPTION: Asks the IA (via 'ia tasks') for the tasks belonging to an
# item and uses 'jq' to count those whose category is
# "catalog". The count is written to STDOUT for the caller to
# capture.
# PARAMETERS: $1 IA item (like hpr1192)
# RETURNS: The status of the final write (the count itself is on STDOUT)
#===============================================================================
queued_tasks () {
    local item="${1:?Usage: queued_tasks item}"
    local filter='[.[] | if .category == "catalog" then .status else empty end] | length'

    # Integer attribute: an empty result from the pipeline is stored as 0
    local -i ntasks=0

    ntasks=$(ia tasks "$item" | jq -s "$filter")

    echo "$ntasks"
}
#=== FUNCTION ================================================================
# NAME: movefile
# DESCRIPTION: Moves a file to a new place, catering for any directories in
# the path. Any missing sub-directory is created below the
# destination directory before the move.
# PARAMETERS: $1 directory to move from
# $2 directory to move to
# $3 file (or sub-path to move)
# $4 [optional] 1 to overwrite an existing destination
# file (FORCE mode), anything else (default 0) to
# leave it alone
# RETURNS: True if a move was done, otherwise False (the destination
# exists and FORCE is off, or 'mkdir'/'mv' failed)
#===============================================================================
movefile () {
    local fromdir="${1:?Usage: movefile fromdir todir path [FORCE]}"
    local todir="${2:?Usage: movefile fromdir todir path [FORCE]}"
    local path="${3:?Usage: movefile fromdir todir path [FORCE]}"
    local FORCE="${4:-0}"

    #
    # Isolate the directory part of the path, if any. Testing for a '/' is
    # used rather than comparing directory and file names because a path like
    # 'same/same' has equal components yet still contains a directory.
    #
    local dir=''
    [[ $path == */* ]] && dir="${path%/*}"

    #
    # If we have a directory in the path make sure it exists in the 'to'
    # directory (not in the current working directory) and create it if not
    #
    if [[ -n $dir && ! -d $todir/$dir ]]; then
        if ! mkdir -p -- "$todir/$dir"; then
            echo "Failed to create directory: $todir/$dir" >&2
            return 1
        fi
    fi

    #
    # Does the file exist already?
    # TODO: Compare the two files?
    #
    if [[ -e $todir/$path ]]; then
        if [[ $FORCE -ne 1 ]]; then
            echo "File already exists: $todir/$path"
            return 1
        fi
        echo "File exists: $todir/$path"
        echo "FORCE mode is ON so overwriting"
        if ! mv --force -- "$fromdir/$path" "$todir/$path"; then
            echo "Failed to move $fromdir/$path" >&2
            return 1
        fi
    else
        if ! mv -- "$fromdir/$path" "$todir/$path"; then
            echo "Failed to move $fromdir/$path" >&2
            return 1
        fi
    fi

    #
    # Only reached when 'mv' succeeded, so the report is always truthful
    #
    echo "Moved $fromdir/$path"
    return 0
}
#=== FUNCTION ================================================================
# NAME: is_empty
# DESCRIPTION: Reports whether a directory tree contains no regular files.
# Sub-directories do not count, so a tree holding only empty
# directories is regarded as empty.
# PARAMETERS: $1 Directory to test
# RETURNS: True if no regular file was found, otherwise false
#===============================================================================
is_empty() {
    local found

    # 'find' prints a single X for the first regular file it meets and then
    # stops, so the result is either empty or exactly one character
    found="$(find "$1" -mindepth 1 -type f -printf X -quit)"

    [[ -z $found ]]
}
#=== FUNCTION ================================================================
# NAME: _log
# DESCRIPTION: Appends one record to the file named by $LOGFILE. The record
# is laid out by $LOGREC, a 'printf' format taking a time value
# and the message; when $LOGREC is not set a default format is
# used for this call only. The time value passed to 'printf' is
# -1, which makes '%(fmt)T' render the current date and time.
# The script exits with status 1 if $LOGFILE is not set.
# PARAMETERS: 1 - the message to write
# RETURNS: The status of the 'printf' that writes the record
#===============================================================================
# shellcheck disable=SC2317 disable=SC2059
_log () {
    local msg="$1"

    # Without a destination there is nothing sensible to do
    if [[ ! -v LOGFILE ]]; then
        echo "${FUNCNAME[0]}: \$LOGFILE is not defined"
        exit 1
    fi

    # Fall back to a function-local template when none has been provided
    if [[ ! -v LOGREC ]]; then
        local LOGREC='%(%F %T)T %s\n'
    fi

    printf "$LOGREC" -1 "$msg" >> "$LOGFILE"
}
#=== FUNCTION ================================================================
# NAME: _usage
# DESCRIPTION: Report usage. The text is written to the file named by
# $STDOUT (set to /dev/fd/2 by this script) and includes the
# values of $SCRIPT, $VERSION, $UPLOADS and $ARCHIVE. The
# function then exits the script.
# PARAMETERS: 1 [optional] exit value (default 0)
# RETURNS: Nothing (does not return; exits with the given value)
#===============================================================================
_usage () {
    local -i res="${1:-0}"

    # '<<-' strips leading TAB characters only, so the text below is kept at
    # the left margin to be emitted exactly as written
    cat >"$STDOUT" <<-endusage
${SCRIPT} - version: ${VERSION}
Usage: ./${SCRIPT} [-h] [-v] [-c COUNT] [-d {0|1}] [-D] [-F]
Moves HPR audio and other show-related files on 'borg' after their shows
have been uploaded to the Internet Archive. Files to be uploaded are in the
directory ${UPLOADS} and they are moved to the directory ${ARCHIVE}.
Options:
-h Print this help
-v Run in verbose mode where more information is reported
-d 0|1 Dry run: -d 1 (the default) runs the script in dry-run
mode where nothing is moved but the actions that
will be taken are reported; -d 0 turns off dry-run
mode and the actions will be carried out.
-c COUNT Count of shows to process. If omitted or zero then all
shows will be processed, otherwise this is the number
to stop at.
-D Run in debug mode where a lot more information is
reported
-F Turn on FORCE mode (normally off). In this mode when
the files being tidied (moved) already exist, they are
overwritten. This is for the very rare case when
a show's audio has to be re-uploaded because of bad
audio or the wrong file being sent.
Examples
./tidy_uploaded # Run in (default) dry-run mode
./tidy_uploaded -v # Dry-run mode with verbose messages
./tidy_uploaded -d0 # Live mode (without verbose messages)
./tidy_uploaded -c1 # Process 1 show in dry-run mode
./tidy_uploaded -D # Run with debugging enabled
./tidy_uploaded -F # Run with FORCE mode on
endusage
    exit "$res"
}
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Directories and files
#
LOGS="$BASEDIR/logs"
LOGFILE="$LOGS/$SCRIPT.log"
LOGREC='%(%F %T)T %s\n'

#
# Process options. The leading ':' in the option string selects silent error
# reporting: a missing option argument arrives as ':' and an unrecognised
# option as '?', both with the offending letter in $OPTARG.
#
while getopts :c:d:DFhv opt; do
    case "${opt}" in
        c) COUNT=$OPTARG ;;
        D) DEBUG=1 ;;
        d) DRYRUN=$OPTARG ;;
        F) FORCE=1 ;;
        h) _usage 0 ;;
        v) VERBOSE=1 ;;
        :) echo "** Option -$OPTARG requires an argument"
           _usage 1 ;;
        *) echo "** Unknown option"
           _usage 1 ;;
    esac
done
shift $((OPTIND - 1))

#
# COUNT must be a string of digits. Force base 10 afterwards so that a value
# with leading zeroes (such as 08) is not taken as octal by later arithmetic
# comparisons.
#
COUNT=${COUNT:-0}
if [[ ! $COUNT =~ ^[0-9]+$ ]]; then
    echo "** Use a numeric argument with -c"
    _usage 1
fi
COUNT=$((10#$COUNT))

#
# DRYRUN must be literally 0 or 1. It is matched as a string because an
# arithmetic comparison would evaluate arbitrary text as an expression.
#
DRYRUN=${DRYRUN:-1}
if [[ ! $DRYRUN =~ ^[01]$ ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $DRYRUN -eq 1 ]] && echo "Dry run mode"

FORCE=${FORCE:-0}
[[ $FORCE -eq 1 ]] && echo "Force mode - overwriting existing files"

VERBOSE=${VERBOSE:-0}

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && echo "Debug mode"

#
# Should have no arguments
#
if [[ $# != 0 ]]; then
    echo "** ${SCRIPT} takes no arguments"
    _usage 1
fi
#
# Declarations
#
declare -A seen     # IA items already dealt with, keyed by item name
declare -a dirs     # Items that have a directory of their own under $UPLOADS
# lastitem=
ind=0               # Number of items processed so far

#
# Scan the directory 'UPLOADS' where files for upload to the IA are stored.
#
# See the `find' pipeline at the end of the loop which outputs the last change
# time and the full file path, sorts on the time, then removes it. This
# ensures we process the files in time order rather than alphabetic order of
# their names. Everything after the first space is kept by 'cut' so that
# a path which itself contains spaces is not truncated.
#
while IFS= read -r path; do
    #
    # Extract the path relative to $UPLOADS and the IA item name from the
    # returned path. Here $relpath will be the filename or a sub-directory and
    # filename, and $item will be its first seven characters, expected to be
    # an IA identifier like 'hpr1192'.
    #
    relpath="${path#"$UPLOADS"/}"
    item="${relpath:0:7}"

    [[ $VERBOSE -eq 1 ]] && echo "Found $path"
    _DEBUG "Path: $path"
    _DEBUG "Relative path: $relpath"
    _DEBUG "IA item: $item"

    #
    # If we have seen this item before we don't need to process it again;
    # ignore all but the first file belonging to an IA identifier
    #
    if exists_in seen "$item"; then
        _DEBUG "Skipped $path - repeated show number"
        continue
    fi

    #
    # Never seen before, so process it
    #
    # shellcheck disable=SC2034
    seen[$item]=1

    #
    # Count this item and stop the loop if we've reached the requested
    # count. We want the value of $ind to be the number of shows
    # processed, so adjust it if we stopped after incrementing it.
    #
    ((ind++))
    if [[ $COUNT -gt 0 ]]; then
        if [[ $ind -gt $COUNT ]]; then
            ((ind--))
            break
        fi
        echo "[ Show #$ind ]"
    fi

    #
    # Look to see if there are any tasks queued for this show. If there
    # are we'll skip it just now.
    #
    tasks=$(queued_tasks "$item")
    if [[ $tasks -gt 0 ]]; then
        echo "** Item $item still has $tasks unfinished" \
            "$(ngettext task tasks "$tasks")"
        echo "** Skipping to the next item"
        continue
    fi

    [[ $VERBOSE -eq 1 ]] && echo "Checking IA for $item"

    #
    # Interrogate the IA for the item we're working on. If it returns True
    # we can proceed with tidying. The file 'TMP1' contains just a simple
    # list of the files on the IA relating to this item.
    #
    if ! ia list "$item" > "$TMP1"; then
        printf 'Skipping %s; not in the IA\n' "$item"
        continue
    fi

    #
    # Save any directory associated with this item. This means that
    # directories with names that don't conform to the "^hpr[0-9]{4}"
    # pattern will be ignored, but this is *not* expected to happen.
    # Note that directories without corresponding audio will not be
    # cleaned up by this method, but again this is not expected to
    # happen.
    # TODO: be alert to such issues!
    #
    dirpath="$UPLOADS/$item"
    if [[ -d "$dirpath" ]]; then
        echo "Storing directory: $item"
        dirs+=("$item")
    fi

    moves=0

    #
    # Scan the returned list to see if any files we have are online.
    # Move to the ARCHIVE directory when there's a match.
    #
    while read -r file; do
        #
        # An empty line would make "$UPLOADS/" look like an existing file
        # and 'movefile' would then abort the script for want of a path
        #
        [[ -n $file ]] || continue

        frompath="$UPLOADS/$file"
        topath="$ARCHIVE/$file"
        [[ -e "$frompath" ]] || continue

        #
        # A file on the IA exists in the upload area. Move the local one
        # if we're not in dry-run mode, otherwise just report the move we
        # would do. If FORCE mode is on overwrite the file.
        #
        if [[ $DRYRUN -eq 0 ]]; then
            movefile "$UPLOADS" "$ARCHIVE" "$file" "$FORCE" && ((moves++))
        elif [[ $FORCE -eq 0 ]]; then
            printf 'Would move %s\n\tto %s\n' "$frompath" "$topath"
        else
            printf 'Would move %s\n\toverwriting %s\n' "$frompath" "$topath"
        fi
    done < "$TMP1"

    #
    # Log this item (live mode only)
    #
    [[ $DRYRUN -eq 0 ]] && \
        printf '%s moved %d %s for %s\n' "$(date +%Y%m%d%H%M%S)" \
            "$moves" "$(ngettext file files "$moves")" "$item" >> "$LOGFILE"

done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2- -d' ')
#
# No shows processed? There was nothing to do
#
#
# No shows processed? There was nothing to do, which is not an error, so
# leave with an explicit success status.
# NOTE(review): the message is only shown in live mode - confirm that silence
# in dry-run mode is intended.
#
if [[ $ind -eq 0 ]]; then
    [[ $DRYRUN -eq 0 ]] && echo "Nothing to do"
    exit 0
fi

_DEBUG "Number of shows scanned: $ind"
# _DEBUG "Accumulated directories (${#dirs[*]}): $(printf '/%s/ ' "${dirs[*]}")"

#
# If there are no directories just exit (successfully). The test is true only
# when the array has a first element, so the expansions of "${dirs[@]}" below
# never see an empty array.
#
[[ -v dirs ]] || exit 0

#
# By an (as yet) unknown process we might get duplicates, so remove them here
# by passing the names through the keys of an associative array. The original
# order is not preserved.
#
# mapfile -t dirs < <(printf "%s\n" "${dirs[*]}" | uniq)
declare -A unique
for e in "${dirs[@]}"; do
    unique[$e]=1
done
dirs=( "${!unique[@]}" )
# mapfile -t dirs < <(printf '%s\n' "${!unique[@]}")

#
# "${dirs[@]}" hands each name to 'printf' separately so that every one is
# wrapped in its own '>...<' pair
#
_DEBUG "Directories to process (${#dirs[@]}): $(printf '>%s< ' "${dirs[@]}")"

#
# Clean up any empty directories. These may exist because we moved their
# contents one file at a time. We only deal with the directories we've visited
# though.
#
for dir in "${dirs[@]}"; do
    path="$UPLOADS/$dir"

    #
    # In dry-run mode just report what would be attempted
    #
    if [[ $DRYRUN -ne 0 ]]; then
        echo "Would delete directory $path"
        continue
    fi

    #
    # A directory still holding files is left alone
    #
    if ! is_empty "$path"; then
        echo "Directory is not empty: $path"
        echo "Not deleted!"
        continue
    fi

    #
    # 'is_empty' only looks for files, so empty sub-directories may remain;
    # the recursive delete removes those along with the directory itself
    #
    if rm -rf -- "$path"; then
        echo "Deleted $path"
        echo "$(date +%Y%m%d%H%M%S) deleted empty directory $path" >> "$LOGFILE"
    else
        echo "Failed to delete: $path"
    fi
done

exit 0
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker