hpr-tools/InternetArchive/tidy_uploaded

#!/bin/bash -
#===============================================================================
#
#         FILE: tidy_uploaded
#
#        USAGE: ./tidy_uploaded [-h] [-v] [-d {0|1}] [-c COUNT] [-D] [-F]
#
#  DESCRIPTION: Relocates HPR audio and other show-related files on 'borg'
#               after their shows have been uploaded to the Internet Archive
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.11
#      CREATED: 2022-03-30 17:38:01
#     REVISION: 2024-07-29 18:24:26
#
#===============================================================================

set -o nounset                              # Treat unset variables as an error

VERSION="0.0.11"

# Name of this script without any leading path
SCRIPT=${0##*/}
# DIR=${0%/*}

# Where _usage sends its text (file descriptor 2)
STDOUT="/dev/fd/2"

#
# Load library functions; give up if the library file is absent
#
LIB="$HOME/bin/function_lib.sh"
if [[ ! -e $LIB ]]; then
    echo "Unable to source functions"
    exit 1
fi
# shellcheck disable=SC1090
source "$LIB"

#
# Make a temporary file and arrange for 'cleanup_temp' to be called with its
# name on exit or on receipt of the listed signals
#
if ! TMP1=$(mktemp); then
    echo "$SCRIPT: creation of temporary file failed!"
    exit 1
fi
trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT

#
# Choose the directory layout according to the host we are running on. Any
# other host is rejected.
#
if [[ $HOSTNAME == borg ]]; then
    BASEDIR="$HOME/InternetArchive"
    UPLOADS="/data/IA/uploads"
    ARCHIVE="/data/IA/done"
elif [[ $HOSTNAME == i7-desktop ]]; then
    BASEDIR="$HOME/HPR/InternetArchive"
    UPLOADS="$HOME/HPR/IA/uploads"
    ARCHIVE="$HOME/HPR/IA/done"
else
    echo "Wrong host!"
    exit 1
fi

# {{{ -- Functions -- exists_in, queued_tasks, movefile, is_empty, _log, _usage

#===  FUNCTION  ================================================================
#         NAME: exists_in
#  DESCRIPTION: Checks the existence of a key in an associative array. A key
#               whose value is the empty string still counts as existing.
#               The array name is spliced into an 'eval', so it is first
#               checked to be a plain shell identifier; anything else is
#               rejected without being evaluated. The key is only expanded
#               inside the 'eval', so it is never parsed as code.
#   PARAMETERS: $1      array name
#               $2      key value
#      RETURNS: True if the key exists, False otherwise (including when $1 is
#               not a valid variable name)
#
# Modified from
# https://stackoverflow.com/questions/13219634/easiest-way-to-check-for-an-index-or-a-key-in-an-array
#===============================================================================
exists_in () {
    #
    # No local variables are used here on purpose: a local could shadow the
    # caller's array if it happened to have the same name.
    #
    [[ ${1-} =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 1

    #
    # After the outer expansion this reads: [[ -n ${NAME[$2]+set} ]]
    # '${var+word}' yields 'word' only when the element is set.
    #
    eval "[[ -n \${$1[\$2]+set} ]]"
}

#===  FUNCTION  ================================================================
#         NAME: queued_tasks
#  DESCRIPTION: Runs 'ia tasks' for an item and pipes the result through 'jq',
#               which counts the entries whose category is "catalog". The
#               count is written to STDOUT so the caller can capture it.
#               Because the count is held in an integer variable, empty
#               output from the pipeline is reported as 0.
#   PARAMETERS: $1      IA item (like hpr1192)
#      RETURNS: Nothing
#===============================================================================
queued_tasks () {
    local ia_item="${1:?Usage: queued_tasks item}"
    local -i pending=0
    local jq_filter='[.[] | if .category == "catalog" then .status else empty end] | length'

    pending="$(ia tasks "$ia_item" | jq -s "$jq_filter")"

    printf '%s\n' "$pending"
}

#===  FUNCTION  ================================================================
#         NAME: movefile
#  DESCRIPTION: Moves a file to a new place, catering for any directories in
#               the path. Any sub-directory in the path is created under the
#               'to' directory if it is not already there. An existing
#               destination file is only overwritten in FORCE mode.
#               Progress messages are written to STDOUT.
#   PARAMETERS: $1      directory to move from
#               $2      directory to move to
#               $3      file (or sub-path) to move, relative to $1
#               $4      [optional] FORCE flag; 1 means overwrite an existing
#                       destination file, default 0
#      RETURNS: True if a move was done, otherwise False (destination exists
#               and FORCE is off, the sub-directory could not be created, or
#               the 'mv' itself failed)
#===============================================================================
movefile () {
    local fromdir="${1:?Usage: movefile fromdir todir path [FORCE]}"
    local todir="${2:?Usage: movefile fromdir todir path [FORCE]}"
    local path="${3:?Usage: movefile fromdir todir path [FORCE]}"
    local FORCE="${4:-0}"

    #
    # Find the directory part of the path, if any. Only a path containing
    # a '/' has one; testing for the slash (rather than comparing the
    # directory and file names) also handles a path like 'x/x' correctly.
    #
    local dir=''
    [[ $path == */* ]] && dir="${path%/*}"

    #
    # If we have a directory in the path check it exists in the 'to' directory
    # (not in the current directory) and create it if not
    #
    if [[ -n $dir && ! -d $todir/$dir ]]; then
        if ! mkdir -p -- "$todir/$dir"; then
            echo "Unable to create directory: $todir/$dir"
            return 1
        fi
    fi

    #
    # Does the file exist already?
    # TODO: Compare the two files?
    #
    if [[ -e $todir/$path ]]; then
        if [[ $FORCE -ne 1 ]]; then
            echo "File already exists: $todir/$path"
            return 1
        fi

        echo "File exists: $todir/$path"
        echo "FORCE mode is ON so overwriting"
        if ! mv --force -- "$fromdir/$path" "$todir/$path"; then
            echo "Failed to move $fromdir/$path"
            return 1
        fi
    else
        if ! mv -- "$fromdir/$path" "$todir/$path"; then
            echo "Failed to move $fromdir/$path"
            return 1
        fi
    fi

    #
    # Only report success once 'mv' has really succeeded
    #
    echo "Moved $fromdir/$path"
    return 0
}

#===  FUNCTION  ================================================================
#         NAME: is_empty
#  DESCRIPTION: Check whether a directory is empty (of files). 'find' looks
#               below the directory for a regular file and stops at the first
#               one it meets, printing a single 'X'. Sub-directories alone do
#               not make the directory count as non-empty.
#   PARAMETERS: $1      Directory to test
#      RETURNS: True if empty (of files), otherwise false
#===============================================================================
is_empty() {
    local marker

    marker=$(find "$1" -mindepth 1 -type f -printf X -quit)

    # No marker means no regular file was found
    [[ -z $marker ]]
}

#===  FUNCTION  ================================================================
#         NAME: _log
#  DESCRIPTION: Appends a log record to the file named by $LOGFILE. The record
#               is laid out by $LOGREC, a 'printf' template taking a time
#               value and the message; a default template is used when
#               $LOGREC is not set. The time value passed is -1, which makes
#               '%(fmt)T' show the current date and time. If $LOGFILE is not
#               set a message is printed and the script exits with status 1.
#   PARAMETERS: 1 - the message to write
#      RETURNS: Nothing
#===============================================================================
# shellcheck disable=SC2317 disable=SC2059
_log () {
    local text="$1"
    local template

    #
    # There is nowhere to write without a log file
    #
    if [[ ! -v LOGFILE ]]; then
        echo "${FUNCNAME[0]}: \$LOGFILE is not defined"
        exit 1
    fi

    #
    # Use the caller's record template if there is one
    #
    if [[ -v LOGREC ]]; then
        template=$LOGREC
    else
        template='%(%F %T)T %s\n'
    fi

    printf "$template" -1 "$text" >> "$LOGFILE"
}

#===  FUNCTION  ================================================================
#         NAME: _usage
#  DESCRIPTION: Report usage. The text is written to the file named by the
#               global $STDOUT and uses the globals $SCRIPT, $VERSION,
#               $UPLOADS and $ARCHIVE. The function then exits the script.
#   PARAMETERS: 1       [optional] exit value (default 0)
#      RETURNS: Nothing (does not return; the script exits)
#===============================================================================
_usage () {
    local -i res="${1:-0}"

    cat >"$STDOUT" <<-endusage
${SCRIPT} - version: ${VERSION}

Usage: ./${SCRIPT} [-h] [-v] [-c COUNT] [-d {0|1}] [-D] [-F]

Moves HPR audio and other show-related files on 'borg' after their shows
have been uploaded to the Internet Archive. Files to be uploaded are in the
directory ${UPLOADS} and they are moved to the directory ${ARCHIVE}.

Options:
  -h                    Print this help
  -v                    Run in verbose mode where more information is reported
  -d 0|1                Dry run: -d 1 (the default) runs the script in dry-run
                        mode where nothing is moved but the actions that
                        will be taken are reported; -d 0 turns off dry-run
                        mode and the actions will be carried out.
  -c COUNT              Count of shows to process. If omitted or zero then all
                        shows will be processed, otherwise this is the number
                        to stop at.
  -D                    Run in debug mode where a lot more information is
                        reported
  -F                    Turn on FORCE mode (normally off). In this mode when
                        the files being tidied (moved) already exist, they are
                        overwritten. This is for the very rare case when
                        a show's audio has to be re-uploaded because of bad
                        audio or the wrong file being sent.

Examples
    ./tidy_uploaded             # Run in (default) dry-run mode
    ./tidy_uploaded -v          # Dry-run mode with verbose messages
    ./tidy_uploaded -d0         # Live mode (without verbose messages)
    ./tidy_uploaded -c1         # Process 1 show in dry-run mode
    ./tidy_uploaded -D          # Run with debugging enabled
    ./tidy_uploaded -F          # Run with FORCE mode on

endusage
    exit "$res"
}

# }}}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#
# Directories and files
#
LOGS="$BASEDIR/logs"
LOGFILE="$LOGS/$SCRIPT.log"
LOGREC='%(%F %T)T %s\n'

#
# Process options. The leading ':' in the option string selects silent error
# reporting, where 'getopts' returns ':' for an option that lacks its argument
# (the option letter is in $OPTARG) and '?' for an unknown option.
#
while getopts :c:d:DFhv opt
do
    case "${opt}" in
        c) COUNT=$OPTARG;;
        D) DEBUG=1;;
        d) DRYRUN=$OPTARG;;
        F) FORCE=1;;
        h) _usage 0;;
        v) VERBOSE=1;;
        :) echo "** Option -$OPTARG requires an argument"
           _usage 1;;
        *) echo "** Unknown option"
           _usage 1;;
    esac
done
shift $((OPTIND - 1))

COUNT=${COUNT:-0}
if [[ ! $COUNT =~ ^[0-9]+$ ]]; then
    echo "** Use a numeric argument with -c"
    _usage 1
fi

#
# Compare the dry-run setting as a string. A numeric test ('-ne') would
# evaluate whatever the user typed as an arithmetic expression, which fails
# badly on anything that is not a number.
#
DRYRUN=${DRYRUN:-1}
if [[ $DRYRUN != 0 && $DRYRUN != 1 ]]; then
    echo "** Use '-d 0' or '-d 1'"
    _usage 1
fi
[[ $DRYRUN -eq 1 ]] && echo "Dry run mode"

FORCE=${FORCE:-0}
[[ $FORCE -eq 1 ]] && echo "Force mode - overwriting existing files"

VERBOSE=${VERBOSE:-0}

DEBUG=${DEBUG:-0}
[[ $DEBUG -eq 1 ]] && echo "Debug mode"

#
# Should have no arguments
#
if [[ $# != 0 ]]; then
    echo "** ${SCRIPT} takes no arguments"
    _usage 1
fi

#
# Declarations:
#   seen    IA items already met in the scan (keyed by item name)
#   dirs    names of per-show directories to clean up afterwards
#   ind     number of shows counted so far
#
declare -A seen
declare -a dirs
# lastitem=
ind=0

#
# Scan the directory 'UPLOADS' where files for upload to the IA are stored.
#
# See the `find' pipeline at the end of the loop which outputs the last change
# time and the full file path, sorts on the time, then removes it. This
# ensures we process the files in time order rather than alphabetic order of
# their names.
#
while IFS= read -r path; do
    #
    # Extract the path relative to $UPLOADS and the IA item name from the
    # returned path. Here $relpath will be the filename or a sub-directory and
    # filename, and $item will be the IA identifier like 'hpr1192' (the first
    # seven characters of the relative path).
    #
    relpath="${path#"$UPLOADS"/}"
    item="${relpath:0:7}"

    [[ $VERBOSE -eq 1 ]] && echo "Found $path"

    _DEBUG "Path:          $path"
    _DEBUG "Relative path: $relpath"
    _DEBUG "IA item:       $item"

    #
    # Detect that the item prefix has changed. If it has we're processing
    # a new IA identifier, so work on this one
    #
    # If we have seen this item before we don't need to process it, so just
    # skip this loop iteration
    #

    #
    # Never seen before, so process it
    #
    if ! exists_in seen "$item"; then
        # shellcheck disable=SC2034
        seen[$item]=1

        #
        # Count this item and stop the loop if we've reached the requested
        # count. We want the value of $ind to be the number of shows
        # processed, so adjust it if we stopped after incrementing it.
        #
        ((ind++))
        if [[ $COUNT -gt 0 ]]; then
            if [[ $ind -gt $COUNT ]]; then
                ((ind--))
                break
            fi
            echo "[ Show #$ind ]"
        fi

        #
        # Look to see if there are any tasks queued for this show. If there
        # are we'll skip it just now.
        #
        tasks=$(queued_tasks "$item")
        if [[ $tasks -gt 0 ]]; then
            printf '** Item %s still has %s unfinished %s\n' \
                "$item" "$tasks" "$(ngettext task tasks "$tasks")"
            echo "** Skipping to the next item"
            continue
        fi

        [[ $VERBOSE -eq 1 ]] && echo "Checking IA for $item"

        #
        # Interrogate the IA for the item we're working on. If it returns True
        # we can proceed with tidying. The file 'TMP1' contains just a simple
        # list of the files on the IA relating to this item.
        #
        if ia list "$item" > "$TMP1"; then
            #
            # Save any directory associated with this item. This means that
            # directories with names that don't conform to the "^hpr[0-9]{4}"
            # pattern will be ignored, but this is *not* expected to happen.
            # Note that directories without corresponding audio will not be
            # cleaned up by this method, but again this is not expected to
            # happen.
            # TODO: be alert to such issues!
            #
            dirpath="$UPLOADS/$item"
            if [[ -d "$dirpath" ]]; then
                echo "Storing directory: $item"
                dirs+=("$item")
            fi

            moves=0

            #
            # Scan the returned list to see if any files we have are online.
            # Move to the ARCHIVE directory when there's a match.
            #
            while IFS= read -r file; do
                frompath="$UPLOADS/$file"
                topath="$ARCHIVE/$file"

                if [[ -e "$frompath" ]]; then
                    #
                    # A file on the IA exists in the upload area. Move the
                    # local one if we're not in dry-run mode, otherwise just
                    # report the move we would do. If FORCE mode is on
                    # overwrite the file.
                    #
                    if [[ $DRYRUN -eq 0 ]]; then
                        movefile "$UPLOADS" "$ARCHIVE" "$file" "$FORCE" && ((moves++))
                    else
                        if [[ $FORCE -eq 0 ]]; then
                            printf 'Would move %s\n\tto %s\n' "$frompath" "$topath"
                        else
                            printf 'Would move %s\n\toverwriting %s\n' "$frompath" "$topath"
                        fi
                    fi
                fi
            done < "$TMP1"

            #
            # Log this item (live mode only)
            #
            [[ $DRYRUN -eq 0 ]] && \
                printf '%s moved %d %s for %s\n' "$(date +%Y%m%d%H%M%S)" \
                    "$moves" "$(ngettext file files "$moves")" "$item" >> "$LOGFILE"

        else
            printf 'Skipping %s; not in the IA\n' "$item"
        fi
    else
        #
        # Ignore all but the first file belonging to an IA identifier
        #
        _DEBUG "Skipped $path - repeated show number"
        continue
    fi

    #
    # The 'find' below prefixes each path with a timestamp so that 'sort' puts
    # the paths in time order. 'cut' then drops the timestamp; it keeps
    # everything from the second field onwards ('-f2-') so that a path
    # containing spaces is not cut short.
    #
done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort  | cut -d' ' -f2-)

# Old 'find' used:
# done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort)

#
# No shows processed? There was nothing to do
#
if [[ $ind -eq 0 ]]; then
    # NOTE(review): the message is only shown in live mode (-d 0); in dry-run
    # mode the script exits here silently - confirm this is intended
    [[ $DRYRUN -eq 0 ]] && echo "Nothing to do"
    exit
fi

_DEBUG "Number of shows scanned: $ind"
# _DEBUG "Accumulated directories (${#dirs[*]}): $(printf '/%s/ ' "${dirs[*]}")"

#
# If there are no directories just exit. (For an array '-v' is true only
# once an element has been stored.)
#
[[ -v dirs ]] || exit

#
# By an (as yet) unknown process we might get duplicates, so remove them here.
# The names are used as keys of an associative array, which keeps one of each.
#
# mapfile -t dirs < <(printf "%s\n" "${dirs[*]}" | uniq)
declare -A unique
for e in "${dirs[@]}"; do unique[$e]=1; done
dirs=( "${!unique[@]}" )
# mapfile -t dirs < <(printf '%s\n' "${!unique[@]}")

#
# Use "${dirs[@]}" so that 'printf' wraps each directory name separately
#
_DEBUG "Directories to process (${#dirs[*]}): $(printf '>%s< ' "${dirs[@]}")"

#
# Clean up any empty directories. These may exist because we moved their
# contents one file at a time. We only deal with the directories we've visited
# though.
#
for dir in "${dirs[@]}"; do
    path="$UPLOADS/$dir"

    if [[ $DRYRUN -eq 0 ]]; then
        if is_empty "$path"; then
            #
            # 'is_empty' only looks for files, so sub-directories may remain;
            # hence the recursive delete
            #
            if rm -rf -- "$path"; then
                echo "Deleted $path"
                echo "$(date +%Y%m%d%H%M%S) deleted empty directory $path" >> "$LOGFILE"
            else
                echo "Failed to delete: $path"
            fi
        else
            echo "Directory is not empty: $path"
            echo "Not deleted!"
        fi
    else
        echo "Would delete directory $path"
    fi

done

exit

# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker