diff --git a/InternetArchive/recover_transcripts b/InternetArchive/recover_transcripts index c1d23e5..fca9f4b 100755 --- a/InternetArchive/recover_transcripts +++ b/InternetArchive/recover_transcripts @@ -307,7 +307,7 @@ VERBOSE=${VERBOSE:-0} # # Should have one argument # -if [[ $# != 1 ]]; then +if [[ $# -ne 1 ]]; then coloured 'red' "Missing argument" _usage 1 fi diff --git a/InternetArchive/repair_assets b/InternetArchive/repair_assets index 912236c..d1489d4 100755 --- a/InternetArchive/repair_assets +++ b/InternetArchive/repair_assets @@ -6,7 +6,7 @@ # USAGE: ./repair_assets showid # # DESCRIPTION: Given a show where there was a directory of asset files on the -# old HPR server whichj got lost in the migration, rebuild it +# old HPR server which got lost in the migration, rebuild it # and fill it with assets from the IA. Modify the show notes to # point to these recovered assets. # @@ -15,15 +15,15 @@ # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.8 +# VERSION: 0.0.10 # CREATED: 2024-05-10 21:26:31 -# REVISION: 2024-08-23 11:55:25 +# REVISION: 2024-10-02 17:34:47 # #=============================================================================== # set -o nounset # Treat unset variables as an error -VERSION="0.0.8" +VERSION="0.0.10" SCRIPT=${0##*/} # DIR=${0%/*} @@ -96,6 +96,38 @@ trap 'cleanup_temp $TMP1 $TMP2' SIGHUP SIGINT SIGPIPE SIGTERM EXIT # $3 Name of array to receive list of missing assets # RETURNS: Nothing #=============================================================================== +# find_missing () { +# local -n IA="${1}" +# local -n HPR="${2}" +# local output="${3}" +# +# local -A hIA hHPR +# local i key +# +# # +# # Make a hash keyed by the IA file base names from an indexed array +# # +# for (( i=0; i<${#IA[@]}; i++ )); do +# hIA+=([${IA[$i]##*/}]=${IA[$i]}) +# done +# +# # +# # Make a hash keyed by the HPR file base names from an indexed array +# # +# for (( i=0; i<${#HPR[@]}; i++ )); do +# 
hHPR+=([${HPR[$i]##*/}]=${HPR[$i]}) +# done +# +# # +# # Use the basename keys to check what's missing, but return the full path +# # names. +# # +# for key in "${!hIA[@]}"; do +# if ! exists_in hHPR "$key"; then +# eval "$output+=('${hIA[$key]}')" +# fi +# done +# } find_missing () { local -n IA="${1}" local -n HPR="${2}" @@ -105,26 +137,29 @@ find_missing () { local i key # - # Make a hash keyed by the IA file base names from an indexed array + # Make a hash keyed by the full IA paths from an indexed array # for (( i=0; i<${#IA[@]}; i++ )); do - hIA+=([${IA[$i]##*/}]=${IA[$i]}) + hIA+=([${IA[$i]}]=$i) done # - # Make a hash keyed by the HPR file base names from an indexed array + # Make a hash keyed by the HPR file paths from an indexed array, but + # remove the first element for parity with the IA paths. We are going to + # copy the IA paths, not these, so we never need the full paths again + # here. # for (( i=0; i<${#HPR[@]}; i++ )); do - hHPR+=([${HPR[$i]##*/}]=${HPR[$i]}) + hHPR+=([${HPR[$i]#*/}]=$i) done # - # Use the basename keys to check what's missing, but return the full path - # names. + # Use the full path keys to check what's missing, and return the IA full + # path names. # for key in "${!hIA[@]}"; do if ! exists_in hHPR "$key"; then - eval "$output+=('${hIA[$key]}')" + eval "$output+=('$key')" fi done } @@ -267,10 +302,13 @@ fi show="${1,,}" # -# Ensure show id is correctly formatted. We want it to be 'hpr1234' +# Ensure show id is correctly formatted. We want it to be 'hpr1234' but we +# allow the 'hpr' bit to be omitted, as well as any leading zeroes. We need to +# handle the weirdness of "leading zero means octal" though, but we always +# store it as 'hpr1234' once processed. 
# if [[ $show =~ (hpr)?([0-9]+) ]]; then - printf -v show 'hpr%04d' "${BASH_REMATCH[2]}" + printf -v show 'hpr%04d' "$((10#${BASH_REMATCH[2]}))" else coloured 'red' "Incorrect show specification: $show" coloured 'yellow' "Use 'hpr9999' or '9999' format" @@ -443,37 +481,45 @@ ignore_re="index.html$" # Run the command and save the output. Save the asset names returned in an # array. TODO: Handle errors from the command # -if [[ $DRYRUN -eq 0 ]]; then - eval "$command" > "$TMP2" - RES=$? - if [[ $RES -eq 0 ]]; then - _verbose "$(coloured 'green' "Remote command successful")" - while read -r hprfile; do - if [[ ! $hprfile =~ $ignore_re ]]; then - hpr_asset+=("${hprfile}") - fi - done < "$TMP2" - _verbose "$(coloured 'green' "Assets found on HPR server = ${#hpr_asset[@]}")" - _verbose "$(printf '%s\n' "${hpr_asset[@]}")" - _log "Assets found on HPR server = ${#hpr_asset[@]}" - else - coloured 'red' "Remote command failed" - _log "Failed while searching for HPR assets" - exit 1 - fi +# +# NOTE: We also want to interrogate the HPR state in dry-run mode +# +# if [[ $DRYRUN -eq 0 ]]; then +# else +# coloured 'yellow' "Would have searched for assets on the HPR server" +# fi + +eval "$command" > "$TMP2" +RES=$? +if [[ $RES -eq 0 ]]; then + _verbose "$(coloured 'green' "Remote command successful")" + while read -r hprfile; do + if [[ ! 
$hprfile =~ $ignore_re ]]; then + hpr_asset+=("${hprfile}") + fi + done < "$TMP2" + _verbose "$(coloured 'green' "Assets found on HPR server = ${#hpr_asset[@]}")" + _verbose "$(printf '%s\n' "${hpr_asset[@]}")" + _log "Assets found on HPR server = ${#hpr_asset[@]}" else - coloured 'yellow' "Would have searched for assets on the HPR server" + coloured 'red' "Remote command failed" + _log "Failed while searching for HPR assets" + exit 1 fi #------------------------------------------------------------------------------- # Compare the two asset lists and return what's missing on the HPR server #------------------------------------------------------------------------------- -# TODO: This algorithm does not handle the instance where there are pictures -# in one directory and a lower directory containing thumbnails, AND THE FILE -# NAMES ARE THE SAME! +# TODO: The algorithm in find_missing does not handle the instance where there +# are pictures in one directory and a lower directory containing thumbnails, +# AND THE FILE NAMES ARE THE SAME! # declare -a missing -find_missing ia_asset hpr_asset missing +if [[ ${#hpr_asset[@]} -eq 0 ]]; then + missing=( "${ia_asset[@]}" ) +else + find_missing ia_asset hpr_asset missing +fi _verbose "$(coloured 'cyan' "** missing (${#missing[@]}):")" _verbose "$(printf '%s\n' "${missing[@]}")" diff --git a/InternetArchive/repair_item b/InternetArchive/repair_item index 107997e..692094b 100755 --- a/InternetArchive/repair_item +++ b/InternetArchive/repair_item @@ -23,7 +23,7 @@ # temporarily on 'borg') and determines which have not been # uploaded, then takes steps to perform the uploads. # -# Version 0.0.11 onwards has the capability to repair an IA item +# Version 0.0.12 onwards has the capability to repair an IA item # from the HPR backup disk. 
This seems to be necessary because # the transcripts were not carried over (although we are # adding them to the IA for new shows now, older ones were never @@ -44,15 +44,15 @@ # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.11 +# VERSION: 0.0.12 # CREATED: 2020-01-05 22:42:46 -# REVISION: 2024-07-20 17:06:10 +# REVISION: 2024-09-13 18:19:59 # #=============================================================================== #set -o nounset # Treat unset variables as an error -VERSION="0.0.11" +VERSION="0.0.12" SCRIPT=${0##*/} # DIR=${0%/*} @@ -492,9 +492,9 @@ else # # Stop the missed file loop if we have reached the limiting number, in - # dry-run and live mode + # dry-run and live mode, but not extended mode # - [[ $upload_count -eq $LIMIT ]] && { + [[ $EXTENDED -eq 0 && $upload_count -eq $LIMIT ]] && { coloured 'blue' "Upload limit ($LIMIT) reached" break } diff --git a/InternetArchive/snapshot_metadata b/InternetArchive/snapshot_metadata index 2f34abb..7983064 100755 --- a/InternetArchive/snapshot_metadata +++ b/InternetArchive/snapshot_metadata @@ -5,23 +5,28 @@ # # USAGE: ./snapshot_metadata episode_number # -# DESCRIPTION: Collects metadata from the IA for a given show and stores it -# in the cache. +# DESCRIPTION: Collects JSON metadata from the IA for a given show and stores +# it in the cache. Runs 'view_derivatives' on the JSON to +# display the derivatives if any, and to save their names if +# found, for deletion. 
+# Deletion is performed thus (external to this script): +# +# cat assets/hpr$(./next_repair)/derived.lis | xargs ia delete hpr$(./next_repair) --no-backup # # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.2 +# VERSION: 0.0.3 # CREATED: 2024-08-16 20:36:51 -# REVISION: 2024-08-17 10:31:15 +# REVISION: 2024-10-02 17:40:13 # #=============================================================================== set -o nounset # Treat unset variables as an error -VERSION="0.0.2" +VERSION="0.0.3" SCRIPT=${0##*/} # DIR=${0%/*} @@ -126,17 +131,19 @@ fi show="${1,,}" # -# Ensure show id is correctly formatted. We want it to be 'hpr1234' +# Ensure show id is correctly formatted. We want it to be 'hpr1234' but we +# allow the 'hpr' bit to be omitted, as well as any leading zeroes. We need to +# handle the weirdness of "leading zero means octal" though, but we always +# store it as 'hpr1234' once processed. # if [[ $show =~ (hpr)?([0-9]+) ]]; then - printf -v show 'hpr%04d' "${BASH_REMATCH[2]}" + printf -v show 'hpr%04d' "$((10#${BASH_REMATCH[2]}))" else coloured 'red' "Incorrect show specification: $show" coloured 'yellow' "Use 'hpr9999' or '9999' format" exit 1 fi - #------------------------------------------------------------------------------- # Setting up paths #------------------------------------------------------------------------------- diff --git a/InternetArchive/view_derivatives b/InternetArchive/view_derivatives index 28ed5f6..d3da19a 100755 --- a/InternetArchive/view_derivatives +++ b/InternetArchive/view_derivatives @@ -44,9 +44,9 @@ # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.2 +# VERSION: 0.0.4 # CREATED: 2024-08-12 16:26:29 -# REVISION: 2024-08-17 13:44:44 +# REVISION: 2024-09-17 17:03:27 # #=============================================================================== @@ -71,7 +71,7 @@ use Data::Dumper; # # Version number 
(Incremented by Vim) # -our $VERSION = '0.0.2'; +our $VERSION = '0.0.4'; # # Script and directory names @@ -170,11 +170,12 @@ die "Empty JSON?\n" unless (@jsonbuffer); my $md = $jsonbuffer[0]; # -# Collect the identifier from the parsed JSON and define the one derived file -# we don't want to delete. +# Collect the identifier from the parsed JSON and define the derived files we +# don't want to delete. (Found cases of audio files being "derived" in 1672 +# and 1664) # my $identifier = $md->{metadata}->{identifier}; -my $item_png = "${identifier}.png"; +my $skip_re = qr{^${identifier}\.(flac|mp3|ogg|opus|png|spx|wav)$}; # # Build a hash from the original and derived files referenced in the metadata. @@ -257,9 +258,13 @@ if ($verbose > 0) { say '-' x 10; } +# +# List derived files that can be deleted, being careful not to delete the +# audio or the PNG image created by IA code. +# if ($list_derived) { foreach my $file ( sort(@derived) ) { - say "$file" unless ($file eq $item_png); + say "$file" unless ($file =~ $skip_re); } } @@ -377,49 +382,157 @@ __END__ =head1 NAME -view_derivatives - +view_derivatives - a tool to analyse IA metadata =head1 VERSION -The initial template usually just has: - -This documentation refers to view_derivatives version 0.0.2 - +This documentation refers to view_derivatives version 0.0.4 =head1 USAGE - # Brief working invocation example(s) here showing the most common usage(s) + view_derivatives [-help] [-documentation|-man] [-debug=N] [-[no]dry-run] + [-verbose [-verbose] ...] [-[no]list_derived] metadata_file - # This section will be as far as many users ever read - # so make it as educational and exemplary as possible. + # Parse the metadata and report the relationships between files + view_derivatives -verb METADATA + # Parse the metadata and write out a list of derived files for potential + # deletion. 
+ view_derivatives -list_derived METADATA > FILE =head1 REQUIRED ARGUMENTS -A complete list of every argument that must appear on the command line. -when the application is invoked, explaining what each of them does, any -restrictions on where each one may appear (i.e. flags that must appear -before or after filenames), and how the various arguments and options -may interact (e.g. mutual exclusions, required combinations, etc.) +The name of a file created by the following command: -If all of the application's arguments are optional this section -may be omitted entirely. + ia metadata "show" > metadata_file +The file is expected to contain one JSON object (in a one-element array). If +it contains more objects only the first will be processed. =head1 OPTIONS -A complete list of every available option with which the application -can be invoked, explaining what each does, and listing any restrictions, -or interactions. +=over 4 -If the application has no options this section may be omitted entirely. +=item B<-help> +Prints a brief help message describing the usage of the program, and then exits. + +=item B<-documentation> B<-man> + +Displays the entirety of the documentation (using a pager), and then exits. To +generate a PDF version use the I<pod2pdf> tool from +I<https://metacpan.org/dist/pod2pdf>. This can be +installed with the cpan tool as App::pod2pdf. Use the command: + + pod2pdf view_derivatives --out=view_derivatives.pdf + +=item B<-debug=N> + +Selects a level of debugging. Debug information consists of a line or series +of lines prefixed with the characters 'D>': + +=over 4 + +=item B<0> + +No debug output is generated: this is the default + +=item B<3> + +Prints all data structures from options + +=back + +(The debug levels need work!) + +=item B<-[no]dry-run> + +Enable/disable dry run mode (default off) + +=item B<-verbose> + +Sets the verbosity level. If the option is omitted then the level is zero (no +verbose output). 
Thereafter, for each occurrence of the option the verbosity +level is incremented. Only levels 1 and 2 are currently catered for. Any +levels above 2 produce the same result as level 2. + +=item B<-[no]list_derived> + +This option is off by default. Turning it on causes the script to write all +derived files to standard output. If the verbosity level is zero this is the +only output from the script. + +The idea is that at verbosity level 1 or 2 information is displayed about the +relationship of files in the metadata, for human consumption. If +B<-nolist_derived> is the setting (or default) then this is all that is shown. + +If the verbosity level is zero and B<-list_derived> is on then only the list +of derived files will be generated, and this can be used to delete the files +from the IA. =head1 DESCRIPTION -A full description of the application and its features. -May include numerous subsections (i.e. =head2, =head3, etc.) +=head2 OVERVIEW +Items on the IA (Internet Archive, or I<archive.org>) consist of metadata and +files. Each item generated for HPR is a show or episode. Most files comprising +the episode on the IA are those which are part of the episode on the HPR +server. A few extra files are created by the IA software, but these are part +of the metadata (HTML details, upload date, etc.) + +By default the IA software will create additional files which are derived from +the original files. Typical examples are other audio formats, such as Ogg or +Mp3. We have been disabling this derivation process for several years for +various reasons, preferring to generate our own derivatives. IA-generated +audio derivatives do not have ID3 and similar tags, whereas HPR-generated +audio formats do. + +Historically it was difficult to disable the derivation process. Even though +there were settings to do this they apparently didn't work on all of the +servers making up the IA, and so older items may have many derived files. 
+ +This script assists with identifying unwanted derivatives and with their +deletion. + +=head2 METADATA + +The metadata for an item can be obtained (by a registered user) from the IA +using the B<ia> tool. Its format is JSON, and this script uses a JSON module +to parse it. + +=head2 FILE RELATIONSHIPS + +The JSON metadata contains details of all files comprising the IA item. +It contains details such as the name, size, and type of each file. It also +categorises files into groups such as I<original> and I<derivative>. Files which +are derived have parents. The script uses this to build tree-like data +structures of derived files based on the original files. All children of an +original file will be derived, but some derived files may also have children. + +The derivatives can be classified simply as children of original files or of +derived files. These are what are listed if required and what are used in the +deletion process. + +=head2 DELETING UNWANTED DERIVATIVES + +The simplest method is to pipe the output from the script with verbose level +zero and with B<-list_derived> enabled into B<xargs> in order to run a command +which will delete the unwanted derivatives. + +One usage is: + + ./view_derivatives -list_derived metadata.json |\ + xargs ia delete hpr1234 --no-backup + +This will generate a list of files to be deleted, then pipe them to B<xargs> +which will construct a command by appending the names to the command template. + +This approach is not ideal since it does not handle the case where there is +nothing to delete. The script B<snapshot_metadata> manages this situation by +generating the metadata and saving it in a file, then it runs +B<view_derivatives> on this file and generates a file of derivatives. If this +file is not empty it can be used to perform the deletions, and otherwise no +attempt will be made. =head1 DIAGNOSTICS @@ -468,23 +581,18 @@ special cases that are not (yet) handled, etc. The initial template usually just has: There are no known bugs in this module. 
-Please report problems to () +Please report problems to Dave Morriss (dave.morriss@gmail.com) Patches are welcome. + =head1 AUTHOR - () +Dave Morriss (dave.morriss@gmail.com) =head1 LICENCE AND COPYRIGHT -Copyright (c) (). All rights reserved. - -Followed by whatever licence you wish to release it under. -For Perl code that is often just: - -This module is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. See perldoc perlartistic. +Copyright (c) 2024 Dave Morriss (dave.morriss@gmail.com). All rights reserved. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of