Compare commits

..

No commits in common. "7e925621f4e43e4f465e4680e8979206752788c9" and "a4c24296efe2b8761873a018d975bb967104a7c8" have entirely different histories.

5 changed files with 88 additions and 249 deletions

View File

@ -307,7 +307,7 @@ VERBOSE=${VERBOSE:-0}
#
# Should have one argument
#
if [[ $# -ne 1 ]]; then
if [[ $# != 1 ]]; then
coloured 'red' "Missing argument"
_usage 1
fi

View File

@ -6,7 +6,7 @@
# USAGE: ./repair_assets showid
#
# DESCRIPTION: Given a show where there was a directory of asset files on the
# old HPR server which got lost in the migration, rebuild it
# old HPR server which got lost in the migration, rebuild it
# and fill it with assets from the IA. Modify the show notes to
# point to these recovered assets.
#
@ -15,15 +15,15 @@
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.10
# VERSION: 0.0.8
# CREATED: 2024-05-10 21:26:31
# REVISION: 2024-10-02 17:34:47
# REVISION: 2024-08-23 11:55:25
#
#===============================================================================
# set -o nounset # Treat unset variables as an error
VERSION="0.0.10"
VERSION="0.0.8"
SCRIPT=${0##*/}
# DIR=${0%/*}
@ -96,38 +96,6 @@ trap 'cleanup_temp $TMP1 $TMP2' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
# $3 Name of array to receive list of missing assets
# RETURNS: Nothing
#===============================================================================
# find_missing () {
# local -n IA="${1}"
# local -n HPR="${2}"
# local output="${3}"
#
# local -A hIA hHPR
# local i key
#
# #
# # Make a hash keyed by the IA file base names from an indexed array
# #
# for (( i=0; i<${#IA[@]}; i++ )); do
# hIA+=([${IA[$i]##*/}]=${IA[$i]})
# done
#
# #
# # Make a hash keyed by the HPR file base names from an indexed array
# #
# for (( i=0; i<${#HPR[@]}; i++ )); do
# hHPR+=([${HPR[$i]##*/}]=${HPR[$i]})
# done
#
# #
# # Use the basename keys to check what's missing, but return the full path
# # names.
# #
# for key in "${!hIA[@]}"; do
# if ! exists_in hHPR "$key"; then
# eval "$output+=('${hIA[$key]}')"
# fi
# done
# }
find_missing () {
local -n IA="${1}"
local -n HPR="${2}"
@ -137,29 +105,26 @@ find_missing () {
local i key
#
# Make a hash keyed by the full IA paths from an indexed array
# Make a hash keyed by the IA file base names from an indexed array
#
for (( i=0; i<${#IA[@]}; i++ )); do
hIA+=([${IA[$i]}]=$i)
hIA+=([${IA[$i]##*/}]=${IA[$i]})
done
#
# Make a hash keyed by the HPR file paths from an indexed array, but
# remove the first element for parity with the IA paths. We are going to
# copy the IA paths, not these, so we never need the full paths again
# here.
# Make a hash keyed by the HPR file base names from an indexed array
#
for (( i=0; i<${#HPR[@]}; i++ )); do
hHPR+=([${HPR[$i]#*/}]=$i)
hHPR+=([${HPR[$i]##*/}]=${HPR[$i]})
done
#
# Use the full path keys to check what's missing, and return the IA full
# path names.
# Use the basename keys to check what's missing, but return the full path
# names.
#
for key in "${!hIA[@]}"; do
if ! exists_in hHPR "$key"; then
eval "$output+=('$key')"
eval "$output+=('${hIA[$key]}')"
fi
done
}
@ -302,13 +267,10 @@ fi
show="${1,,}"
#
# Ensure show id is correctly formatted. We want it to be 'hpr1234' but we
# allow the 'hpr' bit to be omitted, as well as any leading zeroes. We need to
# handle the weirdness of "leading zero means octal" though, but we always
# store it as 'hpr1234' once processed.
# Ensure show id is correctly formatted. We want it to be 'hpr1234'
#
if [[ $show =~ (hpr)?([0-9]+) ]]; then
printf -v show 'hpr%04d' "$((10#${BASH_REMATCH[2]}))"
printf -v show 'hpr%04d' "${BASH_REMATCH[2]}"
else
coloured 'red' "Incorrect show specification: $show"
coloured 'yellow' "Use 'hpr9999' or '9999' format"
@ -481,45 +443,37 @@ ignore_re="index.html$"
# Run the command and save the output. Save the asset names returned in an
# array. TODO: Handle errors from the command
#
#
# NOTE: We also want to interrogate the HPR state in dry-run mode
#
# if [[ $DRYRUN -eq 0 ]]; then
# else
# coloured 'yellow' "Would have searched for assets on the HPR server"
# fi
eval "$command" > "$TMP2"
RES=$?
if [[ $RES -eq 0 ]]; then
_verbose "$(coloured 'green' "Remote command successful")"
while read -r hprfile; do
if [[ ! $hprfile =~ $ignore_re ]]; then
hpr_asset+=("${hprfile}")
fi
done < "$TMP2"
_verbose "$(coloured 'green' "Assets found on HPR server = ${#hpr_asset[@]}")"
_verbose "$(printf '%s\n' "${hpr_asset[@]}")"
_log "Assets found on HPR server = ${#hpr_asset[@]}"
if [[ $DRYRUN -eq 0 ]]; then
eval "$command" > "$TMP2"
RES=$?
if [[ $RES -eq 0 ]]; then
_verbose "$(coloured 'green' "Remote command successful")"
while read -r hprfile; do
if [[ ! $hprfile =~ $ignore_re ]]; then
hpr_asset+=("${hprfile}")
fi
done < "$TMP2"
_verbose "$(coloured 'green' "Assets found on HPR server = ${#hpr_asset[@]}")"
_verbose "$(printf '%s\n' "${hpr_asset[@]}")"
_log "Assets found on HPR server = ${#hpr_asset[@]}"
else
coloured 'red' "Remote command failed"
_log "Failed while searching for HPR assets"
exit 1
fi
else
coloured 'red' "Remote command failed"
_log "Failed while searching for HPR assets"
exit 1
coloured 'yellow' "Would have searched for assets on the HPR server"
fi
#-------------------------------------------------------------------------------
# Compare the two asset lists and return what's missing on the HPR server
#-------------------------------------------------------------------------------
# TODO: The algorithm in find_missing does not handle the instance where there
# are pictures in one directory and a lower directory containing thumbnails,
# AND THE FILE NAMES ARE THE SAME!
# TODO: This algorithm does not handle the instance where there are pictures
# in one directory and a lower directory containing thumbnails, AND THE FILE
# NAMES ARE THE SAME!
#
declare -a missing
if [[ ${#hpr_asset[@]} -eq 0 ]]; then
missing=( "${ia_asset[@]}" )
else
find_missing ia_asset hpr_asset missing
fi
find_missing ia_asset hpr_asset missing
_verbose "$(coloured 'cyan' "** missing (${#missing[@]}):")"
_verbose "$(printf '%s\n' "${missing[@]}")"

View File

@ -23,7 +23,7 @@
# temporarily on 'borg') and determines which have not been
# uploaded, then takes steps to perform the uploads.
#
# Version 0.0.12 onwards has the capability to repair an IA item
# Version 0.0.11 onwards has the capability to repair an IA item
# from the HPR backup disk. This seems to be necessary because
# the transcripts were not carried over (although we are
# adding them to the IA for new shows now, older ones were never
@ -44,15 +44,15 @@
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.12
# VERSION: 0.0.11
# CREATED: 2020-01-05 22:42:46
# REVISION: 2024-09-13 18:19:59
# REVISION: 2024-07-20 17:06:10
#
#===============================================================================
#set -o nounset # Treat unset variables as an error
VERSION="0.0.12"
VERSION="0.0.11"
SCRIPT=${0##*/}
# DIR=${0%/*}
@ -492,9 +492,9 @@ else
#
# Stop the missed file loop if we have reached the limiting number, in
# dry-run and live mode, but not extended mode
# dry-run and live mode
#
[[ $EXTENDED -eq 0 && $upload_count -eq $LIMIT ]] && {
[[ $upload_count -eq $LIMIT ]] && {
coloured 'blue' "Upload limit ($LIMIT) reached"
break
}

View File

@ -5,28 +5,23 @@
#
# USAGE: ./snapshot_metadata episode_number
#
# DESCRIPTION: Collects JSON metadata from the IA for a given show and stores
# it in the cache. Runs 'view_derivatives' on the JSON to
# display the derivatives if any, and to save their names if
# found, for deletion.
# Deletion is performed thus (external to this script):
#
# cat assets/hpr$(./next_repair)/derived.lis | xargs ia delete hpr$(./next_repair) --no-backup
# DESCRIPTION: Collects metadata from the IA for a given show and stores it
# in the cache.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.3
# VERSION: 0.0.2
# CREATED: 2024-08-16 20:36:51
# REVISION: 2024-10-02 17:40:13
# REVISION: 2024-08-17 10:31:15
#
#===============================================================================
set -o nounset # Treat unset variables as an error
VERSION="0.0.3"
VERSION="0.0.2"
SCRIPT=${0##*/}
# DIR=${0%/*}
@ -131,19 +126,17 @@ fi
show="${1,,}"
#
# Ensure show id is correctly formatted. We want it to be 'hpr1234' but we
# allow the 'hpr' bit to be omitted, as well as any leading zeroes. We need to
# handle the weirdness of "leading zero means octal" though, but we always
# store it as 'hpr1234' once processed.
# Ensure show id is correctly formatted. We want it to be 'hpr1234'
#
if [[ $show =~ (hpr)?([0-9]+) ]]; then
printf -v show 'hpr%04d' "$((10#${BASH_REMATCH[2]}))"
printf -v show 'hpr%04d' "${BASH_REMATCH[2]}"
else
coloured 'red' "Incorrect show specification: $show"
coloured 'yellow' "Use 'hpr9999' or '9999' format"
exit 1
fi
#-------------------------------------------------------------------------------
# Setting up paths
#-------------------------------------------------------------------------------

View File

@ -44,9 +44,9 @@
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.4
# VERSION: 0.0.2
# CREATED: 2024-08-12 16:26:29
# REVISION: 2024-09-17 17:03:27
# REVISION: 2024-08-17 13:44:44
#
#===============================================================================
@ -71,7 +71,7 @@ use Data::Dumper;
#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.4';
our $VERSION = '0.0.2';
#
# Script and directory names
@ -170,12 +170,11 @@ die "Empty JSON?\n" unless (@jsonbuffer);
my $md = $jsonbuffer[0];
#
# Collect the identifier from the parsed JSON and define the derived files we
# don't want to delete. (Found cases of audio files being "derived" in 1672
# and 1664)
# Collect the identifier from the parsed JSON and define the one derived file
# we don't want to delete.
#
my $identifier = $md->{metadata}->{identifier};
my $skip_re = qr{^${identifier}\.(flac|mp3|ogg|opus|png|spx|wav)$};
my $item_png = "${identifier}.png";
#
# Build a hash from the original and derived files referenced in the metadata.
@ -258,13 +257,9 @@ if ($verbose > 0) {
say '-' x 10;
}
#
# List derived files that can be deleted, being careful not to delete the
# audio or the PNG image created by IA code.
#
if ($list_derived) {
foreach my $file ( sort(@derived) ) {
say "$file" unless ($file =~ $skip_re);
say "$file" unless ($file eq $item_png);
}
}
@ -382,157 +377,49 @@ __END__
=head1 NAME
view_derivatives - a tool to analyse IA metadata
view_derivatives - <One line description of application's purpose>
=head1 VERSION
This documentation refers to view_derivatives version 0.0.4
The initial template usually just has:
This documentation refers to view_derivatives version 0.0.2
=head1 USAGE
view_derivatives [-help] [-documentation|-man] [-debug=N] [-[no]dry-run]
[-verbose [-verbose] ...] [-[no]list_derived] metadata_file
# Brief working invocation example(s) here showing the most common usage(s)
# Parse the metadata and report the relationships between files
view_derivatives -verb METADATA
# This section will be as far as many users ever read
# so make it as educational and exemplary as possible.
# Parse the metadata and write out a list of derived files for potential
# deletion.
view_derivatives -list_derived METADATA > FILE
=head1 REQUIRED ARGUMENTS
The name of a file created by the following command:
A complete list of every argument that must appear on the command line
when the application is invoked, explaining what each of them does, any
restrictions on where each one may appear (i.e. flags that must appear
before or after filenames), and how the various arguments and options
may interact (e.g. mutual exclusions, required combinations, etc.)
ia metadata "show" > metadata_file
If all of the application's arguments are optional this section
may be omitted entirely.
The file is expected to contain one JSON object (in a one-element array). If
it contains more objects only the first will be processed.
=head1 OPTIONS
=over 4
A complete list of every available option with which the application
can be invoked, explaining what each does, and listing any restrictions,
or interactions.
=item B<-help>
If the application has no options this section may be omitted entirely.
Prints a brief help message describing the usage of the program, and then exits.
=item B<-documentation> B<-man>
Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf. Use the command:
pod2pdf view_derivatives --out=view_derivatives.pdf
=item B<-debug=N>
Selects a level of debugging. Debug information consists of a line or series
of lines prefixed with the characters 'D>':
=over 4
=item B<0>
No debug output is generated: this is the default
=item B<3>
Prints all data structures from options
=back
(The debug levels need work!)
=item B<-[no]dry-run>
Enable/disable dry run mode (default off)
=item B<-verbose>
Sets the verbosity level. If the option is omitted then the level is zero (no
verbose output). Thereafter, for each occurrence of the option the verbosity
level is incremented. Only levels 1 and 2 are currently catered for. Any
levels above 2 produce the same result as level 2.
=item B<-[no]list_derived>
This option is off by default. Turning it on causes the script to write all
derived files to standard output. If the verbosity level is zero this is the
only output from the script.
The idea is that at verbosity level 1 or 2 information is displayed about the
relationship of files in the metadata, for human consumption. If
B<-nolist_derived> is the setting (or default) then this is all that is shown.
If the verbosity level is zero and B<-list_derived> is on then only the list
of derived files will be generated, and this can be used to delete the files
from the IA.
=head1 DESCRIPTION
=head2 OVERVIEW
A full description of the application and its features.
May include numerous subsections (i.e. =head2, =head3, etc.)
Items on the IA (Internet Archive, or I<archive.org>) consist of metadata and
files. Each item generated for HPR is a show or episode. Most files comprising
the episode on the IA are those which are part of the episode on the HPR
server. A few extra files are created by the IA software, but these are part
of the metadata (HTML details, upload date, etc.)
By default the IA software will create additional files which are derived from
the original files. Typical examples are other audio formats, such as Ogg or
Mp3. We have been disabling this derivation process for several years for
various reasons, preferring to generate our own derivatives. IA-generated
audio derivatives do not have ID3 and similar tags, whereas HPR-generated
audio formats do.
Historically it was difficult to disable the derivation process. Even though
there were settings to do this they apparently didn't work on all of the
servers making up the IA, and so older items may have many derived files.
This script assists with identifying unwanted derivatives and with their
deletion.
=head2 METADATA
The metadata for an item can be obtained (by a registered user) from the IA
using the B<ia> tool. Its format is JSON, and this script uses a JSON module
to parse it.
=head2 FILE RELATIONSHIPS
The JSON metadata contains details of all files comprising the IA item.
It contains details such as the name, size, and type of each file. It also
categorises files into groups such as I<original> and I<derived>. Files which
are derived have parents. The script uses this to build tree-like data
structures of derived files based on the original files. All children of an
original file will be derived, but some derived files may also have children.
The derivatives can be classified simply as children of original files or of
derived files. These are what are listed if required and what are used in the
deletion process.
=head2 DELETING UNWANTED DERIVATIVES
The simplest method is to pipe the output from the script with verbose level
zero and with B<-list_derived> enabled into B<xargs> in order to run a command
which will delete the unwanted derivatives.
One usage is:
./view_derivatives -list_derived metadata.json |\
xargs ia delete hpr1234 --no-backup
This will generate a list of files to be deleted, then pipe them to B<xargs>
which will construct a command by appending the names to the command template.
This approach is not ideal since it does not handle the case where there is
nothing to delete. The script B<snapshot_metadata> manages this situation by
generating the metadata and saving it in a file, then it runs
B<view_derivatives> on this file and generates a file of derivatives. If this
file is not empty it can be used to perform the deletions, and otherwise no
attempt will be made.
=head1 DIAGNOSTICS
@ -581,18 +468,23 @@ special cases that are not (yet) handled, etc.
The initial template usually just has:
There are no known bugs in this module.
Please report problems to Dave Morriss (dave.morriss@gmail.com)
Please report problems to <Maintainer name(s)> (<contact address>)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (dave.morriss@gmail.com)
<Author name(s)> (<contact address>)
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2024 Dave Morriss (dave.morriss@gmail.com). All rights reserved.
Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
Followed by whatever licence you wish to release it under.
For Perl code that is often just:
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of