diff --git a/InternetArchive/view_derivatives b/InternetArchive/view_derivatives new file mode 100755 index 0000000..8d211aa --- /dev/null +++ b/InternetArchive/view_derivatives @@ -0,0 +1,500 @@ +#!/usr/bin/env perl +#=============================================================================== +# +# FILE: view_derivatives +# +# USAGE: ./view_derivatives metadata_file +# +# DESCRIPTION: Experimental! +# Reads a JSON file containing IA metadata for an item (HPR +# show) and processes 'derived' files from the IA. Their +# relationships are usually simple, but sometimes can be rather +# weird, so building a representation of the hierarchy might be +# a way to understand what's there and maybe make a list for the +# 'ia delete' command to work on! +# Each 'original' file may be accompanied by 'derived' files. So +# an image file might generate a thumbnail for example. The +# image will be marked in the JSON metadata as "original" and +# the derived file as a "derivative" with the file it was built +# from being its "parent". There can also be "derivative" files +# built from other "derivative"s in rare cases. This has been +# seen when an EPUB file is used to build a PDF "derivative", +# and then other versions of the PDF are generated as +# "derivatives" with the PDF as the "parent". +# The point is that we do not want the "derivatives". We (HPR) +# generate all of the versions of a given file we require: +# audio types and thumbnails, for example. The tools we use can +# disable the generation of "derivatives", so it has been rare +# to see them in recent times. However, with older shows we +# either couldn't stop the "derive" process, or some IA servers +# ignored our '--no-derive' options. Now (August 2024) we are +# cleaning up the HPR collection, and thus this script has been +# developed. +# Special note: any cleaning up of IA items needs to be done +# before files are moved around. The parental relationships are +# stored in the metadata, and do not track file movements. This +# script cannot analyse metadata from after such rearrangements! +# [It may be possible to adjust these metadata fields, but it +# does not seem to be necessary if the correct sequence of +# changes is adhered to - find derivatives, delete derivatives, +# move files] +# +# OPTIONS: --- +# REQUIREMENTS: --- +# BUGS: --- +# NOTES: --- +# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com +# VERSION: 0.0.2 +# CREATED: 2024-08-12 16:26:29 +# REVISION: 2024-08-17 13:44:44 +# +#=============================================================================== + +use v5.36; +use strict; +use warnings; +use feature qw{ postderef say signatures state try }; +no warnings + qw{ experimental::postderef experimental::signatures experimental::try }; + +use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 + +use Carp; +use Getopt::Long; +use Pod::Usage; + +use File::Slurper qw{ read_text read_lines }; +use JSON; + +use Data::Dumper; + +# +# Version number (Incremented by Vim) +# +our $VERSION = '0.0.2'; + +# +# Script and directory names +# +( my $PROG = $0 ) =~ s|.*/||mx; +( my $DIR = $0 ) =~ s|/?[^/]*$||mx; +$DIR = '.' unless $DIR; + +#------------------------------------------------------------------------------- +# Declarations +#------------------------------------------------------------------------------- +# +# Constants and other declarations +# +my $basedir = "$ENV{HOME}/HPR/InternetArchive"; +my $outputfile = "$basedir/$PROG.svg"; +my $logfile = "$basedir/$PROG.log"; + +my ( $file, $filebuffer, $json, @jsonbuffer ); +my ( %filetree, @original, @derived ); + +#------------------------------------------------------------------------------- +# Options +#------------------------------------------------------------------------------- +# +# Option defaults +# +my $DEFDEBUG = 0; + +# +# Options and arguments +# +my %options; +Options( \%options ); + +# +# Default help shows minimal information +# +pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) + if ( $options{'help'} ); + +# +# The -documentation or -man option shows the full POD documentation through +# a pager for convenience +# +pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) + if ( $options{'documentation'} ); + +my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); +my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 ); +my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 ); + +my $list_derived + = ( defined( $options{list_derived} ) ? $options{list_derived} : 0 ); + +# +# Get the filename argument (metadata in JSON format) +# +$file = shift; +die "File of IA metadata is required\n" unless ($file); + +# +# Read the entire JSON file into a buffer +# +try { + $filebuffer = read_text($file); +} + +catch ($e) { + die "Failed to read JSON file $file\n"; +} + +# +# Parse the JSON from the buffer +# +try { + $json = JSON->new; + @jsonbuffer = $json->incr_parse($filebuffer); +} + +catch ($e) { + die "Failed to parse JSON\n"; +} + +_debug($DEBUG > 3, '@jsonbuffer: ' . Dumper(\@jsonbuffer)); + +# +# Check that this is the type of JSON we need +# +die "Empty JSON?\n" unless (@jsonbuffer); + +# +# We got an array of one element from 'ia metadata '. Use $md to +# reference that one element +# +my $md = $jsonbuffer[0]; + +# +# Collect the identifier from the parsed JSON and define the one derived file +# we don't want to delete. +# +my $identifier = $md->{metadata}->{identifier}; +my $item_png = "${identifier}.png"; + +# +# Build a hash from the original and derived files referenced in the metadata. +# The hash will contain all but a number of files which are not of interest +# when sorting out which file came from which. At the end of the loop the +# structure will contain back links with child files pointing to parents. The +# forward child links get added later. +# +# The original names of files are also kept in an array to help with +# traversing the main hash. +# +foreach my $file ( @{ $md->{files} } ) { + my $fname = $file->{name}; + + printf "%-40s %-10s %s\n", $fname, $file->{source}, + coalesce( $file->{original}, 'null' ) + if ($verbose > 1); + + unless ( $file->{source} =~ /[Mm]etadata/ + || $file->{format} + =~ /[Mm]etadata|Item Tile|Columbia Peaks|Spectrogram/ ) + { + $filetree{$fname} = { + source => $file->{source}, + parent => $file->{original}, + format => $file->{format}, + children => [], + }; + + if ( $file->{source} eq 'original' ) { + push( @original, $fname ); + } + elsif ( $file->{source} eq 'derivative' ) { + push( @derived, $fname ); + } + } + +} + +# +# Visit all the nodes for derived files. They point to their parent node, so +# we can visit this node and add the children to an internal array. +# +foreach my $file (@derived) { + if ( defined( $filetree{$file}->{parent} ) ) { + push( + @{ $filetree{ $filetree{$file}->{parent} }->{children} }, + $file + ); + } +} + +# +# The trees are built, so we can dump the final result +# +_debug( + $DEBUG > 2, + '%filetree' + . sprintf( '[%d]: ', scalar( keys(%filetree) ) ) + . Dumper( \%filetree ) +); + +_debug($DEBUG > 2, '@original: ' . Dumper(\@original)); +_debug($DEBUG > 2, '@derived: ' . Dumper(\@derived)); + +# +# If the verbosity level is greater than 0 scan all the 'original' files in +# the '$filetree' hash. For each one walk its 'children' array if there's +# anything in it, and recurse into each node it points to. It's possible to +# have further levels than the one below the parent, but very little seems to +# be set up this way so far. +# +if ($verbose > 0) { + foreach my $file ( sort(@original) ) { + # + # Print the top level node and recurse into any children and print them + # + display_nodes( \%filetree, 0, $file ); + } + say '-' x 10; +} + +if ($list_derived) { + foreach my $file ( sort(@derived) ) { + say "$file" unless ($file eq $item_png); + } +} + +exit; + +#=== FUNCTION ================================================================ +# NAME: display_nodes +# PURPOSE: Walks a tree of original and derived files in the IA metadata +# PARAMETERS: $tree Hashref pointing to the file tree +# $level The current level within the tree +# $key Hash key for the current node +# RETURNS: Nothing +# DESCRIPTION: Recurses through a tree +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub display_nodes { + my ($tree, $level, $key) = @_; + + # + # Report this node + # + printf "%s%d: %s\n", "\t" x $level, $level, $key; + + # + # If there are children we use each to recurse to a lower level + # + if (scalar(@{$tree->{$key}->{children}}) > 0) { + $level++; + + foreach my $child (@{$tree->{$key}->{children}}) { + display_nodes($tree, $level, $child); + } + } + + return; +} + +#=== FUNCTION ================================================================ +# NAME: coalesce +# PURPOSE: To find the first defined argument and return it +# PARAMETERS: Arbitrary number of arguments +# RETURNS: The first defined argument or undef if there are none +# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never +# returned when doing so might be a problem. +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub coalesce { + foreach (@_) { + return $_ if defined($_); + } + return undef; ## no critic +} + +#=== FUNCTION ================================================================ +# NAME: _debug +# PURPOSE: Prints debug reports +# PARAMETERS: $active Boolean: 1 for print, 0 for no print +# $message Message to print +# RETURNS: Nothing +# DESCRIPTION: Outputs a message if $active is true. It removes any trailing +# newline and then adds one in the 'print' to the caller doesn't +# have to bother. Prepends the message with 'D> ' to show it's +# a debug message. +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub _debug { + my ( $active, $message ) = @_; + + chomp($message); + print "D> $message\n" if $active; +} + +#=== FUNCTION ================================================================ +# NAME: Options +# PURPOSE: Processes command-line options +# PARAMETERS: $optref Hash reference to hold the options +# RETURNS: Undef +# DESCRIPTION: Process the options we want to offer. See the documentation +# for details +# THROWS: no exceptions +# COMMENTS: none +# SEE ALSO: n/a +#=============================================================================== +sub Options { + my ($optref) = @_; + + my @options = ( + "help", "documentation|man", "debug=i", "dry-run!", + "verbose+", "list_derived!" + ); + + if ( !GetOptions( $optref, @options ) ) { + pod2usage( + -msg => "$PROG version $VERSION\n", + -exitval => 1, + -verbose => 0 + ); + } + + return; +} + +__END__ + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Application Documentation +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#{{{ + +=head1 NAME + + - + +=head1 VERSION + +The initial template usually just has: + +This documentation refers to version 0.0.2 + + +=head1 USAGE + + # Brief working invocation example(s) here showing the most common usage(s) + + # This section will be as far as many users ever read + # so make it as educational and exemplary as possible. + + +=head1 REQUIRED ARGUMENTS + +A complete list of every argument that must appear on the command line. +when the application is invoked, explaining what each of them does, any +restrictions on where each one may appear (i.e. flags that must appear +before or after filenames), and how the various arguments and options +may interact (e.g. mutual exclusions, required combinations, etc.) + +If all of the application's arguments are optional this section +may be omitted entirely. + + +=head1 OPTIONS + +A complete list of every available option with which the application +can be invoked, explaining what each does, and listing any restrictions, +or interactions. + +If the application has no options this section may be omitted entirely. + + +=head1 DESCRIPTION + +A full description of the application and its features. +May include numerous subsections (i.e. =head2, =head3, etc.) + + +=head1 DIAGNOSTICS + +A list of every error and warning message that the application can generate +(even the ones that will "never happen"), with a full explanation of each +problem, one or more likely causes, and any suggested remedies. If the +application generates exit status codes (e.g. under Unix) then list the exit +status associated with each error. + + +=head1 CONFIGURATION AND ENVIRONMENT + +A full explanation of any configuration system(s) used by the application, +including the names and locations of any configuration files, and the +meaning of any environment variables or properties that can be set. These +descriptions must also include details of any configuration language used + + +=head1 DEPENDENCIES + +A list of all the other modules that this module relies upon, including any +restrictions on versions, and an indication whether these required modules are +part of the standard Perl distribution, part of the module's distribution, +or must be installed separately. + + +=head1 INCOMPATIBILITIES + +A list of any modules that this module cannot be used in conjunction with. +This may be due to name conflicts in the interface, or competition for +system or program resources, or due to internal limitations of Perl +(for example, many modules that use source code filters are mutually +incompatible). + + +=head1 BUGS AND LIMITATIONS + +A list of known problems with the module, together with some indication +whether they are likely to be fixed in an upcoming release. + +Also a list of restrictions on the features the module does provide: +data types that cannot be handled, performance issues and the circumstances +in which they may arise, practical limitations on the size of data sets, +special cases that are not (yet) handled, etc. + +The initial template usually just has: + +There are no known bugs in this module. +Please report problems to () +Patches are welcome. + +=head1 AUTHOR + + () + + +=head1 LICENCE AND COPYRIGHT + +Copyright (c) (). All rights reserved. + +Followed by whatever licence you wish to release it under. +For Perl code that is often just: + +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. See perldoc perlartistic. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +=cut + +#}}} + +# [zo to open fold, zc to close] + +# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker +