forked from HPR/hpr-tools
		
	InternetArchive/repair_assets: Accidentally reverset the "sanity check"
    logic, so put it back the right way!
InternetArchive/view_derivatives: Started on the POD documentation but
    didn't get very far.
		
	
		
			
				
	
	
		
			501 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			501 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env perl
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: view_derivatives
 | |
| #
 | |
| #        USAGE: ./view_derivatives metadata_file
 | |
| #
 | |
| #  DESCRIPTION: Experimental!
 | |
| #               Reads a JSON file containing IA metadata for an item (HPR
 | |
| #               show) and processes 'derived' files from the IA. Their
 | |
| #               relationships are usually simple, but sometimes can be rather
 | |
| #               weird, so building a representation of the hierarchy might be
 | |
| #               a way to understand what's there and maybe make a list for the
 | |
| #               'ia delete' command to work on!
 | |
| #               Each 'original' file may be accompanied by 'derived' files. So
 | |
| #               an image file might generate a thumbnail for example. The
 | |
| #               image will be marked in the JSON metadata as "original" and
 | |
| #               the derived file as a "derivative" with the file it was built
 | |
| #               from being its "parent". There can also be "derivative" files
 | |
| #               built from other "derivative"s in rare cases. This has been
 | |
| #               seen when an EPUB file is used to build a PDF "derivative",
 | |
| #               and then other versions of the PDF are generated as
 | |
| #               "derivatives" with the PDF as the "parent".
 | |
| #               The point is that we do not want the "derivatives". We (HPR)
 | |
| #               generate all of the versions of a given file we require:
 | |
| #               audio types and thumbnails, for example. The tools we use can
 | |
| #               disable the generation of "derivatives", so it has been rare
 | |
| #               to see them in recent times. However, with older shows we
 | |
| #               either couldn't stop the "derive" process, or some IA servers
 | |
| #               ignored our '--no-derive' options. Now (August 2024) we are
 | |
| #               cleaning up the HPR collection, and thus this script has been
 | |
| #               developed.
 | |
| #               Special note: any cleaning up of IA items needs to be done
 | |
| #               before files are moved around. The parental relationships are
 | |
| #               stored in the metadata, and do not track file movements. This
 | |
| #               script cannot analyse metadata from after such rearrangements!
 | |
| #               [It may be possible to adjust these metadata fields, but it
 | |
| #               does not seem to be necessary if the correct sequence of
 | |
| #               changes is adhered to - find derivatives, delete derivatives,
 | |
| #               move files]
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.0.2
 | |
| #      CREATED: 2024-08-12 16:26:29
 | |
| #     REVISION: 2024-08-17 13:44:44
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| use v5.36;
 | |
| use strict;
 | |
| use warnings;
 | |
| use feature qw{ postderef say signatures state try };
 | |
| no warnings
 | |
|     qw{ experimental::postderef experimental::signatures experimental::try };
 | |
| 
 | |
| use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
 | |
| 
 | |
| use Carp;
 | |
| use Getopt::Long;
 | |
| use Pod::Usage;
 | |
| 
 | |
| use File::Slurper qw{ read_text read_lines };
 | |
| use JSON;
 | |
| 
 | |
| use Data::Dumper;
 | |
| 
 | |
| #
 | |
| # Version number (Incremented by Vim)
 | |
| #
 | |
| our $VERSION = '0.0.2';
 | |
| 
 | |
| #
 | |
| # Script and directory names
 | |
| #
 | |
| ( my $PROG = $0 ) =~ s|.*/||mx;
 | |
| ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
 | |
| $DIR = '.' unless $DIR;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Declarations
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Constants and other declarations
 | |
| #
 | |
| my $basedir    = "$ENV{HOME}/HPR/InternetArchive";
 | |
| my $outputfile = "$basedir/$PROG.svg";
 | |
| my $logfile    = "$basedir/$PROG.log";
 | |
| 
 | |
| my ( $file, $filebuffer, $json, @jsonbuffer );
 | |
| my ( %filetree, @original, @derived );
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Options
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Option defaults
 | |
| #
 | |
| my $DEFDEBUG = 0;
 | |
| 
 | |
| #
 | |
| # Options and arguments
 | |
| #
 | |
| my %options;
 | |
| Options( \%options );
 | |
| 
 | |
| #
 | |
| # Default help shows minimal information
 | |
| #
 | |
| pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
 | |
|     if ( $options{'help'} );
 | |
| 
 | |
| #
 | |
| # The -documentation or -man option shows the full POD documentation through
 | |
| # a pager for convenience
 | |
| #
 | |
| pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
 | |
|     if ( $options{'documentation'} );
 | |
| 
 | |
| my $DEBUG   = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
 | |
| my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
 | |
| my $verbose = ( defined( $options{verbose} )   ? $options{verbose}   : 0 );
 | |
| 
 | |
| my $list_derived
 | |
|     = ( defined( $options{list_derived} ) ? $options{list_derived} : 0 );
 | |
| 
 | |
| #
 | |
| # Get the filename argument (metadata in JSON format)
 | |
| #
 | |
| $file = shift;
 | |
| die "File of IA metadata is required\n" unless ($file);
 | |
| 
 | |
| #
 | |
| # Read the entire JSON file into a buffer
 | |
| #
 | |
| try {
 | |
|     $filebuffer = read_text($file);
 | |
| }
 | |
| 
 | |
| catch ($e) {
 | |
|     die "Failed to read JSON file $file\n";
 | |
| }
 | |
| 
 | |
| #
 | |
| # Parse the JSON from the buffer
 | |
| #
 | |
| try {
 | |
|     $json = JSON->new;
 | |
|     @jsonbuffer = $json->incr_parse($filebuffer);
 | |
| }
 | |
| 
 | |
| catch ($e) {
 | |
|     die "Failed to parse JSON\n";
 | |
| }
 | |
| 
 | |
| _debug($DEBUG > 3, '@jsonbuffer: ' . Dumper(\@jsonbuffer));
 | |
| 
 | |
| #
 | |
| # Check that this is the type of JSON we need
 | |
| #
 | |
| die "Empty JSON?\n" unless (@jsonbuffer);
 | |
| 
 | |
| #
 | |
| # We got an array of one element from 'ia metadata <item>'. Use $md to
 | |
| # reference that one element
 | |
| #
 | |
| my $md = $jsonbuffer[0];
 | |
| 
 | |
| #
 | |
| # Collect the identifier from the parsed JSON and define the one derived file
 | |
| # we don't want to delete.
 | |
| #
 | |
| my $identifier = $md->{metadata}->{identifier};
 | |
| my $item_png = "${identifier}.png";
 | |
| 
 | |
| #
 | |
| # Build a hash from the original and derived files referenced in the metadata.
 | |
| # The hash will contain all but a number of files which are not of interest
 | |
| # when sorting out which file came from which. At the end of the loop the
 | |
| # structure will contain back links with child files pointing to parents. The
 | |
| # forward child links get added later.
 | |
| #
 | |
| # The original names of files are also kept in an array to help with
 | |
| # traversing the main hash.
 | |
| #
 | |
| foreach my $file ( @{ $md->{files} } ) {
 | |
|     my $fname = $file->{name};
 | |
| 
 | |
|     printf "%-40s %-10s %s\n", $fname, $file->{source},
 | |
|         coalesce( $file->{original}, 'null' )
 | |
|         if ($verbose > 1);
 | |
| 
 | |
|     unless ( $file->{source} =~ /[Mm]etadata/
 | |
|         || $file->{format}
 | |
|         =~ /[Mm]etadata|Item Tile|Columbia Peaks|Spectrogram/ )
 | |
|     {
 | |
|         $filetree{$fname} = {
 | |
|             source   => $file->{source},
 | |
|             parent   => $file->{original},
 | |
|             format   => $file->{format},
 | |
|             children => [],
 | |
|         };
 | |
| 
 | |
|         if ( $file->{source} eq 'original' ) {
 | |
|             push( @original, $fname );
 | |
|         }
 | |
|         elsif ( $file->{source} eq 'derivative' ) {
 | |
|             push( @derived, $fname );
 | |
|         }
 | |
|     }
 | |
| 
 | |
| }
 | |
| 
 | |
| #
 | |
| # Visit all the nodes for derived files. They point to their parent node, so
 | |
| # we can visit this node and add the children to an internal array.
 | |
| #
 | |
| foreach my $file (@derived) {
 | |
|     if ( defined( $filetree{$file}->{parent} ) ) {
 | |
|         push(
 | |
|             @{ $filetree{ $filetree{$file}->{parent} }->{children} },
 | |
|             $file
 | |
|         );
 | |
|     }
 | |
| }
 | |
| 
 | |
| #
 | |
| # The trees are built, so we can dump the final result
 | |
| #
 | |
| _debug(
 | |
|    $DEBUG > 2,
 | |
|    '%filetree'
 | |
|        . sprintf( '[%d]: ', scalar( keys(%filetree) ) )
 | |
|        . Dumper( \%filetree )
 | |
| );
 | |
| 
 | |
| _debug($DEBUG > 2, '@original: ' . Dumper(\@original));
 | |
| _debug($DEBUG > 2, '@derived: ' . Dumper(\@derived));
 | |
| 
 | |
| #
 | |
| # If the verbosity level is greater than 0 scan all the 'original' files in
 | |
| # the '$filetree' hash. For each one walk its 'children' array if there's
 | |
| # anything in it, and recurse into each node it points to. It's possible to
 | |
| # have further levels than the one below the parent, but very little seems to
 | |
| # be set up this way so far.
 | |
| #
 | |
| if ($verbose > 0) {
 | |
|     foreach my $file ( sort(@original) ) {
 | |
|         #
 | |
|         # Print the top level node and recurse into any children and print them
 | |
|         #
 | |
|         display_nodes( \%filetree, 0, $file );
 | |
|     }
 | |
|     say '-' x 10;
 | |
| }
 | |
| 
 | |
| if ($list_derived) {
 | |
|     foreach my $file ( sort(@derived) ) {
 | |
|         say "$file" unless ($file eq $item_png);
 | |
|     }
 | |
| }
 | |
| 
 | |
| exit;
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: display_nodes
 | |
| #      PURPOSE: Walks a tree of original and derived files in the IA metadata
 | |
| #   PARAMETERS: $tree           Hashref pointing to the file tree
 | |
| #               $level          The current level within the tree
 | |
| #               $key            Hash key for the current node
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Recurses through a tree
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub display_nodes {
 | |
|     my ($tree, $level, $key) = @_;
 | |
| 
 | |
|     #
 | |
|     # Report this node
 | |
|     #
 | |
|     printf "%s%d: %s\n", "\t" x $level, $level, $key;
 | |
| 
 | |
|     #
 | |
|     # If there are children we use each to recurse to a lower level
 | |
|     #
 | |
|     if (scalar(@{$tree->{$key}->{children}}) > 0) {
 | |
|         $level++;
 | |
| 
 | |
|         foreach my $child (@{$tree->{$key}->{children}}) {
 | |
|             display_nodes($tree, $level, $child);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: coalesce
 | |
| #      PURPOSE: To find the first defined argument and return it
 | |
| #   PARAMETERS: Arbitrary number of arguments
 | |
| #      RETURNS: The first defined argument or undef if there are none
 | |
| #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
 | |
| #               returned when doing so might be a problem.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub coalesce {
 | |
|     foreach (@_) {
 | |
|         return $_ if defined($_);
 | |
|     }
 | |
|     return undef;    ## no critic
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: _debug
 | |
| #      PURPOSE: Prints debug reports
 | |
| #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print
 | |
| #               $message        Message to print
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing
 | |
| #               newline and then adds one in the 'print' to the caller doesn't
 | |
| #               have to bother. Prepends the message with 'D> ' to show it's
 | |
| #               a debug message.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub _debug {
 | |
|     my ( $active, $message ) = @_;
 | |
| 
 | |
|     chomp($message);
 | |
|     print "D> $message\n" if $active;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Options
 | |
| #      PURPOSE: Processes command-line options
 | |
| #   PARAMETERS: $optref     Hash reference to hold the options
 | |
| #      RETURNS: Undef
 | |
| #  DESCRIPTION: Process the options we want to offer. See the documentation
 | |
| #               for details
 | |
| #       THROWS: no exceptions
 | |
| #     COMMENTS: none
 | |
| #     SEE ALSO: n/a
 | |
| #===============================================================================
 | |
| sub Options {
 | |
|     my ($optref) = @_;
 | |
| 
 | |
|     my @options = (
 | |
|         "help",     "documentation|man", "debug=i", "dry-run!",
 | |
|         "verbose+", "list_derived!"
 | |
|     );
 | |
| 
 | |
|     if ( !GetOptions( $optref, @options ) ) {
 | |
|         pod2usage(
 | |
|             -msg     => "$PROG version $VERSION\n",
 | |
|             -exitval => 1,
 | |
|             -verbose => 0
 | |
|         );
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| __END__
 | |
| 
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #  Application Documentation
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #{{{
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| view_derivatives - <One line description of application's purpose>
 | |
| 
 | |
| =head1 VERSION
 | |
| 
 | |
| The initial template usually just has:
 | |
| 
 | |
| This documentation refers to view_derivatives version 0.0.2
 | |
| 
 | |
| 
 | |
| =head1 USAGE
 | |
| 
 | |
|     # Brief working invocation example(s) here showing the most common usage(s)
 | |
| 
 | |
|     # This section will be as far as many users ever read
 | |
|     # so make it as educational and exemplary as possible.
 | |
| 
 | |
| 
 | |
| =head1 REQUIRED ARGUMENTS
 | |
| 
 | |
| A complete list of every argument that must appear on the command line.
 | |
| when the application  is invoked, explaining what each of them does, any
 | |
| restrictions on where each one may appear (i.e. flags that must appear
 | |
| before or after filenames), and how the various arguments and options
 | |
| may interact (e.g. mutual exclusions, required combinations, etc.)
 | |
| 
 | |
| If all of the application's arguments are optional this section
 | |
| may be omitted entirely.
 | |
| 
 | |
| 
 | |
| =head1 OPTIONS
 | |
| 
 | |
| A complete list of every available option with which the application
 | |
| can be invoked, explaining what each does, and listing any restrictions,
 | |
| or interactions.
 | |
| 
 | |
| If the application has no options this section may be omitted entirely.
 | |
| 
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| A full description of the application and its features.
 | |
| May include numerous subsections (i.e. =head2, =head3, etc.)
 | |
| 
 | |
| 
 | |
| =head1 DIAGNOSTICS
 | |
| 
 | |
| A list of every error and warning message that the application can generate
 | |
| (even the ones that will "never happen"), with a full explanation of each
 | |
| problem, one or more likely causes, and any suggested remedies. If the
 | |
| application generates exit status codes (e.g. under Unix) then list the exit
 | |
| status associated with each error.
 | |
| 
 | |
| 
 | |
| =head1 CONFIGURATION AND ENVIRONMENT
 | |
| 
 | |
| A full explanation of any configuration system(s) used by the application,
 | |
| including the names and locations of any configuration files, and the
 | |
| meaning of any environment variables or properties that can be set. These
 | |
| descriptions must also include details of any configuration language used
 | |
| 
 | |
| 
 | |
| =head1 DEPENDENCIES
 | |
| 
 | |
| A list of all the other modules that this module relies upon, including any
 | |
| restrictions on versions, and an indication whether these required modules are
 | |
| part of the standard Perl distribution, part of the module's distribution,
 | |
| or must be installed separately.
 | |
| 
 | |
| 
 | |
| =head1 INCOMPATIBILITIES
 | |
| 
 | |
| A list of any modules that this module cannot be used in conjunction with.
 | |
| This may be due to name conflicts in the interface, or competition for
 | |
| system or program resources, or due to internal limitations of Perl
 | |
| (for example, many modules that use source code filters are mutually
 | |
| incompatible).
 | |
| 
 | |
| 
 | |
| =head1 BUGS AND LIMITATIONS
 | |
| 
 | |
| A list of known problems with the module, together with some indication
 | |
| whether they are likely to be fixed in an upcoming release.
 | |
| 
 | |
| Also a list of restrictions on the features the module does provide:
 | |
| data types that cannot be handled, performance issues and the circumstances
 | |
| in which they may arise, practical limitations on the size of data sets,
 | |
| special cases that are not (yet) handled, etc.
 | |
| 
 | |
| The initial template usually just has:
 | |
| 
 | |
| There are no known bugs in this module.
 | |
| Please report problems to <Maintainer name(s)>  (<contact address>)
 | |
| Patches are welcome.
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| <Author name(s)>  (<contact address>)
 | |
| 
 | |
| 
 | |
| =head1 LICENCE AND COPYRIGHT
 | |
| 
 | |
| Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
 | |
| 
 | |
| Followed by whatever licence you wish to release it under.
 | |
| For Perl code that is often just:
 | |
| 
 | |
| This module is free software; you can redistribute it and/or
 | |
| modify it under the same terms as Perl itself. See perldoc perlartistic.
 | |
| 
 | |
| This program is distributed in the hope that it will be useful,
 | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| #}}}
 | |
| 
 | |
| # [zo to open fold, zc to close]
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 | |
| 
 |