| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | #!/usr/bin/env perl | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #         FILE: view_derivatives | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #        USAGE: ./view_derivatives metadata_file | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  DESCRIPTION: Experimental! | 
					
						
							|  |  |  | #               Reads a JSON file containing IA metadata for an item (HPR | 
					
						
							|  |  |  | #               show) and processes 'derived' files from the IA. Their | 
					
						
							|  |  |  | #               relationships are usually simple, but sometimes can be rather | 
					
						
							|  |  |  | #               weird, so building a representation of the hierarchy might be | 
					
						
							|  |  |  | #               a way to understand what's there and maybe make a list for the | 
					
						
							|  |  |  | #               'ia delete' command to work on! | 
					
						
							|  |  |  | #               Each 'original' file may be accompanied by 'derived' files. So | 
					
						
							|  |  |  | #               an image file might generate a thumbnail for example. The | 
					
						
							|  |  |  | #               image will be marked in the JSON metadata as "original" and | 
					
						
							|  |  |  | #               the derived file as a "derivative" with the file it was built | 
					
						
							|  |  |  | #               from being its "parent". There can also be "derivative" files | 
					
						
							|  |  |  | #               built from other "derivative"s in rare cases. This has been | 
					
						
							|  |  |  | #               seen when an EPUB file is used to build a PDF "derivative", | 
					
						
							|  |  |  | #               and then other versions of the PDF are generated as | 
					
						
							|  |  |  | #               "derivatives" with the PDF as the "parent". | 
					
						
							|  |  |  | #               The point is that we do not want the "derivatives". We (HPR) | 
					
						
							|  |  |  | #               generate all of the versions of a given file we require: | 
					
						
							|  |  |  | #               audio types and thumbnails, for example. The tools we use can | 
					
						
							|  |  |  | #               disable the generation of "derivatives", so it has been rare | 
					
						
							|  |  |  | #               to see them in recent times. However, with older shows we | 
					
						
							|  |  |  | #               either couldn't stop the "derive" process, or some IA servers | 
					
						
							|  |  |  | #               ignored our '--no-derive' options. Now (August 2024) we are | 
					
						
							|  |  |  | #               cleaning up the HPR collection, and thus this script has been | 
					
						
							|  |  |  | #               developed. | 
					
						
							|  |  |  | #               Special note: any cleaning up of IA items needs to be done | 
					
						
							|  |  |  | #               before files are moved around. The parental relationships are | 
					
						
							|  |  |  | #               stored in the metadata, and do not track file movements. This | 
					
						
							|  |  |  | #               script cannot analyse metadata from after such rearrangements! | 
					
						
							|  |  |  | #               [It may be possible to adjust these metadata fields, but it | 
					
						
							|  |  |  | #               does not seem to be necessary if the correct sequence of | 
					
						
							|  |  |  | #               changes is adhered to - find derivatives, delete derivatives, | 
					
						
							|  |  |  | #               move files] | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      OPTIONS: --- | 
					
						
							|  |  |  | # REQUIREMENTS: --- | 
					
						
							|  |  |  | #         BUGS: --- | 
					
						
							|  |  |  | #        NOTES: --- | 
					
						
							|  |  |  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | #      VERSION: 0.0.4 | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | #      CREATED: 2024-08-12 16:26:29 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | #     REVISION: 2024-09-17 17:03:27 | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | # | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use v5.36; | 
					
						
							|  |  |  | use strict; | 
					
						
							|  |  |  | use warnings; | 
					
						
							|  |  |  | use feature qw{ postderef say signatures state try }; | 
					
						
							|  |  |  | no warnings | 
					
						
							|  |  |  |     qw{ experimental::postderef experimental::signatures experimental::try }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Carp; | 
					
						
							|  |  |  | use Getopt::Long; | 
					
						
							|  |  |  | use Pod::Usage; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use File::Slurper qw{ read_text read_lines }; | 
					
						
							|  |  |  | use JSON; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Data::Dumper; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Version number (Incremented by Vim) | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | our $VERSION = '0.0.4'; | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Script and directory names | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ( my $PROG = $0 ) =~ s|.*/||mx; | 
					
						
							|  |  |  | ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx; | 
					
						
							|  |  |  | $DIR = '.' unless $DIR; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Declarations | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Constants and other declarations | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $basedir    = "$ENV{HOME}/HPR/InternetArchive"; | 
					
						
							|  |  |  | my $outputfile = "$basedir/$PROG.svg"; | 
					
						
							|  |  |  | my $logfile    = "$basedir/$PROG.log"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my ( $file, $filebuffer, $json, @jsonbuffer ); | 
					
						
							|  |  |  | my ( %filetree, @original, @derived ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Options | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Option defaults | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $DEFDEBUG = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Options and arguments | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my %options; | 
					
						
							|  |  |  | Options( \%options ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Default help shows minimal information | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) | 
					
						
							|  |  |  |     if ( $options{'help'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The -documentation or -man option shows the full POD documentation through | 
					
						
							|  |  |  | # a pager for convenience | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) | 
					
						
							|  |  |  |     if ( $options{'documentation'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my $DEBUG   = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); | 
					
						
							|  |  |  | my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 ); | 
					
						
							|  |  |  | my $verbose = ( defined( $options{verbose} )   ? $options{verbose}   : 0 ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my $list_derived | 
					
						
							|  |  |  |     = ( defined( $options{list_derived} ) ? $options{list_derived} : 0 ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Get the filename argument (metadata in JSON format) | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $file = shift; | 
					
						
							|  |  |  | die "File of IA metadata is required\n" unless ($file); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Read the entire JSON file into a buffer | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | try { | 
					
						
							|  |  |  |     $filebuffer = read_text($file); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | catch ($e) { | 
					
						
							|  |  |  |     die "Failed to read JSON file $file\n"; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Parse the JSON from the buffer | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | try { | 
					
						
							|  |  |  |     $json = JSON->new; | 
					
						
							|  |  |  |     @jsonbuffer = $json->incr_parse($filebuffer); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | catch ($e) { | 
					
						
							|  |  |  |     die "Failed to parse JSON\n"; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | _debug($DEBUG > 3, '@jsonbuffer: ' . Dumper(\@jsonbuffer)); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Check that this is the type of JSON we need | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | die "Empty JSON?\n" unless (@jsonbuffer); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # We got an array of one element from 'ia metadata <item>'. Use $md to | 
					
						
							|  |  |  | # reference that one element | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $md = $jsonbuffer[0]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | # Collect the identifier from the parsed JSON and define the derived files we | 
					
						
							|  |  |  | # don't want to delete. (Found cases of audio files being "derived" in 1672 | 
					
						
							|  |  |  | # and 1664) | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | # | 
					
						
							|  |  |  | my $identifier = $md->{metadata}->{identifier}; | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | my $skip_re = qr{^${identifier}\.(flac|mp3|ogg|opus|png|spx|wav)$}; | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Build a hash from the original and derived files referenced in the metadata. | 
					
						
							|  |  |  | # The hash will contain all but a number of files which are not of interest | 
					
						
							|  |  |  | # when sorting out which file came from which. At the end of the loop the | 
					
						
							|  |  |  | # structure will contain back links with child files pointing to parents. The | 
					
						
							|  |  |  | # forward child links get added later. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The original names of files are also kept in an array to help with | 
					
						
							|  |  |  | # traversing the main hash. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | foreach my $file ( @{ $md->{files} } ) { | 
					
						
							|  |  |  |     my $fname = $file->{name}; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     printf "%-40s %-10s %s\n", $fname, $file->{source}, | 
					
						
							|  |  |  |         coalesce( $file->{original}, 'null' ) | 
					
						
							|  |  |  |         if ($verbose > 1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     unless ( $file->{source} =~ /[Mm]etadata/ | 
					
						
							|  |  |  |         || $file->{format} | 
					
						
							|  |  |  |         =~ /[Mm]etadata|Item Tile|Columbia Peaks|Spectrogram/ ) | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         $filetree{$fname} = { | 
					
						
							|  |  |  |             source   => $file->{source}, | 
					
						
							|  |  |  |             parent   => $file->{original}, | 
					
						
							|  |  |  |             format   => $file->{format}, | 
					
						
							|  |  |  |             children => [], | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if ( $file->{source} eq 'original' ) { | 
					
						
							|  |  |  |             push( @original, $fname ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         elsif ( $file->{source} eq 'derivative' ) { | 
					
						
							|  |  |  |             push( @derived, $fname ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Visit all the nodes for derived files. They point to their parent node, so | 
					
						
							|  |  |  | # we can visit this node and add the children to an internal array. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | foreach my $file (@derived) { | 
					
						
							|  |  |  |     if ( defined( $filetree{$file}->{parent} ) ) { | 
					
						
							|  |  |  |         push( | 
					
						
							|  |  |  |             @{ $filetree{ $filetree{$file}->{parent} }->{children} }, | 
					
						
							|  |  |  |             $file | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The trees are built, so we can dump the final result | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | _debug( | 
					
						
							|  |  |  |    $DEBUG > 2, | 
					
						
							|  |  |  |    '%filetree' | 
					
						
							|  |  |  |        . sprintf( '[%d]: ', scalar( keys(%filetree) ) ) | 
					
						
							|  |  |  |        . Dumper( \%filetree ) | 
					
						
							|  |  |  | ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | _debug($DEBUG > 2, '@original: ' . Dumper(\@original)); | 
					
						
							|  |  |  | _debug($DEBUG > 2, '@derived: ' . Dumper(\@derived)); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # If the verbosity level is greater than 0 scan all the 'original' files in | 
					
						
							|  |  |  | # the '$filetree' hash. For each one walk its 'children' array if there's | 
					
						
							|  |  |  | # anything in it, and recurse into each node it points to. It's possible to | 
					
						
							|  |  |  | # have further levels than the one below the parent, but very little seems to | 
					
						
							|  |  |  | # be set up this way so far. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ($verbose > 0) { | 
					
						
							|  |  |  |     foreach my $file ( sort(@original) ) { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Print the top level node and recurse into any children and print them | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         display_nodes( \%filetree, 0, $file ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     say '-' x 10; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | # | 
					
						
							|  |  |  | # List derived files that can be deleted, being careful not to delete the | 
					
						
							|  |  |  | # audio or the PNG image created by IA code. | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | if ($list_derived) { | 
					
						
							|  |  |  |     foreach my $file ( sort(@derived) ) { | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  |         say "$file" unless ($file =~ $skip_re); | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | exit; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: display_nodes | 
					
						
							|  |  |  | #      PURPOSE: Walks a tree of original and derived files in the IA metadata | 
					
						
							|  |  |  | #   PARAMETERS: $tree           Hashref pointing to the file tree | 
					
						
							|  |  |  | #               $level          The current level within the tree | 
					
						
							|  |  |  | #               $key            Hash key for the current node | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: Recurses through a tree | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub display_nodes { | 
					
						
							|  |  |  |     my ($tree, $level, $key) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Report this node | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     printf "%s%d: %s\n", "\t" x $level, $level, $key; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # If there are children we use each to recurse to a lower level | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if (scalar(@{$tree->{$key}->{children}}) > 0) { | 
					
						
							|  |  |  |         $level++; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         foreach my $child (@{$tree->{$key}->{children}}) { | 
					
						
							|  |  |  |             display_nodes($tree, $level, $child); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: coalesce | 
					
						
							|  |  |  | #      PURPOSE: To find the first defined argument and return it | 
					
						
							|  |  |  | #   PARAMETERS: Arbitrary number of arguments | 
					
						
							|  |  |  | #      RETURNS: The first defined argument or undef if there are none | 
					
						
							|  |  |  | #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never | 
					
						
							|  |  |  | #               returned when doing so might be a problem. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub coalesce { | 
					
						
							|  |  |  |     foreach (@_) { | 
					
						
							|  |  |  |         return $_ if defined($_); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return undef;    ## no critic | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: _debug | 
					
						
							|  |  |  | #      PURPOSE: Prints debug reports | 
					
						
							|  |  |  | #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print | 
					
						
							|  |  |  | #               $message        Message to print | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing | 
					
						
							|  |  |  | #               newline and then adds one in the 'print' to the caller doesn't | 
					
						
							|  |  |  | #               have to bother. Prepends the message with 'D> ' to show it's | 
					
						
							|  |  |  | #               a debug message. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub _debug { | 
					
						
							|  |  |  |     my ( $active, $message ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     chomp($message); | 
					
						
							|  |  |  |     print "D> $message\n" if $active; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: Options | 
					
						
							|  |  |  | #      PURPOSE: Processes command-line options | 
					
						
							|  |  |  | #   PARAMETERS: $optref     Hash reference to hold the options | 
					
						
							|  |  |  | #      RETURNS: Undef | 
					
						
							|  |  |  | #  DESCRIPTION: Process the options we want to offer. See the documentation | 
					
						
							|  |  |  | #               for details | 
					
						
							|  |  |  | #       THROWS: no exceptions | 
					
						
							|  |  |  | #     COMMENTS: none | 
					
						
							|  |  |  | #     SEE ALSO: n/a | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub Options { | 
					
						
							|  |  |  |     my ($optref) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my @options = ( | 
					
						
							|  |  |  |         "help",     "documentation|man", "debug=i", "dry-run!", | 
					
						
							|  |  |  |         "verbose+", "list_derived!" | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( !GetOptions( $optref, @options ) ) { | 
					
						
							|  |  |  |         pod2usage( | 
					
						
							|  |  |  |             -msg     => "$PROG version $VERSION\n", | 
					
						
							|  |  |  |             -exitval => 1, | 
					
						
							|  |  |  |             -verbose => 0 | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | __END__ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #  Application Documentation | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #{{{ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 NAME | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | view_derivatives - a tool to analyse IA metadata | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | =head1 VERSION | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | This documentation refers to view_derivatives version 0.0.4 | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | =head1 USAGE | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  |     view_derivatives [-help] [-documentation|-man] [-debug=N] [-[no]dry-run] | 
					
						
							|  |  |  |         [-verbose [-verbose] ...] [-[no]list_derived] metadata_file | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  |     # Parse the metadata and report the relationships beteen files | 
					
						
							|  |  |  |     view_derivatives -verb METADATA | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  |     # Parse the metadata and write out a list of derived files for potential | 
					
						
							|  |  |  |     # deletion. | 
					
						
							|  |  |  |     view_derivatives -list_derived METADATA > FILE | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | =head1 REQUIRED ARGUMENTS | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | The name of a file created by the following command: | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  |     ia metadata "show" > metadata_file | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | The file is expected to contain one JSON object (in a one-element array). If | 
					
						
							|  |  |  | it contains more objects only the first will be processed. | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | =head1 OPTIONS | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-help> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Prints a brief help message describing the usage of the program, and then exits. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-documentation> B<-man> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Displays the entirety of the documentation (using a pager), and then exits. To | 
					
						
							|  |  |  | generate a PDF version use the I<pod2pdf> tool from | 
					
						
							|  |  |  | I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be | 
					
						
							|  |  |  | installed with the cpan tool as App::pod2pdf. Use the command: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     pod2pdf view_derivatives --out=view_derivatives.pdf | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-debug=N> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Selects a level of debugging. Debug information consists of a line or series | 
					
						
							|  |  |  | of lines prefixed with the characters 'D>': | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<0> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | No debug output is generated: this is the default | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<3> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Prints all data structures from options | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (The debug levels need work!) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]dry-run> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Enable/disable dry run mode (default off) | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | =item B<-verbose> | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | Sets the verbosity level. If the option is omitted then the level is zero (no | 
					
						
							|  |  |  | verbose output). Thereafter, for each occurrence of the option the verbosity | 
					
						
							|  |  |  | level is incremented. Only levels 1 and 2 are currently catered for. Any | 
					
						
							|  |  |  | levels above 2 produce the same result as level 2. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]list_derived> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option is off by default. Turning it on causes the script to write all | 
					
						
							|  |  |  | derived files to standard output. If the verbosity level is zero this is the | 
					
						
							|  |  |  | only output from the script. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The idea is that at verbosity level 1 or 2 information is displayed about the | 
					
						
							|  |  |  | relationship of files in the metadata, for human consumption. If | 
					
						
							|  |  |  | B<-nolist_derived> is the setting (or default) then this is all that is shown. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If the verbosity level is zero and B<-list_derived> is on then only the list | 
					
						
							|  |  |  | of derived files will be generated, and this can be used to delete the files | 
					
						
							|  |  |  | from the IA. | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | =head1 DESCRIPTION | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | =head2 OVERVIEW | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Items on the IA (Internet Archive, or I<archive.org>) consist of metadata and | 
					
						
							|  |  |  | files. Each item generated for HPR is a show or episode. Most files comprising | 
					
						
							|  |  |  | the episode on the IA are those which are part of the episode on the HPR | 
					
						
							|  |  |  | server. A few extra files are created by the IA software, but these are part | 
					
						
							|  |  |  | of the metadata (HTML details, upload date, etc.) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | By default the IA software will create additional files which are derived from | 
					
						
							|  |  |  | the original files. Typical examples are other audio formats, such as Ogg or | 
					
						
							|  |  |  | Mp3. We have been disabling this derivation process for several years for | 
					
						
							|  |  |  | various reasons, preferring to generate our own derivatives. IA-generated | 
					
						
							|  |  |  | audio derivatives do not have ID3 and similar tags, whereas HPR-generated | 
					
						
							|  |  |  | audio formats do. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Historically it was difficult to disable the derivation process. Even though | 
					
						
							|  |  |  | there were settings to do this they apparently didn't work on all of the | 
					
						
							|  |  |  | servers making up the IA, and so older items may have many derived files. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This script assists with identifying unwanted derivatives and with their | 
					
						
							|  |  |  | deletion. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head2 METADATA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The metadata for an item can be obtained (by a registered user) from the IA | 
					
						
							|  |  |  | using the B<ia> tool. Its format is JSON, and this script uses a JSON module | 
					
						
							|  |  |  | to parse it. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head2 FILE RELATIONSHIPS | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | The JSON metadata contains details of all files comprising the IA item. | 
					
						
							|  |  |  | It contains details such as the name, size, and type of each file. It also | 
					
						
							|  |  |  | categorises files into groups such as I<original> and I<derived>. Files which | 
					
						
							|  |  |  | are derived have parents. The script uses this to build tree-like data | 
					
						
							|  |  |  | structures of derived files based on the original files. All children of an | 
					
						
							|  |  |  | original file will be derived, but some derived files may also have children. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The derivatives can be classified simply as children of original files or of | 
					
						
							|  |  |  | derived files. These are what are listed if required and what are used in the | 
					
						
							|  |  |  | deletion process. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head2 DELETING UNWANTED DERIVATIVES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The simplest method is to pipe the output from the script with verbose level | 
					
						
							|  |  |  | zero and with B<-list_derived> enabled into B<xargs> in order to run a command | 
					
						
							|  |  |  | which will delete the unwanted derivatives. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | One usage is: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ./view_derivatives -list_derived metadata.json |\ | 
					
						
							|  |  |  |         xargs ia delete hpr1234 --no-backup | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This will generate a list of files to be deleted, then pipe them to B<xargs> | 
					
						
							|  |  |  | which will construct a command by appending the names to the command template. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This approach is not ideal since it does not handle the case where there is | 
					
						
							|  |  |  | nothing to delete. The script B<snapshot_metadata> manages this situation by | 
					
						
							|  |  |  | generating the metadata and saving it in a file, then it runs | 
					
						
							|  |  |  | B<view_derivatives> on this file and generates a file of derivatives. If this | 
					
						
							|  |  |  | file is not empty it can be used to perform the deletions, and otherwise no | 
					
						
							|  |  |  | attempt will be made. | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | =head1 DIAGNOSTICS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of every error and warning message that the application can generate | 
					
						
							|  |  |  | (even the ones that will "never happen"), with a full explanation of each | 
					
						
							|  |  |  | problem, one or more likely causes, and any suggested remedies. If the | 
					
						
							|  |  |  | application generates exit status codes (e.g. under Unix) then list the exit | 
					
						
							|  |  |  | status associated with each error. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 CONFIGURATION AND ENVIRONMENT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A full explanation of any configuration system(s) used by the application, | 
					
						
							|  |  |  | including the names and locations of any configuration files, and the | 
					
						
							|  |  |  | meaning of any environment variables or properties that can be set. These | 
					
						
							|  |  |  | descriptions must also include details of any configuration language used | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DEPENDENCIES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of all the other modules that this module relies upon, including any | 
					
						
							|  |  |  | restrictions on versions, and an indication whether these required modules are | 
					
						
							|  |  |  | part of the standard Perl distribution, part of the module's distribution, | 
					
						
							|  |  |  | or must be installed separately. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 INCOMPATIBILITIES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of any modules that this module cannot be used in conjunction with. | 
					
						
							|  |  |  | This may be due to name conflicts in the interface, or competition for | 
					
						
							|  |  |  | system or program resources, or due to internal limitations of Perl | 
					
						
							|  |  |  | (for example, many modules that use source code filters are mutually | 
					
						
							|  |  |  | incompatible). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 BUGS AND LIMITATIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of known problems with the module, together with some indication | 
					
						
							|  |  |  | whether they are likely to be fixed in an upcoming release. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Also a list of restrictions on the features the module does provide: | 
					
						
							|  |  |  | data types that cannot be handled, performance issues and the circumstances | 
					
						
							|  |  |  | in which they may arise, practical limitations on the size of data sets, | 
					
						
							|  |  |  | special cases that are not (yet) handled, etc. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The initial template usually just has: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There are no known bugs in this module. | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | Please report problems to Dave Morriss (dave.morriss@gmail.com) | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | Patches are welcome. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | =head1 AUTHOR | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | Dave Morriss (dave.morriss@gmail.com) | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 LICENCE AND COPYRIGHT | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-23 22:28:52 +00:00
										 |  |  | Copyright (c) 2024 Dave Morriss (dave.morriss@gmail.com). All rights reserved. | 
					
						
							| 
									
										
										
										
											2024-08-22 13:25:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =cut | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #}}} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # [zo to open fold, zc to close] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | 
					
						
							|  |  |  | 
 |