forked from HPR/hpr-tools
501 lines
15 KiB
Plaintext
501 lines
15 KiB
Plaintext
|
#!/usr/bin/env perl
|
||
|
#===============================================================================
|
||
|
#
|
||
|
# FILE: view_derivatives
|
||
|
#
|
||
|
# USAGE: ./view_derivatives metadata_file
|
||
|
#
|
||
|
# DESCRIPTION: Experimental!
|
||
|
# Reads a JSON file containing IA metadata for an item (HPR
|
||
|
# show) and processes 'derived' files from the IA. Their
|
||
|
# relationships are usually simple, but sometimes can be rather
|
||
|
# weird, so building a representation of the hierarchy might be
|
||
|
# a way to understand what's there and maybe make a list for the
|
||
|
# 'ia delete' command to work on!
|
||
|
# Each 'original' file may be accompanied by 'derived' files. So
|
||
|
# an image file might generate a thumbnail for example. The
|
||
|
# image will be marked in the JSON metadata as "original" and
|
||
|
# the derived file as a "derivative" with the file it was built
|
||
|
# from being its "parent". There can also be "derivative" files
|
||
|
# built from other "derivative"s in rare cases. This has been
|
||
|
# seen when an EPUB file is used to build a PDF "derivative",
|
||
|
# and then other versions of the PDF are generated as
|
||
|
# "derivatives" with the PDF as the "parent".
|
||
|
# The point is that we do not want the "derivatives". We (HPR)
|
||
|
# generate all of the versions of a given file we require:
|
||
|
# audio types and thumbnails, for example. The tools we use can
|
||
|
# disable the generation of "derivatives", so it has been rare
|
||
|
# to see them in recent times. However, with older shows we
|
||
|
# either couldn't stop the "derive" process, or some IA servers
|
||
|
# ignored our '--no-derive' options. Now (August 2024) we are
|
||
|
# cleaning up the HPR collection, and thus this script has been
|
||
|
# developed.
|
||
|
# Special note: any cleaning up of IA items needs to be done
|
||
|
# before files are moved around. The parental relationships are
|
||
|
# stored in the metadata, and do not track file movements. This
|
||
|
# script cannot analyse metadata from after such rearrangements!
|
||
|
# [It may be possible to adjust these metadata fields, but it
|
||
|
# does not seem to be necessary if the correct sequence of
|
||
|
# changes is adhered to - find derivatives, delete derivatives,
|
||
|
# move files]
|
||
|
#
|
||
|
# OPTIONS: ---
|
||
|
# REQUIREMENTS: ---
|
||
|
# BUGS: ---
|
||
|
# NOTES: ---
|
||
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||
|
# VERSION: 0.0.2
|
||
|
# CREATED: 2024-08-12 16:26:29
|
||
|
# REVISION: 2024-08-17 13:44:44
|
||
|
#
|
||
|
#===============================================================================
|
||
|
|
||
|
use v5.36;
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use feature qw{ postderef say signatures state try };
|
||
|
no warnings
|
||
|
qw{ experimental::postderef experimental::signatures experimental::try };
|
||
|
|
||
|
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
|
||
|
|
||
|
use Carp;
|
||
|
use Getopt::Long;
|
||
|
use Pod::Usage;
|
||
|
|
||
|
use File::Slurper qw{ read_text read_lines };
|
||
|
use JSON;
|
||
|
|
||
|
use Data::Dumper;
|
||
|
|
||
|
#
|
||
|
# Version number (Incremented by Vim)
|
||
|
#
|
||
|
our $VERSION = '0.0.2';
|
||
|
|
||
|
#
|
||
|
# Script and directory names
|
||
|
#
|
||
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
||
|
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
||
|
$DIR = '.' unless $DIR;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Declarations
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Constants and other declarations
|
||
|
#
|
||
|
my $basedir = "$ENV{HOME}/HPR/InternetArchive";
|
||
|
my $outputfile = "$basedir/$PROG.svg";
|
||
|
my $logfile = "$basedir/$PROG.log";
|
||
|
|
||
|
my ( $file, $filebuffer, $json, @jsonbuffer );
|
||
|
my ( %filetree, @original, @derived );
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Options
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Option defaults
|
||
|
#
|
||
|
my $DEFDEBUG = 0;
|
||
|
|
||
|
#
|
||
|
# Options and arguments
|
||
|
#
|
||
|
my %options;
|
||
|
Options( \%options );
|
||
|
|
||
|
#
|
||
|
# Default help shows minimal information
|
||
|
#
|
||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
|
||
|
if ( $options{'help'} );
|
||
|
|
||
|
#
|
||
|
# The -documentation or -man option shows the full POD documentation through
|
||
|
# a pager for convenience
|
||
|
#
|
||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
|
||
|
if ( $options{'documentation'} );
|
||
|
|
||
|
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
|
||
|
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
|
||
|
my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
|
||
|
|
||
|
my $list_derived
|
||
|
= ( defined( $options{list_derived} ) ? $options{list_derived} : 0 );
|
||
|
|
||
|
#
|
||
|
# Get the filename argument (metadata in JSON format)
|
||
|
#
|
||
|
$file = shift;
|
||
|
die "File of IA metadata is required\n" unless ($file);
|
||
|
|
||
|
#
|
||
|
# Read the entire JSON file into a buffer
|
||
|
#
|
||
|
try {
|
||
|
$filebuffer = read_text($file);
|
||
|
}
|
||
|
|
||
|
catch ($e) {
|
||
|
die "Failed to read JSON file $file\n";
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Parse the JSON from the buffer
|
||
|
#
|
||
|
try {
|
||
|
$json = JSON->new;
|
||
|
@jsonbuffer = $json->incr_parse($filebuffer);
|
||
|
}
|
||
|
|
||
|
catch ($e) {
|
||
|
die "Failed to parse JSON\n";
|
||
|
}
|
||
|
|
||
|
_debug($DEBUG > 3, '@jsonbuffer: ' . Dumper(\@jsonbuffer));
|
||
|
|
||
|
#
|
||
|
# Check that this is the type of JSON we need
|
||
|
#
|
||
|
die "Empty JSON?\n" unless (@jsonbuffer);
|
||
|
|
||
|
#
|
||
|
# We got an array of one element from 'ia metadata <item>'. Use $md to
|
||
|
# reference that one element
|
||
|
#
|
||
|
my $md = $jsonbuffer[0];
|
||
|
|
||
|
#
|
||
|
# Collect the identifier from the parsed JSON and define the one derived file
|
||
|
# we don't want to delete.
|
||
|
#
|
||
|
my $identifier = $md->{metadata}->{identifier};
|
||
|
my $item_png = "${identifier}.png";
|
||
|
|
||
|
#
|
||
|
# Build a hash from the original and derived files referenced in the metadata.
|
||
|
# The hash will contain all but a number of files which are not of interest
|
||
|
# when sorting out which file came from which. At the end of the loop the
|
||
|
# structure will contain back links with child files pointing to parents. The
|
||
|
# forward child links get added later.
|
||
|
#
|
||
|
# The original names of files are also kept in an array to help with
|
||
|
# traversing the main hash.
|
||
|
#
|
||
|
foreach my $file ( @{ $md->{files} } ) {
|
||
|
my $fname = $file->{name};
|
||
|
|
||
|
printf "%-40s %-10s %s\n", $fname, $file->{source},
|
||
|
coalesce( $file->{original}, 'null' )
|
||
|
if ($verbose > 1);
|
||
|
|
||
|
unless ( $file->{source} =~ /[Mm]etadata/
|
||
|
|| $file->{format}
|
||
|
=~ /[Mm]etadata|Item Tile|Columbia Peaks|Spectrogram/ )
|
||
|
{
|
||
|
$filetree{$fname} = {
|
||
|
source => $file->{source},
|
||
|
parent => $file->{original},
|
||
|
format => $file->{format},
|
||
|
children => [],
|
||
|
};
|
||
|
|
||
|
if ( $file->{source} eq 'original' ) {
|
||
|
push( @original, $fname );
|
||
|
}
|
||
|
elsif ( $file->{source} eq 'derivative' ) {
|
||
|
push( @derived, $fname );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Visit all the nodes for derived files. They point to their parent node, so
|
||
|
# we can visit this node and add the children to an internal array.
|
||
|
#
|
||
|
foreach my $file (@derived) {
|
||
|
if ( defined( $filetree{$file}->{parent} ) ) {
|
||
|
push(
|
||
|
@{ $filetree{ $filetree{$file}->{parent} }->{children} },
|
||
|
$file
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# The trees are built, so we can dump the final result
|
||
|
#
|
||
|
_debug(
|
||
|
$DEBUG > 2,
|
||
|
'%filetree'
|
||
|
. sprintf( '[%d]: ', scalar( keys(%filetree) ) )
|
||
|
. Dumper( \%filetree )
|
||
|
);
|
||
|
|
||
|
_debug($DEBUG > 2, '@original: ' . Dumper(\@original));
|
||
|
_debug($DEBUG > 2, '@derived: ' . Dumper(\@derived));
|
||
|
|
||
|
#
|
||
|
# If the verbosity level is greater than 0 scan all the 'original' files in
|
||
|
# the '$filetree' hash. For each one walk its 'children' array if there's
|
||
|
# anything in it, and recurse into each node it points to. It's possible to
|
||
|
# have further levels than the one below the parent, but very little seems to
|
||
|
# be set up this way so far.
|
||
|
#
|
||
|
if ($verbose > 0) {
|
||
|
foreach my $file ( sort(@original) ) {
|
||
|
#
|
||
|
# Print the top level node and recurse into any children and print them
|
||
|
#
|
||
|
display_nodes( \%filetree, 0, $file );
|
||
|
}
|
||
|
say '-' x 10;
|
||
|
}
|
||
|
|
||
|
if ($list_derived) {
|
||
|
foreach my $file ( sort(@derived) ) {
|
||
|
say "$file" unless ($file eq $item_png);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
exit;
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: display_nodes
|
||
|
# PURPOSE: Walks a tree of original and derived files in the IA metadata
|
||
|
# PARAMETERS: $tree Hashref pointing to the file tree
|
||
|
# $level The current level within the tree
|
||
|
# $key Hash key for the current node
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Recurses through a tree
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub display_nodes {
|
||
|
my ($tree, $level, $key) = @_;
|
||
|
|
||
|
#
|
||
|
# Report this node
|
||
|
#
|
||
|
printf "%s%d: %s\n", "\t" x $level, $level, $key;
|
||
|
|
||
|
#
|
||
|
# If there are children we use each to recurse to a lower level
|
||
|
#
|
||
|
if (scalar(@{$tree->{$key}->{children}}) > 0) {
|
||
|
$level++;
|
||
|
|
||
|
foreach my $child (@{$tree->{$key}->{children}}) {
|
||
|
display_nodes($tree, $level, $child);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: coalesce
|
||
|
# PURPOSE: To find the first defined argument and return it
|
||
|
# PARAMETERS: Arbitrary number of arguments
|
||
|
# RETURNS: The first defined argument or undef if there are none
|
||
|
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
|
||
|
# returned when doing so might be a problem.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub coalesce {
|
||
|
foreach (@_) {
|
||
|
return $_ if defined($_);
|
||
|
}
|
||
|
return undef; ## no critic
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: _debug
|
||
|
# PURPOSE: Prints debug reports
|
||
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
||
|
# $message Message to print
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
|
||
|
# newline and then adds one in the 'print' to the caller doesn't
|
||
|
# have to bother. Prepends the message with 'D> ' to show it's
|
||
|
# a debug message.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub _debug {
|
||
|
my ( $active, $message ) = @_;
|
||
|
|
||
|
chomp($message);
|
||
|
print "D> $message\n" if $active;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: Options
|
||
|
# PURPOSE: Processes command-line options
|
||
|
# PARAMETERS: $optref Hash reference to hold the options
|
||
|
# RETURNS: Undef
|
||
|
# DESCRIPTION: Process the options we want to offer. See the documentation
|
||
|
# for details
|
||
|
# THROWS: no exceptions
|
||
|
# COMMENTS: none
|
||
|
# SEE ALSO: n/a
|
||
|
#===============================================================================
|
||
|
sub Options {
|
||
|
my ($optref) = @_;
|
||
|
|
||
|
my @options = (
|
||
|
"help", "documentation|man", "debug=i", "dry-run!",
|
||
|
"verbose+", "list_derived!"
|
||
|
);
|
||
|
|
||
|
if ( !GetOptions( $optref, @options ) ) {
|
||
|
pod2usage(
|
||
|
-msg => "$PROG version $VERSION\n",
|
||
|
-exitval => 1,
|
||
|
-verbose => 0
|
||
|
);
|
||
|
}
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
__END__
|
||
|
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
# Application Documentation
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
#{{{
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
<application name> - <One line description of application's purpose>
|
||
|
|
||
|
=head1 VERSION
|
||
|
|
||
|
The initial template usually just has:
|
||
|
|
||
|
This documentation refers to <application name> version 0.0.2
|
||
|
|
||
|
|
||
|
=head1 USAGE
|
||
|
|
||
|
# Brief working invocation example(s) here showing the most common usage(s)
|
||
|
|
||
|
# This section will be as far as many users ever read
|
||
|
# so make it as educational and exemplary as possible.
|
||
|
|
||
|
|
||
|
=head1 REQUIRED ARGUMENTS
|
||
|
|
||
|
A complete list of every argument that must appear on the command line.
|
||
|
when the application is invoked, explaining what each of them does, any
|
||
|
restrictions on where each one may appear (i.e. flags that must appear
|
||
|
before or after filenames), and how the various arguments and options
|
||
|
may interact (e.g. mutual exclusions, required combinations, etc.)
|
||
|
|
||
|
If all of the application's arguments are optional this section
|
||
|
may be omitted entirely.
|
||
|
|
||
|
|
||
|
=head1 OPTIONS
|
||
|
|
||
|
A complete list of every available option with which the application
|
||
|
can be invoked, explaining what each does, and listing any restrictions,
|
||
|
or interactions.
|
||
|
|
||
|
If the application has no options this section may be omitted entirely.
|
||
|
|
||
|
|
||
|
=head1 DESCRIPTION
|
||
|
|
||
|
A full description of the application and its features.
|
||
|
May include numerous subsections (i.e. =head2, =head3, etc.)
|
||
|
|
||
|
|
||
|
=head1 DIAGNOSTICS
|
||
|
|
||
|
A list of every error and warning message that the application can generate
|
||
|
(even the ones that will "never happen"), with a full explanation of each
|
||
|
problem, one or more likely causes, and any suggested remedies. If the
|
||
|
application generates exit status codes (e.g. under Unix) then list the exit
|
||
|
status associated with each error.
|
||
|
|
||
|
|
||
|
=head1 CONFIGURATION AND ENVIRONMENT
|
||
|
|
||
|
A full explanation of any configuration system(s) used by the application,
|
||
|
including the names and locations of any configuration files, and the
|
||
|
meaning of any environment variables or properties that can be set. These
|
||
|
descriptions must also include details of any configuration language used
|
||
|
|
||
|
|
||
|
=head1 DEPENDENCIES
|
||
|
|
||
|
A list of all the other modules that this module relies upon, including any
|
||
|
restrictions on versions, and an indication whether these required modules are
|
||
|
part of the standard Perl distribution, part of the module's distribution,
|
||
|
or must be installed separately.
|
||
|
|
||
|
|
||
|
=head1 INCOMPATIBILITIES
|
||
|
|
||
|
A list of any modules that this module cannot be used in conjunction with.
|
||
|
This may be due to name conflicts in the interface, or competition for
|
||
|
system or program resources, or due to internal limitations of Perl
|
||
|
(for example, many modules that use source code filters are mutually
|
||
|
incompatible).
|
||
|
|
||
|
|
||
|
=head1 BUGS AND LIMITATIONS
|
||
|
|
||
|
A list of known problems with the module, together with some indication
|
||
|
whether they are likely to be fixed in an upcoming release.
|
||
|
|
||
|
Also a list of restrictions on the features the module does provide:
|
||
|
data types that cannot be handled, performance issues and the circumstances
|
||
|
in which they may arise, practical limitations on the size of data sets,
|
||
|
special cases that are not (yet) handled, etc.
|
||
|
|
||
|
The initial template usually just has:
|
||
|
|
||
|
There are no known bugs in this module.
|
||
|
Please report problems to <Maintainer name(s)> (<contact address>)
|
||
|
Patches are welcome.
|
||
|
|
||
|
=head1 AUTHOR
|
||
|
|
||
|
<Author name(s)> (<contact address>)
|
||
|
|
||
|
|
||
|
=head1 LICENCE AND COPYRIGHT
|
||
|
|
||
|
Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
|
||
|
|
||
|
Followed by whatever licence you wish to release it under.
|
||
|
For Perl code that is often just:
|
||
|
|
||
|
This module is free software; you can redistribute it and/or
|
||
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
||
|
|
||
|
This program is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||
|
|
||
|
=cut
|
||
|
|
||
|
#}}}
|
||
|
|
||
|
# [zo to open fold, zc to close]
|
||
|
|
||
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
||
|
|