hpr-tools/InternetArchive/view_derivatives

#!/usr/bin/env perl
#===============================================================================
#
#         FILE: view_derivatives
#
#        USAGE: ./view_derivatives metadata_file
#
#  DESCRIPTION: Experimental!
#               Reads a JSON file containing IA metadata for an item (HPR
#               show) and processes 'derived' files from the IA. Their
#               relationships are usually simple, but sometimes can be rather
#               weird, so building a representation of the hierarchy might be
#               a way to understand what's there and maybe make a list for the
#               'ia delete' command to work on!
#               Each 'original' file may be accompanied by 'derived' files. So
#               an image file might generate a thumbnail for example. The
#               image will be marked in the JSON metadata as "original" and
#               the derived file as a "derivative" with the file it was built
#               from being its "parent". There can also be "derivative" files
#               built from other "derivative"s in rare cases. This has been
#               seen when an EPUB file is used to build a PDF "derivative",
#               and then other versions of the PDF are generated as
#               "derivatives" with the PDF as the "parent".
#               The point is that we do not want the "derivatives". We (HPR)
#               generate all of the versions of a given file we require:
#               audio types and thumbnails, for example. The tools we use can
#               disable the generation of "derivatives", so it has been rare
#               to see them in recent times. However, with older shows we
#               either couldn't stop the "derive" process, or some IA servers
#               ignored our '--no-derive' options. Now (August 2024) we are
#               cleaning up the HPR collection, and thus this script has been
#               developed.
#               Special note: any cleaning up of IA items needs to be done
#               before files are moved around. The parental relationships are
#               stored in the metadata, and do not track file movements. This
#               script cannot analyse metadata from after such rearrangements!
#               [It may be possible to adjust these metadata fields, but it
#               does not seem to be necessary if the correct sequence of
#               changes is adhered to - find derivatives, delete derivatives,
#               move files]
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.4
#      CREATED: 2024-08-12 16:26:29
#     REVISION: 2024-09-17 17:03:27
#
#===============================================================================

use v5.36;
use strict;
use warnings;
use feature qw{ postderef say signatures state try };
no warnings
    qw{ experimental::postderef experimental::signatures experimental::try };

use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8

use Carp;
use Getopt::Long;
use Pod::Usage;

use File::Slurper qw{ read_text read_lines };
use JSON;

use Data::Dumper;

#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.4';

#
# Script and directory names, both taken from $0: $PROG is $0 with everything
# up to the last '/' removed, $DIR is $0 with the final path component
# removed (or '.' when nothing is left)
#
my $PROG = ( $0 =~ s|.*/||mxr );
my $DIR  = ( $0 =~ s|/?[^/]*$||mxr );
$DIR ||= '.';

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Locations built from the user's home directory and the script name
#
my $basedir    = "$ENV{HOME}/HPR/InternetArchive";
my $outputfile = "$basedir/$PROG.svg";
my $logfile    = "$basedir/$PROG.log";

#
# Metadata file name, its raw contents, the JSON parser object and the list
# of structures the parser returns
#
my $file;
my $filebuffer;
my $json;
my @jsonbuffer;

#
# The file hierarchy (keyed by file name) and the names of the 'original' and
# 'derivative' files found in the metadata
#
my %filetree;
my @original;
my @derived;

#-------------------------------------------------------------------------------
# Options
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

#
# Collect the command-line options into a hash
#
my %options;
Options( \%options );

#
# -help: show the minimal usage information and exit
#
if ( $options{'help'} ) {
    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -exitval => 1,
        -verbose => 0
    );
}

#
# -documentation or -man: show the full POD documentation through a pager
# for convenience, then exit
#
if ( $options{'documentation'} ) {
    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -exitval => 1,
        -verbose => 2
    );
}

#
# Settings taken from the options, falling back to a default when an option
# was not given
#
my $DEBUG        = $options{debug}        // $DEFDEBUG;
my $dry_run      = $options{'dry-run'}    // 0;
my $verbose      = $options{verbose}      // 0;
my $list_derived = $options{list_derived} // 0;

#
# The first remaining argument is the name of the metadata file (JSON format)
#
$file = shift(@ARGV);
if ( !$file ) {
    die "File of IA metadata is required\n";
}

#
# Read the entire JSON file into a buffer. The reason for any failure is
# passed on to the user rather than being discarded.
#
try {
    $filebuffer = read_text($file);
}
catch ($e) {
    die "Failed to read JSON file $file: $e";
}

#
# Parse the JSON from the buffer. In list context 'incr_parse' returns every
# complete JSON text it finds, so @jsonbuffer may hold more than one element.
#
try {
    $json       = JSON->new;
    @jsonbuffer = $json->incr_parse($filebuffer);
}
catch ($e) {
    die "Failed to parse JSON in $file: $e";
}

_debug( $DEBUG > 3, '@jsonbuffer: ' . Dumper( \@jsonbuffer ) );

#
# Check that this is the type of JSON we need: something must have been
# parsed ...
#
die "Empty JSON?\n" unless (@jsonbuffer);

#
# ... and the first element, which is the only one used and is referenced
# through $md from here on, must be a JSON object (a hash). Anything else
# would otherwise fail later with an obscure "Not a HASH reference" error.
#
my $md = $jsonbuffer[0];
die "Unexpected JSON in $file: the top-level element is not an object\n"
    unless ( ref($md) eq 'HASH' );

#
# Collect the identifier from the parsed JSON and define the derived files we
# don't want to delete. (Found cases of audio files being "derived" in 1672
# and 1664)
#
# The identifier is quoted with \Q...\E so that any regex metacharacters it
# contains (a '.' for example) are matched literally. The group of extensions
# is non-capturing because only the success of the match is ever used.
#
my $identifier = $md->{metadata}->{identifier};
my $skip_re
    = qr{^\Q$identifier\E\.(?:flac|mp3|ogg|opus|png|spx|wav)$};

#
# Walk the list of files in the metadata and record each one of interest in
# %filetree, keyed by file name. Each node holds the file's source, format
# and parent (the 'original' field of the metadata), so when the loop ends
# the children point back at their parents. The forward links (parent to
# children) start out empty and are filled in later.
#
# The names of the 'original' and 'derivative' files are also collected in
# arrays to help with traversing the main hash.
#
foreach my $entry ( @{ $md->{files} } ) {
    my $fname = $entry->{name};

    #
    # At verbosity 2 and above report every file, wanted or not
    #
    if ( $verbose > 1 ) {
        printf "%-40s %-10s %s\n", $fname, $entry->{source},
            coalesce( $entry->{original}, 'null' );
    }

    #
    # Files whose source or format marks them as metadata, and files in the
    # other listed formats, are not of interest and are left out of the hash
    #
    next if ( $entry->{source} =~ /[Mm]etadata/ );
    next
        if ( $entry->{format}
        =~ /[Mm]etadata|Item Tile|Columbia Peaks|Spectrogram/ );

    $filetree{$fname} = {
        source   => $entry->{source},
        parent   => $entry->{original},
        format   => $entry->{format},
        children => [],
    };

    #
    # Remember the name in the list that matches the file's source
    #
    my $source = $entry->{source};
    if ( $source eq 'original' ) {
        push( @original, $fname );
    }
    elsif ( $source eq 'derivative' ) {
        push( @derived, $fname );
    }
}

#
# Visit all the nodes for derived files. They point to their parent node, so
# we can visit this node and add the children to an internal array.
#
# A derived file is only linked when its parent is actually present in
# %filetree. Pushing onto the 'children' array of a parent which was never
# stored would make Perl autovivify a partial node for it (one with no
# source, parent or format), adding a phantom entry to the hash.
#
foreach my $child (@derived) {
    my $parent = $filetree{$child}->{parent};

    next unless ( defined($parent) && exists( $filetree{$parent} ) );

    push( @{ $filetree{$parent}->{children} }, $child );
}

#
# The trees are built, so when the debug level is above 2 dump the final
# result: the main hash (with a count of its keys) followed by the lists of
# original and derived file names
#
foreach my $report (
    [ sprintf( '%%filetree[%d]: ', scalar( keys(%filetree) ) ), \%filetree ],
    [ '@original: ', \@original ],
    [ '@derived: ',  \@derived ],
    )
{
    my ( $label, $data ) = @{$report};
    _debug( $DEBUG > 2, $label . Dumper($data) );
}

#
# At any verbosity level above 0 show the hierarchy. Every 'original' file is
# a top-level node (level 0) and is visited in sorted name order;
# 'display_nodes' prints it and then recurses into whatever is in its
# 'children' array. Levels deeper than the one directly below the parent are
# possible, but very little seems to be set up this way so far. A line of
# dashes ends the report.
#
if ( $verbose > 0 ) {
    my @roots = sort(@original);

    display_nodes( \%filetree, 0, $_ ) for @roots;

    say '-' x 10;
}

#
# When requested, write the sorted names of the derived files that can be
# deleted, one per line. Names matching $skip_re are held back so that the
# audio and the PNG image created by IA code are not deleted.
#
if ($list_derived) {
    say for grep { $_ !~ $skip_re } sort(@derived);
}

exit;

#===  FUNCTION  ================================================================
#         NAME: display_nodes
#      PURPOSE: Walks a tree of original and derived files in the IA metadata
#   PARAMETERS: $tree           Hashref pointing to the file tree. Each value
#                               is a hashref which may hold a 'children'
#                               arrayref of further keys into the same hash
#               $level          The current level within the tree (0 for
#                               a top-level node)
#               $key            Hash key for the current node
#      RETURNS: Nothing
#  DESCRIPTION: Prints the current node as "<level>: <key>", indented by one
#               tab per level, then recurses into each of its children at the
#               next level down. A key with no node in the tree, or a node
#               with no 'children' array, is treated as a leaf: it is printed
#               and the walk goes no deeper. The tree is never modified.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub display_nodes {
    my ( $tree, $level, $key ) = @_;

    #
    # Report this node
    #
    printf "%s%d: %s\n", "\t" x $level, $level, $key;

    #
    # Fetch the node and its list of children step by step. A plain fetch
    # does not create missing hash entries, whereas dereferencing
    # $tree->{$key}->{children} directly would add an entry for an unknown
    # key and would die under 'strict refs' if there were no 'children' array
    #
    my $node     = $tree->{$key};
    my $children = ( ref($node) eq 'HASH' ? $node->{children} : undef );

    return unless ( ref($children) eq 'ARRAY' );

    #
    # If there are children we use each to recurse to a lower level
    #
    foreach my $child ( @{$children} ) {
        display_nodes( $tree, $level + 1, $child );
    }

    return;
}

#===  FUNCTION  ================================================================
#         NAME: coalesce
#      PURPOSE: To find the first defined argument and return it
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument, or undef if there are none (or if
#               no arguments were given at all)
#  DESCRIPTION: A simple way of substituting a fallback value where an
#               'undef' might be a problem: put the fallback last in the
#               argument list. An explicit 'undef' is returned when nothing
#               is defined, so a caller in list context always receives
#               exactly one value.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    my @candidates = grep { defined($_) } @_;

    return $candidates[0] if (@candidates);
    return undef;    ## no critic
}

#===  FUNCTION  ================================================================
#         NAME: _debug
#      PURPOSE: Prints debug reports
#   PARAMETERS: $active         Boolean: true for print, false for no print
#               $message        Message to print
#      RETURNS: Nothing
#  DESCRIPTION: Outputs a message if $active is true. It removes any trailing
#               newline and then adds one in the 'print' so the caller doesn't
#               have to bother. Prepends the message with 'D> ' to show it's
#               a debug message. The output goes to the currently selected
#               filehandle.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub _debug {
    my ( $active, $message ) = @_;

    chomp($message);

    if ($active) {
        print 'D> ' . $message . "\n";
    }
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Hands the option specifications to GetOptions, which stores
#               the values it finds in the hash behind $optref. If GetOptions
#               reports a failure the brief usage message is shown through
#               pod2usage with an exit value of 1. See the documentation for
#               details of the options themselves.
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    #
    # Option specifications in Getopt::Long notation
    #
    my @specs = qw{
        help        documentation|man   debug=i     dry-run!
        verbose+    list_derived!
    };

    #
    # Nothing more to do when the command line parsed cleanly
    #
    return if GetOptions( $optref, @specs );

    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -exitval => 1,
        -verbose => 0
    );

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

view_derivatives - a tool to analyse IA metadata

=head1 VERSION

This documentation refers to view_derivatives version 0.0.4

=head1 USAGE

    view_derivatives [-help] [-documentation|-man] [-debug=N] [-[no]dry-run]
        [-verbose [-verbose] ...] [-[no]list_derived] metadata_file

    # Parse the metadata and report the relationships between files
    view_derivatives -verb METADATA

    # Parse the metadata and write out a list of derived files for potential
    # deletion.
    view_derivatives -list_derived METADATA > FILE

=head1 REQUIRED ARGUMENTS

The name of a file created by the following command:

    ia metadata "show" > metadata_file

The file is expected to contain one JSON object (in a one-element array). If
it contains more objects only the first will be processed.

=head1 OPTIONS

=over 4

=item B<-help>

Prints a brief help message describing the usage of the program, and then exits.

=item B<-documentation> B<-man>

Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf. Use the command:

    pod2pdf view_derivatives --out=view_derivatives.pdf

=item B<-debug=N>

Selects a level of debugging. Debug information consists of a line or series
of lines prefixed with the characters 'D>':

=over 4

=item B<0>

No debug output is generated: this is the default

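=item B<3>

Prints the file tree built from the metadata, together with the lists of
original and derived files

=item B<4>

As level 3, and also prints the data structure parsed from the JSON metadata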

=back

(The debug levels need work!)

=item B<-[no]dry-run>

Enable/disable dry run mode (default off)

=item B<-verbose>

Sets the verbosity level. If the option is omitted then the level is zero (no
verbose output). Thereafter, for each occurrence of the option the verbosity
level is incremented. Only levels 1 and 2 are currently catered for. Any
levels above 2 produce the same result as level 2.

=item B<-[no]list_derived>

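This option is off by default. Turning it on causes the script to write the
names of the derived files to standard output, one per line. Derived audio and
PNG files named after the item itself are left out of the list so that they
are not deleted. If the verbosity level is zero this is the only output from
the script.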

The idea is that at verbosity level 1 or 2 information is displayed about the
relationship of files in the metadata, for human consumption. If
B<-nolist_derived> is the setting (or default) then this is all that is shown.

If the verbosity level is zero and B<-list_derived> is on then only the list
of derived files will be generated, and this can be used to delete the files
from the IA.

=back

=head1 DESCRIPTION

=head2 OVERVIEW

Items on the IA (Internet Archive, or I<archive.org>) consist of metadata and
files. Each item generated for HPR is a show or episode. Most files comprising
the episode on the IA are those which are part of the episode on the HPR
server. A few extra files are created by the IA software, but these are part
of the metadata (HTML details, upload date, etc.)

By default the IA software will create additional files which are derived from
the original files. Typical examples are other audio formats, such as Ogg or
Mp3. We have been disabling this derivation process for several years for
various reasons, preferring to generate our own derivatives. IA-generated
audio derivatives do not have ID3 and similar tags, whereas HPR-generated
audio formats do.

Historically it was difficult to disable the derivation process. Even though
there were settings to do this they apparently didn't work on all of the
servers making up the IA, and so older items may have many derived files.

This script assists with identifying unwanted derivatives and with their
deletion.

=head2 METADATA

The metadata for an item can be obtained (by a registered user) from the IA
using the B<ia> tool. Its format is JSON, and this script uses a JSON module
to parse it.

=head2 FILE RELATIONSHIPS

The JSON metadata contains details of all files comprising the IA item.
It contains details such as the name, size, and type of each file. It also
categorises files into groups such as I<original> and I<derived>. Files which
are derived have parents. The script uses this to build tree-like data
structures of derived files based on the original files. All children of an
original file will be derived, but some derived files may also have children.

The derivatives can be classified simply as children of original files or of
derived files. These are what are listed if required and what are used in the
deletion process.

=head2 DELETING UNWANTED DERIVATIVES

The simplest method is to pipe the output from the script with verbose level
zero and with B<-list_derived> enabled into B<xargs> in order to run a command
which will delete the unwanted derivatives.

One usage is:

    ./view_derivatives -list_derived metadata.json |\
        xargs ia delete hpr1234 --no-backup

This will generate a list of files to be deleted, then pipe them to B<xargs>
which will construct a command by appending the names to the command template.

This approach is not ideal since it does not handle the case where there is
nothing to delete. The script B<snapshot_metadata> manages this situation by
generating the metadata and saving it in a file, then it runs
B<view_derivatives> on this file and generates a file of derivatives. If this
file is not empty it can be used to perform the deletions, and otherwise no
attempt will be made.

=head1 DIAGNOSTICS

A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.


=head1 CONFIGURATION AND ENVIRONMENT

A full explanation of any configuration system(s) used by the application,
including the names and locations of any configuration files, and the
meaning of any environment variables or properties that can be set. These
descriptions must also include details of any configuration language used


=head1 DEPENDENCIES

A list of all the other modules that this module relies upon, including any
restrictions on versions, and an indication whether these required modules are
part of the standard Perl distribution, part of the module's distribution,
or must be installed separately.


=head1 INCOMPATIBILITIES

A list of any modules that this module cannot be used in conjunction with.
This may be due to name conflicts in the interface, or competition for
system or program resources, or due to internal limitations of Perl
(for example, many modules that use source code filters are mutually
incompatible).


=head1 BUGS AND LIMITATIONS

A list of known problems with the module, together with some indication
whether they are likely to be fixed in an upcoming release.

Also a list of restrictions on the features the module does provide:
data types that cannot be handled, performance issues and the circumstances
in which they may arise, practical limitations on the size of data sets,
special cases that are not (yet) handled, etc.

The initial template usually just has:

There are no known bugs in this module.
Please report problems to Dave Morriss (dave.morriss@gmail.com)
Patches are welcome.


=head1 AUTHOR

Dave Morriss (dave.morriss@gmail.com)


=head1 LICENCE AND COPYRIGHT

Copyright (c) 2024 Dave Morriss (dave.morriss@gmail.com). All rights reserved.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut

#}}}

# [zo to open fold, zc to close]

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker