| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | #!/usr/bin/env perl | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #         FILE: make_metadata | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #        USAGE: ./make_metadata -from=FROM [-to=TO] [-count=COUNT] | 
					
						
							|  |  |  | #               [-output=FILE] [-[no]fetch] [-[no]meta_only] [-[no]verbose] | 
					
						
							|  |  |  | #               [-[no]silent] [-[no]test] [-help] [-{documentation|man}] | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  DESCRIPTION: Make metadata for the uploading of a range of HPR episodes to | 
					
						
							|  |  |  | #               the Internet Archive by collecting fields from the database. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      OPTIONS: See the POD documentation for full details (./make_metadata | 
					
						
							|  |  |  | #               -man) or use 'pod2pdf' to generate a PDF version in the | 
					
						
							|  |  |  | #               directory holding this script. | 
					
						
							|  |  |  | # REQUIREMENTS: --- | 
					
						
							|  |  |  | #         BUGS: --- | 
					
						
							|  |  |  | #        NOTES: 2023-07-06: A version in development 0.4.14 has been put aside | 
					
						
							|  |  |  | #               and this version (0.4.12) made into the main line version | 
					
						
							|  |  |  | #               because 4.14 was developing in a direction that doesn't fit | 
					
						
							|  |  |  | #               with the changes made to the HPR system in June/July 2023. | 
					
						
							| 
									
										
										
										
											2024-07-16 21:39:28 +01:00
										 |  |  | #               Will now move forward with version numbers (and will get | 
					
						
							|  |  |  | #               a duplicate). | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | #               2024-01-23: Added the 'open' pragma for UTF-8 | 
					
						
							| 
									
										
										
										
											2024-07-16 21:39:28 +01:00
										 |  |  | #               2024-07-08: Fixed a bug where the top-level directory was | 
					
						
							|  |  |  | #               being added to assets paths. See the definition of $linkre for | 
					
						
#               more details.
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | #               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
					
						
							|  |  |  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | 
					
						
							|  |  |  | #      VERSION: 0.4.14 | 
					
						
							|  |  |  | #      CREATED: 2014-06-13 12:51:04 | 
					
						
							| 
									
										
										
										
											2024-07-16 21:39:28 +01:00
										 |  |  | #     REVISION: 2024-07-08 15:21:02 | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | # | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use 5.010; | 
					
						
							|  |  |  | use strict; | 
					
						
							|  |  |  | use warnings; | 
					
						
							| 
									
										
										
										
											2024-07-16 21:39:28 +01:00
										 |  |  | use open ':std', ':encoding(UTF-8)'; | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | use Carp; | 
					
						
							|  |  |  | use Getopt::Long; | 
					
						
							|  |  |  | use Pod::Usage; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use List::Util qw{max}; | 
					
						
							|  |  |  | use List::MoreUtils qw{uniq apply}; | 
					
						
							|  |  |  | use Config::General; | 
					
						
							|  |  |  | use Text::CSV_XS; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use IO::HTML; | 
					
						
							|  |  |  | use HTML::TreeBuilder 5 -weak; | 
					
						
							|  |  |  | use HTML::Entities; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use File::Find::Rule; | 
					
						
							|  |  |  | use File::Path qw{make_path}; | 
					
						
							|  |  |  | #use LWP::Simple; | 
					
						
							|  |  |  | use LWP::UserAgent; | 
					
						
							|  |  |  | use HTTP::Status qw(status_message); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use DBI; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Data::Dumper; | 
					
						
							|  |  |  | 
 | 
					
						
#
# Version number (manually incremented)
#
our $VERSION = '0.4.14';

#
# Script and directory names
#
# Strip everything up to the last '/' from $0 to leave the bare script name.
# $PROG is used below to build the default configuration file name.
#
( my $PROG = $0 ) =~ s|.*/||mx;
					
						
							|  |  |  | 
 | 
					
						
#-------------------------------------------------------------------------------
# Constants and defaults
#-------------------------------------------------------------------------------
#
# Default locations of the script and database configuration files. Both can
# be overridden on the command line (see the -config and -dbconfig options
# handled further down).
#
my $basedir      = "$ENV{HOME}/HPR/IA";
my $configfile   = "$basedir/.$PROG.cfg";
my $dbconfigfile = "$basedir/.hpr_db.cfg";
					
						
							|  |  |  | 
 | 
					
						
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants used in the metadata
#
my $mediatype   = 'audio';
my $collection  = 'hackerpublicradio';
my $language    = 'eng';
my $contributor = 'HackerPublicRadio';

#
# Variables, arrays and hashes
#
# $dbh, $sth1 and $h1 are the DBI database handle, statement handle and the
# fetched row (hashref) respectively. Most of the scalars below are filled
# from command-line options or the configuration files; the arrays/hashes
# are working storage for the main processing loop.
#
my ( $dbh, $sth1, $h1 );
my ( $DEBUG, $verbose, $silent, $test, $from, $to, $count );
my ( $list, $meta_only, $outfile, $scriptfile, $fetch, $assets);
my ( $acountfile, $ignore );
my ( @range, $tree, %links, %meta, @source, @transcripts, @counts, @head, @data );
my ( $ep_name, $ft, $file, $URL, $status, $incomplete );
my ( $filepath, $max_epno, $filetemplate, $sourceURLtemplate );
my ( $iauploadtemplate, $iauploadoptions );

#
# File types we'll look for when working out (and collecting) the main audio
# file and their order
#
my @filetypes = (qw{ wav mp3 ogg });

#
# Additional filetypes we'll look for if we're working round the lack of tags
# on the derived audio files. See the discussion at
# https://gitlab.anhonesthost.com/HPR/HPR_Public_Code/issues/34
#
my @additional_audio = (qw{ flac mp3 ogg opus spx });

#
# Templates for finding the transcripts.
#
# We expect these to be in $uploads/$episode/ where we expect to find:
#     $episode.json
#     $episode.srt
#     $episode.tsv
#     $episode.txt
#     $episode.vtt
# The regular expression below should find the transcripts given that the
# search is directed to the sub-directory in $uploads.
#
# NOTE: the pattern assumes four-digit episode names ('hprNNNN'); it will
# need revisiting if episode numbers ever reach five digits.
#
my $transcript_re = qr{hpr\d{4}\.(json|srt|tsv|txt|vtt)};

#
# Names of CSV fields and their order. Each name must have a matching key in
# the %dispatch table defined below (this is checked after the table).
#
my @fields = (
    qw{
        identifier file mediatype collection title creator language
        description contributor date subject licenseurl
        }
);

#
# Maximum episodes per run
#
my $max_episodes = 20;
					
						
							|  |  |  | 
 | 
					
						
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A dispatch table to define how to build metadata. The keys are actually
# header names to be used in the metadata file and must relate to the @fields
# array above. Two types of values are expected here: 1) anonymous subroutine
# references which return the value for the field or 2) scalars (or strings)
# to be placed as-is. Some of the anonymous subroutines are written as
# closures (see
# http://en.wikipedia.org/wiki/Closure_%28computer_programming%29), which make
# it easier to build the calling code since we can pass in a variety of
# arguments internally while offering a simple no-argument call externally.
#
# The closures capture file-level lexicals ($h1, $test, $meta_only, etc.)
# which are (re)populated for each episode by the main loop before the table
# is invoked.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my %dispatch = (
    identifier => sub { return make_item( $h1, $test ) },
    # In 'metadata only' mode there is no local audio file to reference
    file       => sub {
        return (
            $meta_only
            ? ''
            : make_filename( $filepath, $filetemplate, \@filetypes, $h1 )
        );
    },
    mediatype   => $mediatype,
    collection  => $collection,
    #title       => sub { return decode_entities( db_title($h1) ) },
    title       => sub { return db_title($h1) },
    creator     => sub { return $h1->{host} },
    language    => $language,
    description => sub { return db_notes( $sourceURLtemplate, $h1, $tree, \@source ) },
    contributor => $contributor,
    date       => sub { return $h1->{date} },
    subject    => sub { return db_tags($h1) },
    licenseurl => sub { return $h1->{license_url} }
);
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Validate the data structures above to ensure there have been no mis-edits | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | foreach (@fields) { | 
					
						
							|  |  |  |     die "Mis-match between \@fields and \%dispatch!\n" | 
					
						
							|  |  |  |         unless ( defined( $dispatch{$_} ) ); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
#
# The main query on the database. We want episode details, the host's name,
# the series name and the Creative Commons licence the episode is under.
# Note that the 'LEFT JOIN' for 'miniseries' is for the case where the
# 'series' column of 'eps' is 'NULL'. This is not the case for the live
# database - yet!
#
# The single '?' placeholder is bound to the episode number when the
# statement is executed (one row per episode).
#
my $sql = q{
    SELECT
        e.id AS eps_id,
        e.date,
        e.title,
        sec_to_time(e.duration) AS duration,
        e.summary,
        e.notes,
        e.explicit,
        e.license,
        l.url as license_url,
        e.tags,
        e.valid,
        h.host,
        s.name AS s_name,
        s.description AS s_desc
    FROM eps e
        JOIN hosts h ON e.hostid = h.hostid
        LEFT JOIN miniseries s ON e.series = s.id
        JOIN licenses l ON e.license = l.short_name
    WHERE e.id = ?
    };
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Enable Unicode mode | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | binmode STDOUT, ":encoding(UTF-8)"; | 
					
						
							|  |  |  | binmode STDERR, ":encoding(UTF-8)"; | 
					
						
							|  |  |  | 
 | 
					
						
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

#
# Parse the command line into %options. Options() is defined elsewhere in
# this script - presumably a wrapper around Getopt::Long (which is imported
# above); confirm against the subroutine definition lower down.
#
my %options;
Options( \%options );
					
						
							|  |  |  | 
 | 
					
						
#
# Default help shows minimal information (synopsis only) and exits
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience, then exits
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Collect options | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $DEBUG   = ( defined( $options{debug} )   ? $options{debug}   : $DEFDEBUG ); | 
					
						
							|  |  |  | $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 ); | 
					
						
							|  |  |  | $silent  = ( defined( $options{silent} )  ? $options{silent}  : 0 ); | 
					
						
							|  |  |  | $test    = ( defined( $options{test} )    ? $options{test}    : 0 ); | 
					
						
							|  |  |  | $from    = $options{from}; | 
					
						
							|  |  |  | $to      = $options{to}; | 
					
						
							|  |  |  | $count   = $options{count}; | 
					
						
							|  |  |  | $list    = $options{list}; | 
					
						
							|  |  |  | $meta_only = ( defined( $options{meta_only} ) ? $options{meta_only} : 0 ); | 
					
						
							|  |  |  | $fetch     = ( defined( $options{fetch} )     ? $options{fetch}     : 1 ); | 
					
						
							|  |  |  | $assets    = ( defined( $options{assets} )    ? $options{assets}    : 1 ); | 
					
						
							|  |  |  | $ignore | 
					
						
							|  |  |  |     = ( defined( $options{ignore_missing} ) ? $options{ignore_missing} : 0 ); | 
					
						
							|  |  |  | $outfile    = $options{output}; | 
					
						
							|  |  |  | $scriptfile = $options{script}; | 
					
						
							|  |  |  | $acountfile = $options{a_count}; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The two config files can be overridden or default to the original | 
					
						
							|  |  |  | # declarations | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $cfgfile | 
					
						
							|  |  |  |     = ( defined( $options{config} ) ? $options{config} : $configfile ); | 
					
						
							|  |  |  | my $dbcfgfile | 
					
						
							|  |  |  |     = ( defined( $options{dbconfig} ) ? $options{dbconfig} : $dbconfigfile ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Load and validate program configuration data | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | die "Configuration file $cfgfile not found\n" unless ( -e $cfgfile ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my $conf = Config::General->new( | 
					
						
							|  |  |  |     -ConfigFile      => $configfile, | 
					
						
							|  |  |  |     -InterPolateVars => 1, | 
					
						
							|  |  |  |     -ExtendedAccess  => 1, | 
					
						
							|  |  |  | ); | 
					
						
							|  |  |  | my %config = $conf->getall(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Check the config settings | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $filepath = $config{uploads}; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | die "Path $filepath not found\n" unless ( -e $filepath ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $max_epno          = $config{max_epno}; | 
					
						
							|  |  |  | $filetemplate      = $config{filetemplate}; | 
					
						
							|  |  |  | $sourceURLtemplate = $config{sourceURLtemplate}; | 
					
						
							|  |  |  | $iauploadtemplate  = $config{iauploadtemplate} . "\n"; | 
					
						
							|  |  |  | $iauploadoptions   = $config{iauploadoptions}; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | die "Configuration data missing\n" | 
					
						
							|  |  |  |     unless $max_epno | 
					
						
							|  |  |  |     && $filetemplate | 
					
						
							|  |  |  |     && $sourceURLtemplate | 
					
						
							|  |  |  |     && $iauploadtemplate | 
					
						
							|  |  |  |     && $iauploadoptions; | 
					
						
							|  |  |  | 
 | 
					
						
#-------------------------------------------------------------------------------
# Load database configuration data
#-------------------------------------------------------------------------------
# $dbcfgfile honours the -dbconfig=FILE command-line override
die "Configuration file $dbcfgfile not found\n"  unless ( -e $dbcfgfile );

my $dbconf = Config::General->new(
    -ConfigFile      => $dbcfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %dbcfg = $dbconf->getall();
					
						
							|  |  |  | 
 | 
					
						
#-------------------------------------------------------------------------------
# Validate options
#-------------------------------------------------------------------------------
# Exactly one of -from or -list must be given (an exclusive-or): neither, or
# both, is a usage error
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( ( !defined($from) and !defined($list) )
    or ( defined($from) and defined($list) ) );
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Deal with the values specified by one of: | 
					
						
							|  |  |  | #     -from=N1 -to=N2 | 
					
						
							|  |  |  | #     -from=N3 -count=N4 | 
					
						
							|  |  |  | #     -list="N1,N2,N3" | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # End by populating @range with a list of episode numbers | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ( defined($list) ) { | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # We have a list which we'll parse, validate, sort, make unique and filter | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     my $lcsv = Text::CSV_XS->new( { binary => 1, } ); | 
					
						
							|  |  |  |     if ( $lcsv->parse($list) ) { | 
					
						
							|  |  |  |         @range = uniq( sort { $a <=> $b } $lcsv->fields() ); | 
					
						
							|  |  |  |         @range = grep {/\d+/} @range; | 
					
						
							|  |  |  |         @range = grep { $_ > 0 && $_ <= $max_epno } @range; | 
					
						
							|  |  |  |         @range = apply { $_ =~ s/(^\s*|\s*$)// } @range; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         die "Invalid list; no elements\n" if scalar(@range) == 0; | 
					
						
							|  |  |  |         die "Invalid list; too many elements\n" | 
					
						
							|  |  |  |             if scalar(@range) > $max_episodes; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         die "Failed to parse -list='$list'\n" . $lcsv->error_diag() . "\n"; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | else { | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # We have -from=N, -to=M or -count=X | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     die "Invalid starting episode number ($from)\n" unless $from > 0; | 
					
						
							|  |  |  |     die "Do not combine -to and -count\n" | 
					
						
							|  |  |  |         if ( defined($to) && defined($count) ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( defined($count) ) { | 
					
						
							|  |  |  |         $to = $from + $count - 1; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     elsif ( !defined($to) ) { | 
					
						
							|  |  |  |         $to = $from; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     die "Invalid range; $from is greater than $to\n" unless $from <= $to; | 
					
						
							|  |  |  |     die "Invalid range; range is too big (>$max_episodes): $to - $from = " | 
					
						
							|  |  |  |         . ( $to - $from ) . "\n" if ( $to - $from ) > $max_episodes; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @range = ( $from .. $to ); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
#
# Check we got a sensible range; an empty @range at this point means the
# selection options were unusable, so show the usage message and exit
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    unless scalar(@range) > 0;

#
# You can't be silent and verbose at the same time; -verbose wins
#
$silent = 0 if $verbose;
					
						
							|  |  |  | 
 | 
					
						
#
# Fetching audio files is not relevant in 'metadata only' mode, so force
# -nofetch behaviour when -meta_only was given
# TODO: -nofetch with -meta_only incorrectly results in the download of files;
# see the Journal in regard to show 3004 in April 2020.
# TODO: 2023-07-06 Time to drop '-[no]fetch' completely?
#
$fetch = 0 if $meta_only;
					
						
							|  |  |  | 
 | 
					
						
#
# If there's a defined output filename then there are three options: a null
# string, a plain filename and a substitution string with '%d' sequences. The
# null string means the user used '-output' without a value, so we want to
# generate a substitution string. A string with '%d' requires a check to
# ensure there's the right number, one for $ubound and one for $lbound. The
# plain filename needs no more work.
#
# If no defined output filename we'll provide a default.
#
# NOTE(review): expand_template() is defined elsewhere in this script (not
# visible in this section); it appears to take (template, stem, extension,
# lbound, ubound) - confirm against its definition.
#
$outfile = expand_template( $outfile, 'metadata', 'csv',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );

#
# Check the -script option. Like -output the choices are: a null string,
# a plain filename or a substitution string with '%d' sequences.
#
$scriptfile = expand_template( $scriptfile, 'script', 'sh',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # In verbose mode report the options | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | if ($verbose) { | 
					
						
							|  |  |  |     my $fmt = "%18s: %s\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     printf $fmt, 'From',  $from  if defined( $options{from} ); | 
					
						
							|  |  |  |     printf $fmt, 'To',    $to    if defined( $options{to} ); | 
					
						
							|  |  |  |     printf $fmt, 'Count', $count if defined( $options{count} ); | 
					
						
							|  |  |  |     printf $fmt, 'List',  $list  if defined( $options{list} ); | 
					
						
							|  |  |  |     printf $fmt, 'Output',        $outfile; | 
					
						
							|  |  |  |     printf $fmt, 'Output script', $scriptfile; | 
					
						
							|  |  |  |     printf $fmt, 'Output asset count', ( $acountfile ? $acountfile : '--' ); | 
					
						
							|  |  |  |     printf $fmt, 'Meta only', ( $meta_only ? 'Yes' : 'No' ); | 
					
						
							|  |  |  |     printf $fmt, 'Fetch',     ( $fetch     ? 'Yes' : 'No' ); | 
					
						
							|  |  |  |     printf $fmt, 'Silent',    ( $silent    ? 'Yes' : 'No' ); | 
					
						
							|  |  |  |     printf $fmt, 'Verbose',   ( $verbose   ? 'Yes' : 'No' ); | 
					
						
							|  |  |  |     printf $fmt, 'Debug',           $DEBUG; | 
					
						
							|  |  |  |     printf $fmt, 'Script config',   $cfgfile; | 
					
						
							|  |  |  |     printf $fmt, 'Database config', $dbcfgfile; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     print '-' x 80, "\n"; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Set up output and script file | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | open( my $out, '>:encoding(UTF-8)', $outfile ) | 
					
						
							|  |  |  |     or die "Unable to open $outfile for output: $!\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Open the script file and add a header. Set execute permission on the | 
					
						
							|  |  |  | # file for the owner, and nobody else. We keep a count of lines (other than | 
					
						
							|  |  |  | # the header stuff) written to this file. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $script_lines = 0; | 
					
						
							|  |  |  | open( my $script, '>:encoding(UTF-8)', $scriptfile ) | 
					
						
							|  |  |  |     or die "Unable to open $scriptfile for output: $!\n"; | 
					
						
							|  |  |  | print $script "#!/usr/bin/env bash\n\n"; | 
					
						
							|  |  |  | print $script <<'EndHead'; | 
					
						
							|  |  |  | Upload () { | 
					
						
							|  |  |  |     local id=${1} | 
					
						
							|  |  |  |     local file=${2} | 
					
						
							|  |  |  |     local remote=${3:-} | 
					
						
							|  |  |  |     local options=${4:-} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if [[ -e $file ]]; then | 
					
						
							|  |  |  |         if [[ -z $remote ]]; then | 
					
						
							|  |  |  |             ia upload ${id} ${file} ${options} | 
					
						
							|  |  |  |         else | 
					
						
							|  |  |  |             ia upload ${id} ${file} --remote-name=${remote} ${options} | 
					
						
							|  |  |  |         fi | 
					
						
							|  |  |  |     else | 
					
						
							|  |  |  |         echo "File missing: $file" | 
					
						
							|  |  |  |     fi | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | EndHead | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | chmod(0744,$scriptfile); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Make changes for test mode | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # NOTE: Commented out 2023-07-06 since this is dangerous | 
					
						
							|  |  |  | #if ($test) { | 
					
						
							|  |  |  | #    $dispatch{collection}  = 'test_collection'; | 
					
						
							|  |  |  | #    $dispatch{contributor} = 'perlist'; | 
					
						
							|  |  |  | #} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Prepare to turn the data into CSV data. Since we're allowing embedded | 
					
						
							|  |  |  | # newlines (binary mode) add an explicit end of line. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # TODO: The new tool 'ia' does not accept newlines in the CSV, so no point in | 
					
						
							|  |  |  | # defining them here. They have been removed elsewhere in the code. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $csv = Text::CSV_XS->new( { binary => 1, eol => "\r\n" } ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Copy audio files from the website if requested and if needed. We look for | 
					
						
							|  |  |  | # the WAV file in the 'uploads' area but if it's not found we get the MP3 | 
					
						
							|  |  |  | # (which is all we have for older shows). | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # TODO: Given we're redirecting requests for audio files on the HPR site to | 
					
						
							|  |  |  | # archive.org there's potential for a nasty paradox here if the rewrite rules | 
					
						
							|  |  |  | # aren't perfect. We do the MP3 thing only for old shows, so we need to make | 
					
						
							|  |  |  | # sure that it works when we need it. | 
					
						
							|  |  |  | # TODO: Are there any logic flaws here? This was designed before we started | 
					
						
							|  |  |  | # creating all the audio formats we have now (partly to avoid the 'derive' | 
					
						
							|  |  |  | # process on archive.org from making copies without tags). | 
					
						
							|  |  |  | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
					
						
							|  |  |  | # NOTE: Commented out 2023-07-06 because (a) this situation should not occur | 
					
						
							|  |  |  | # again now that all audio versions are on the IA and we get everything for | 
					
						
							|  |  |  | # new shows, (b) the URLs used for downloads are obsolete. | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | #if ( $fetch && !$meta_only ) { | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #    foreach my $episode (@range) { | 
					
						
							|  |  |  | #        $ft = 0; | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #        # | 
					
						
							|  |  |  | #        # Does the WAV version already exist? | 
					
						
							|  |  |  | #        # | 
					
						
							|  |  |  | #        $file = sprintf( $filetemplate, $episode, $filetypes[$ft] ); | 
					
						
							|  |  |  | #        if ( ! -e "$filepath/$file" ) { | 
					
						
							|  |  |  | #            # | 
					
						
							|  |  |  | #            # Doesn't exist, try the alternative version | 
					
						
							|  |  |  | #            # | 
					
						
							|  |  |  | #            $ft++; | 
					
						
							|  |  |  | #            $file = sprintf( $filetemplate, $episode, $filetypes[$ft] ); | 
					
						
							|  |  |  | #            if ( !-e "$filepath/$file" ) { | 
					
						
							|  |  |  | #                # | 
					
						
							|  |  |  | #                # We need to download the file | 
					
						
							|  |  |  | #                # | 
					
						
							|  |  |  | #                $URL = sprintf( $URLtemplate, $file ); | 
					
						
							|  |  |  | #                print STDERR "Downloading $URL\n" unless $silent; | 
					
						
							|  |  |  | #                $status = getstore( $URL, "$filepath/$file" ); | 
					
						
							|  |  |  | #                $status == 200 or warn "$status : $URL ($file)\n"; | 
					
						
							|  |  |  | #            } | 
					
						
							|  |  |  | #        } | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #        die "Unable to find or download $file\n" | 
					
						
							|  |  |  | #            unless ( -e "$filepath/$file" ); | 
					
						
							|  |  |  | #    } | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Connect to the database | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | my $dbhost = $dbcfg{database}->{host} // '127.0.0.1'; | 
					
						
							|  |  |  | my $dbport = $dbcfg{database}->{port} // 3306; | 
					
						
							|  |  |  | my $dbname = $dbcfg{database}->{name}; | 
					
						
							|  |  |  | my $dbuser = $dbcfg{database}->{user}; | 
					
						
							|  |  |  | my $dbpwd  = $dbcfg{database}->{password}; | 
					
						
							|  |  |  | $dbh = DBI->connect( "dbi:mysql:database=$dbname;host=$dbhost;port=$dbport", | 
					
						
							|  |  |  |     $dbuser, $dbpwd, { AutoCommit => 1 } ) | 
					
						
							|  |  |  |     or croak $DBI::errstr; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Enable client-side UTF8 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $dbh->{mysql_enable_utf8} = 1; | 
					
						
							|  |  |  | $dbh->{mysql_enable_utf8mb4} = 1; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Collect the necessary information for constructing the metadata for each of | 
					
						
							|  |  |  | # the selected episodes. | 
					
						
							|  |  |  | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
					
						
							|  |  |  | # Each key of the %meta hash will contain an arrayref with an element per | 
					
						
							|  |  |  | # episode in the @range array. Some of these elements will be objects | 
					
						
							|  |  |  | # (arrayrefs or hashrefs) containing multi-part items such as 'subject' | 
					
						
							|  |  |  | # strings from the 'tags' field in the database. The %meta keys correspond to | 
					
						
							|  |  |  | # the strings of the @fields array which correspond to IA metadata fields and | 
					
						
							|  |  |  | # are also used by the %dispatch hash to contain actions to perform to make | 
					
						
							|  |  |  | # the contents. A few of the %meta keys correspond to other items relating to | 
					
						
							|  |  |  | # the episode such as supplementary audio and supplementary files like | 
					
						
							|  |  |  | # pictures. | 
					
						
							|  |  |  | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
					
						
							|  |  |  | # NOTE: See the dump of the %meta hash in the Journal that goes with this | 
					
						
							|  |  |  | # project (Example 1). It should give a better representation of this | 
					
						
							|  |  |  | # structure. | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | $sth1 = $dbh->prepare($sql); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $incomplete = 0; | 
					
						
							|  |  |  | foreach my $episode (@range) { | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Get the episode from the database | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $sth1->execute($episode); | 
					
						
							|  |  |  |     if ( $dbh->err ) { | 
					
						
							|  |  |  |         croak $dbh->errstr; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     $h1 = $sth1->fetchrow_hashref; | 
					
						
							|  |  |  |     unless ($h1) { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Somehow this episode doesn't exist in the database. No idea why, but | 
					
						
							|  |  |  |         # skip this one and keep going anyway. | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         carp "Failed to find requested episode number $episode\n"; | 
					
						
							|  |  |  |         next; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     print STDERR "D> ", $h1->{title}, "\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Make the episode name with leading zeroes | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $ep_name = sprintf( 'hpr%04d', $episode ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Check whether this episode has a summary and tags. Count up the number | 
					
						
							|  |  |  |     # that are missing either of these. | 
					
						
							|  |  |  |     # TODO: Time to remove this check? | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if ( length( $h1->{summary} ) == 0 || length( $h1->{tags} ) == 0 ) { | 
					
						
							|  |  |  |         print STDERR "Episode $episode is missing summary and/or tags\n" | 
					
						
							|  |  |  |             unless ($silent); | 
					
						
							|  |  |  |         $incomplete++; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Handle supplementary audio | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     unless ( defined( $meta{audio} ) ) { | 
					
						
							|  |  |  |         $meta{audio} = []; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my @audio; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Look for other audio files for upload. | 
					
						
							|  |  |  |     # TODO: Consider catering for hpr\d{4}(\.\d)?\.(flac|mp3|ogg|opus|spx) | 
					
						
							|  |  |  |     # e.g. hpr2781.1.ogg which could happen if the audio has had to be | 
					
						
							|  |  |  |     # re-released. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     foreach my $ext ( @additional_audio ) { | 
					
						
							|  |  |  |         my $audio | 
					
						
							|  |  |  |             = sprintf( "%s/hpr%04d.%s", $config{uploads}, $episode, $ext ); | 
					
						
							|  |  |  |         if ( -e $audio ) { | 
					
						
							|  |  |  |             push( @audio, $audio ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Look for the source file if provided. It will be called | 
					
						
							|  |  |  |     # 'hpr9999_source.{wav,flac,mp3,ogg,opus}' (though it's doubtful if people | 
					
						
							|  |  |  |     # are sending in opus). For safety we'll accept any extension. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     @source = File::Find::Rule->file() | 
					
						
							|  |  |  |         ->name("hpr${episode}_source.*") | 
					
						
							|  |  |  |         ->in($filepath); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Generate the upload command(s) for the source file(s). We upload to the | 
					
						
							|  |  |  |     # main directory for the item and request that no 'derive' process be run | 
					
						
							|  |  |  |     # on the file(s). If we're just generating metadata it's not really | 
					
						
							|  |  |  |     # necessary to do this. Rather than having a different format here we now | 
					
						
							|  |  |  |     # use a generic format from the configuration file but pass in a null | 
					
						
							|  |  |  |     # remote filename. We also add default options, also from the | 
					
						
							|  |  |  |     # configuration file in all cases so we don't derive anything and don't | 
					
						
							|  |  |  |     # generate an archive of old stuff. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # TODO: What if the source file(s) had been omitted in the first pass, and | 
					
						
							|  |  |  |     # we need to upload them now? Could just do it by hand of course. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     unless ($meta_only) { | 
					
						
							|  |  |  |         for my $sou (@source) { | 
					
						
							|  |  |  |             printf $script $iauploadtemplate, | 
					
						
							|  |  |  |                 'hpr' . $episode, | 
					
						
							|  |  |  |                 $sou, | 
					
						
							|  |  |  |                 '', | 
					
						
							|  |  |  |                 $iauploadoptions; | 
					
						
							|  |  |  |             $script_lines++; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Turn all source audio files into URLs pointing to the IA item | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     for (my $i = 0; $i <= $#source; $i++) { | 
					
						
							|  |  |  |         ( my $bn = $source[$i] ) =~ s|.*/||mx; | 
					
						
							|  |  |  |         $source[$i] = sprintf( $config{IAURLtemplate}, "hpr$episode", $bn ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     #push( @audio, @source ) if (@source); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Save whatever was found, whether an arrayref of audio or undef | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     push( @{ $meta{audio} }, \@audio ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Most shows have transcripts since Q2 2023. They aren't referred to in | 
					
						
							|  |  |  |     # the show notes or database (not at present anyway). They get added to | 
					
						
							|  |  |  |     # the static page on the HPR site with a footer template on every page. | 
					
						
							|  |  |  |     # The files themselves arrive in a sub-directory of $uploads (usually | 
					
						
							|  |  |  |     # `/data/IA/uploads/hpr1234/`) as part of the show components. For now | 
					
						
							|  |  |  |     # we'll search for them and upload them using the Bash script we're | 
					
						
							|  |  |  |     # building which is run after the initial 'ia upload'. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     unless ($meta_only) { | 
					
						
							|  |  |  |         @transcripts = File::Find::Rule->file() | 
					
						
							|  |  |  |             ->name($transcript_re) | 
					
						
							|  |  |  |             ->in("$filepath/$ep_name"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Write 'Upload' function calls to the script for all of the transcripts | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         for my $ts (@transcripts) { | 
					
						
							|  |  |  |             ( my $bn = $ts ) =~ s|.*/||mx; | 
					
						
							|  |  |  |             printf $script $iauploadtemplate, $ep_name,    # identifier | 
					
						
							|  |  |  |                 $ts,                                       # local file path | 
					
						
							|  |  |  |                 "$ep_name/$bn",                            # Remote file path | 
					
						
							|  |  |  |                 $iauploadoptions; | 
					
						
							|  |  |  |             $script_lines++; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Initialise the hash element for holding links to files in the notes per | 
					
						
							|  |  |  |     # show | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     unless ( defined( $meta{links} ) ) { | 
					
						
							|  |  |  |         $meta{links} = []; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Scan the database-stored notes looking for links to files on the HPR | 
					
						
							|  |  |  |     # server. For any HTML files found in that pass, download them and perform | 
					
						
							|  |  |  |     # another scan. If further HTML files are found download and scan them and | 
					
						
							|  |  |  |     # so on recursively. Any non-HTML files are left for downloading later. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     #<<<                                 (Stop perltidy reformatting oddly) | 
					
						
							|  |  |  |     if (find_links_in_notes($episode, $h1->{notes}, \$tree, \%links, \%config, | 
					
						
							|  |  |  |          $verbose, $silent) > 0) { | 
					
						
							|  |  |  |     #>>> | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Save the links we found. Do this by copying the %links hash into the | 
					
						
							|  |  |  |         # %meta hash | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         push( @{ $meta{links} }, {%links} ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         push( @{ $meta{links} }, undef ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Build the metadata hash using the dispatch table. For each field we will | 
					
						
							|  |  |  |     # have an array of data. This may be a scalar for single-valued items or | 
					
						
							|  |  |  |     # an array for multi-valued ones. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     foreach my $fld (@fields) { | 
					
						
							|  |  |  |         unless ( defined( $meta{$fld} ) ) { | 
					
						
							|  |  |  |             $meta{$fld} = []; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # If it's a code reference then call the code and save the result | 
					
						
							|  |  |  |         # otherwise save the data | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         if ( ref( $dispatch{$fld} ) eq 'CODE' ) { | 
					
						
							|  |  |  |             push( @{ $meta{$fld} }, $dispatch{$fld}->() ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         else { | 
					
						
							|  |  |  |             push( @{ $meta{$fld} }, $dispatch{$fld} ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | _debug( $DEBUG >= 3, Dumper(\%meta) ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | die "Nothing to do\n" unless %meta; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # If we're not ignoring missed summaries and tags and we found them then abort | 
					
						
							|  |  |  | # with a message. | 
					
						
							|  |  |  | # TODO: 2023-07-06 Not needed any more? | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ( $incomplete > 0 ) { | 
					
						
							|  |  |  |     unless ($ignore) { | 
					
						
							|  |  |  |         die "Aborted due to missing summaries and/or tags\n"; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         say "Missing summaries and/or tags - ignored" unless ($silent); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # ~~ Explanation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
					
						
							|  |  |  | # Considering '%meta' as a "sideways spreadsheet" where the keys are row | 
					
						
							|  |  |  | # headings and the elements within the arrayrefs held as the hash values, the | 
					
						
							|  |  |  | # "cells" (array elements) may be arrayrefs with multiple elements (i.e. the | 
					
						
							|  |  |  | # spreadsheet is 3-dimensional!). We need to know the maximum number of array | 
					
						
							|  |  |  | # elements per cell because we have to expand the resulting CSV "spreadsheet" | 
					
						
							|  |  |  | # we'll generate for the 'ia upload' command by providing headers like | 
					
						
							|  |  |  | # "subject[1]" and "subject[2]". Hard to explain without a diagram! | 
					
						
							|  |  |  | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Compute the maximum number of sub-fields when a field can be multi-valued. | 
					
						
							|  |  |  | # This is because all CSV rows must have the same number of fields; the | 
					
						
							|  |  |  | # maximum of course. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | @counts = map { | 
					
						
							|  |  |  |     max( map { ref($_) eq 'ARRAY' ? scalar(@$_) : 1 } @{ $meta{$_} } ) | 
					
						
							|  |  |  | } @fields; | 
					
						
							|  |  |  | 
 | 
					
						
#-------------------------------------------------------------------------------
# Report on the collected data if requested.
# See 'Example 1' in the Journal for this project for what the '%meta' hash
# looks like at this stage.
#-------------------------------------------------------------------------------
if ($verbose) {
    foreach my $i ( 0 .. $#range ) {
        my $ind = 0;
        foreach my $fld (@fields) {
            #
            # Print "fieldname[max_subfields]: value" for each IA metadata
            # field. Array-valued cells are joined with commas; undefined
            # cells print as an empty string.
            #
            printf "%20s[%d]: %s\n", $fld, $counts[$ind],
                defined( $meta{$fld}->[$i] )
                ? (
                ref( $meta{$fld}->[$i] ) eq 'ARRAY'
                ? join( ",", @{ $meta{$fld}->[$i] } )
                : $meta{$fld}->[$i]
                )
                : '';
            $ind++;
        }

        #
        # Each show will (most likely) have an array of audio types in an
        # array of arrays. Report how many supplementary audio files were
        # found for this show.
        #
        if ( defined( $meta{audio}->[$i] ) ) {
            printf "%23s: %s\n", 'added audio',
                scalar( @{ $meta{audio}->[$i] } );
        }
        else {
            printf "%23s: 0\n", 'added audio';
        }

        #
        # Each show may have links. They will be stored as an array of hashes
        # where each hash has sub-hashes, one per "asset". Some will just be
        # 'undef' meaning that they have no assets. Report the asset count.
        #
        if ( defined( $meta{links}->[$i] ) ) {
            printf "%23s: %s\n", 'links',
                scalar( keys( %{ $meta{links}->[$i] } ) );
        }
        else {
            printf "%23s: 0\n", 'links';
        }

        # Separator between per-episode reports
        print '-' x 80, "\n";
    }
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Output asset counts if requested | 
					
						
							|  |  |  | # TODO: Used in 'past_upload', which is no longer used (since that project is | 
					
						
							|  |  |  | # complete). | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | if ($acountfile) { | 
					
						
							|  |  |  |     my $acount_lines = 0; | 
					
						
							|  |  |  |     open( my $acount, '>:encoding(UTF-8)', $acountfile ) | 
					
						
							|  |  |  |         or die "Unable to open $acountfile for output: $!\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     foreach my $i ( 0 .. $#range ) { | 
					
						
							|  |  |  |         if ( defined( $meta{links}->[$i] ) ) { | 
					
						
							|  |  |  |             printf $acount "%s %d\n", $meta{identifier}->[$i], | 
					
						
							|  |  |  |                 scalar( keys( %{ $meta{links}->[$i] } ) ); | 
					
						
							|  |  |  |             $acount_lines++; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     close($acount); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # If we never wrote anything to the asset count file then delete it | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if ( $acount_lines eq 0 ) { | 
					
						
							|  |  |  |         print "Deleting empty '$acountfile'\n" if $verbose; | 
					
						
							|  |  |  |         unlink($acountfile); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Download any linked files unless the -noassets option was used. | 
					
						
							|  |  |  | # Report the details in verbose mode. Only download if the file is not already | 
					
						
							|  |  |  | # downloaded. | 
					
						
							|  |  |  | # NOTE: HTML downloads will have already happened since we need to recursively | 
					
						
							|  |  |  | # parse and modify them. So if an asset is HTML on the HPR site an earlier | 
					
						
							|  |  |  | # stage will have downloaded it so that it can be parsed for any further HTML | 
					
						
							|  |  |  | # or assets. | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | if ($assets) { | 
					
						
							|  |  |  |     foreach my $i ( 0 .. $#range ) { | 
					
						
							|  |  |  |         if ( defined( $meta{links}->[$i] ) ) { | 
					
						
							|  |  |  |             for my $key ( keys( %{ $meta{links}->[$i] } ) ) { | 
					
						
							|  |  |  |                 my $linkfile = $meta{links}->[$i]->{$key}->{cached}; | 
					
						
							|  |  |  |                 if ( ! -e $linkfile ) { | 
					
						
							|  |  |  |                     download_url( $key, $linkfile, | 
					
						
							|  |  |  |                         $meta{links}->[$i]->{$key}->{new}, | 
					
						
							|  |  |  |                         $verbose, $silent ); | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Generate the completed CSV | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # First build and output the header fields. If there's a single header then just | 
					
						
							|  |  |  | # output it, but if it's a multi-item thing output the maximum with indexes | 
					
						
							|  |  |  | # (like "subject[0]"). | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | for my $ind ( 0 .. $#fields ) { | 
					
						
							|  |  |  |     if ( $counts[$ind] == 1 ) { | 
					
						
							|  |  |  |         push( @head, $fields[$ind] ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         for ( my $i = 0; $i < $counts[$ind]; $i++ ) { | 
					
						
							|  |  |  |             push( @head, $fields[$ind] . "[$i]" ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $csv->print( $out, \@head ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Build and output the data. The structure we have is a hash where each | 
					
						
							|  |  |  | # element is an array with an element per episode (0 .. $range). The | 
					
						
							|  |  |  | # per-episode elements may be scalars (for single-value fields) or arrays (for | 
					
						
							|  |  |  | # multi-value). In the latter case we have to make sure we have filled all of | 
					
						
							|  |  |  | # the header positions (e.g. we have 10 "subject" headers, but this episode | 
					
						
							|  |  |  | # has only 1 - we have to fill in the remaining 9 slots with empty items). | 
					
						
							|  |  |  | # | 
					
						
#
# Main per-episode output loop. For each episode in the range: build and
# write its CSV data row, write extra CSV rows for additional audio files,
# and append 'ia upload' calls for linked assets to the Bash script.
#
foreach my $i ( 0 .. $#range ) {
    @data = ();
    my $ind = 0;
    foreach my $fld (@fields) {
        # $count tracks how many values this field contributed so we can
        # pad the row out to the header width for this field below.
        my $count = 0;
        if ( ref( $meta{$fld}->[$i] ) eq 'ARRAY' ) {
            # Multi-value field: one CSV cell per element
            foreach my $elem ( @{ $meta{$fld}->[$i] } ) {
                push( @data, $elem );
                $count++;
            }
        }
        else {
            # Single-value field (scalar, possibly undef)
            push( @data, $meta{$fld}->[$i] );
            $count++;
        }

        #
        # Cater for any blank slots (this episode has fewer values than the
        # widest episode for this field, so fill with undef/empty cells)
        #
        if ( $count < $counts[$ind] ) {
            push( @data, undef ) for 1 .. ( $counts[$ind] - $count );
        }

        $ind++;
    }

    #print Dumper( \@data ), "\n";
    $csv->print( $out, \@data );

    #
    # If there are additional audio files or supplementary files for the
    # current show write a CSV row for each of them here.
    # NOTE: originally the CSV format did not require the second and
    # subsequent row belonging to an IA identifier to have an identifier in it
    # (maybe it was even illegal in the early days). Now the ia tool seems to
    # have made it mandatory - surprisingly.
    #
    if ( defined( $meta{audio}->[$i] ) ) {
        for my $audio (@{$meta{audio}->[$i]}) {
            #
            # Each row is a file path which we'll add to the CSV. We use the
            # length of the @header array to get the number of CSV fields
            # (identifier + file path, then pad the rest with empty cells)
            #
            @data = ();
#           push( @data, undef, $audio );
            push( @data, $meta{identifier}->[$i], $audio );
            push( @data, undef ) for 1 .. ( scalar(@head) - 2 );
            $csv->print( $out, \@data );
        }
    }

    #
    # The 'links' key holds an array of hashes containing details of the
    # 'assets' relating to a show (files other than audio files, but including
    # the original "source" audio). In make_metadata V0.3.x we used these
    # hashes to add rows to the CSV file which results in the upload of these
    # files.  However, this mechanism does not cater for (a) the uploading of
    # directories to the IA, and (b) the ability to disable the "derive"
    # mechanism for a given file. Therefore we build a Bash script containing
    # a function that calls 'ia' commands which can be used instead, but only
    # if $meta_only is not set.
    # TODO: 2023-07-06 We want to upload the transcripts created by 'whisper'
    # and placed in the 'uploads' directory. We will add these to the 'script'
    # file.
    #
    if ( ! $meta_only && defined( $meta{links}->[$i] ) ) {
        for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
            #
            # Build a call to the 'Upload' function being explicit about the
            # remote name (the cached path with the uploads directory prefix
            # stripped off)
            #
            ( my $rem = $meta{links}->[$i]->{$key}->{cached} )
                =~ s|$config{uploads}/||;
            printf $script $iauploadtemplate,
                $meta{identifier}->[$i],
                $meta{links}->[$i]->{$key}->{cached},
                $rem,
                $iauploadoptions;
            # Count script lines so an empty script can be deleted at the end
            $script_lines++;
        }
    }
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Report the output and script file names | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | unless ($silent) { | 
					
						
							|  |  |  |     print "Output file: $outfile\n"; | 
					
						
							|  |  |  |     print "Script file: $scriptfile\n" unless ( $script_lines eq 0 ); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | close($out); | 
					
						
							|  |  |  | close($script); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # If we never wrote anything to the script file then delete it | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ($script_lines eq 0) { | 
					
						
							|  |  |  |     print "Deleting empty '$scriptfile'\n" if $verbose; | 
					
						
							|  |  |  |     unlink($scriptfile); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | exit; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: make_item | 
					
						
							|  |  |  | #      PURPOSE: Build an Archive.org item name for this item | 
					
						
							|  |  |  | #   PARAMETERS: $h              - hashref returned from the database | 
					
						
							|  |  |  | #               $testmode       - Boolean denoting test mode | 
					
						
							|  |  |  | #      RETURNS: The newly fashioned item string | 
					
						
							|  |  |  | #  DESCRIPTION: The method for generating an unique item string for | 
					
						
							|  |  |  | #               Archive.org is embodied in this function | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub make_item { | 
					
						
							|  |  |  |     my ( $h, $testmode ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return sprintf( "%shpr%04d", ( $testmode ? 'test_' : '' ), $h->{eps_id} ); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: make_filename | 
					
						
							|  |  |  | #      PURPOSE: Determine the filename path for the metadata | 
					
						
							|  |  |  | #   PARAMETERS: $path           - relative path to the files | 
					
						
							|  |  |  | #               $template       - template for forming the file name | 
					
						
							|  |  |  | #               $ftypes         - arrayref holding ordered file types | 
					
						
							|  |  |  | #               $h              - hashref returned from the database | 
					
						
							|  |  |  | #      RETURNS: The path to the file | 
					
						
							|  |  |  | #  DESCRIPTION: Forms the file path then checks that there really is a file of | 
					
						
							|  |  |  | #               that name. Since we don't have *.wav files for every episode | 
					
						
							|  |  |  | #               we might have one of the other formats. The array referenced | 
					
						
							|  |  |  | #               by $ftypes contains a list of these file types, starting with | 
					
						
							|  |  |  | #               the WAV default. At the moment we only look at the second | 
					
						
							|  |  |  | #               choice if there is no WAV, but we could be more fancy if | 
					
						
							|  |  |  | #               necessary. | 
					
						
							|  |  |  | #               TODO The design of this code is not good. The earlier fetch | 
					
						
							|  |  |  | #               stage will have determined what is available. We are | 
					
						
							|  |  |  | #               potentially repeating these checks here, though they are | 
					
						
							|  |  |  | #               necessary if -nofetch was specified. Needs some further | 
					
						
							|  |  |  | #               thought. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub make_filename { | 
					
						
							|  |  |  |     my ( $path, $template, $ftypes, $h ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my $file; | 
					
						
							|  |  |  |     my $ft = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Look for the WAV file first. If found, return it | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $file = sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] ); | 
					
						
							|  |  |  |     if ( -e $file ) { | 
					
						
							|  |  |  |         return $file; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Look for the alternative type of file and return that if found | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         $ft++; | 
					
						
							|  |  |  |         $file | 
					
						
							|  |  |  |             = sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] ); | 
					
						
							|  |  |  |         if ( -e $file ) { | 
					
						
							|  |  |  |             return $file; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         else { | 
					
						
							|  |  |  |             warn "No file found for hpr" . $h->{eps_id} . "\n"; | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: db_title | 
					
						
							|  |  |  | #      PURPOSE: Return a title for the Archive.org item | 
					
						
							|  |  |  | #   PARAMETERS: $h      - hashref returned from the database | 
					
						
							|  |  |  | #      RETURNS: The title | 
					
						
							|  |  |  | #  DESCRIPTION: Makes the title string we want for the item | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub db_title { | 
					
						
							|  |  |  |     my ($h) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # print STDERR "D> ", $h->{title}, "\n"; | 
					
						
							|  |  |  |     return sprintf( "hpr%04d :: %s", $h->{eps_id}, $h->{title} ); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: db_notes | 
					
						
							|  |  |  | #      PURPOSE: Return the description for an episode | 
					
						
							|  |  |  | #   PARAMETERS: $sourceURLtpl   - printf template for making the HPR link to | 
					
						
							|  |  |  | #                                 the original show (from config file) | 
					
						
							|  |  |  | #               $h              - hashref returned from the database | 
					
						
							|  |  |  | #               $tree           - parsed notes | 
					
						
							|  |  |  | #               $asource        - arrayref containing source audio file URLs | 
					
						
							|  |  |  | #      RETURNS: The description built from the database for Archive.org | 
					
						
							|  |  |  | #  DESCRIPTION: This routine generates the item description for Archive.org as | 
					
						
							|  |  |  | #               HTML by concatenating the summary, series name (if there is | 
					
						
							|  |  |  | #               one), duration, link to episode on the HPR site and the notes | 
					
						
							|  |  |  | #               into a long string. The main work is in the formatting of the | 
					
						
							|  |  |  | #               notes element which is stored as HTML in the database. Since | 
					
						
							|  |  |  | #               we are making a single string from the multi-line data we have | 
					
						
							|  |  |  | #               to take special action to preserve the contents of <pre> tags. | 
					
						
							|  |  |  | #               See the comments in 'flatten_pre' for what is being done to | 
					
						
							|  |  |  | #               achieve this. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
#
# Build the Archive.org item description (HTML, single line) from the
# database row: summary, optional series name, source link, optional original
# audio links, then the flattened show notes.
#
sub db_notes {
    my ( $sourceURLtpl, $h, $tree, $asource ) = @_;

    my ( $lines, $episode, $desc, $sourceurl, $notes );

    #
    # Ensure leading zeroes on the episode number and prepend 'hpr'
    #
    $episode = sprintf('hpr%04d', $h->{eps_id});

#    $sourceurl = sprintf( "http://hackerpublicradio.org/eps.php?id=%04d",
#        $h->{eps_id} );
    # Link back to the show on the HPR site, using the configured template
    $sourceurl = sprintf( $sourceURLtpl, $episode );

    #
    # Make the header with newlines so we can count them
    # TODO: Should we use the series number 0 which denotes "no series" rather
    # than the name "general"? Need to adjust the query to return this if so.
    # FIXME: The "not series" 'general' is being removed and the logic will
    # use NULL for an episode that is not in a series. The local copy of the
    # database has had this change made, but not the live one. We have to
    # cater for both. When it goes we can drop references to 'general'.
    #
    $desc = sprintf( "Summary: %s\n", $h->{summary} );
    unless ( !defined( $h->{s_name} ) or $h->{s_name} eq 'general' ) {
        $desc .= sprintf( "Series: %s\n", coalesce( $h->{s_name}, '' ) );
    }
    #$desc .= sprintf( "Duration: %s\n", $h->{duration} );
    $desc .= sprintf( "Source: <a href=\"%s\">%s</a>\n", $sourceurl, $sourceurl );
    #
    # Add a pointer to the source audio if there is any (cater for multiple,
    # but it's not likely we'll get more than one)
    #
    if (@{$asource}) {
        foreach my $aurl (@{$asource}) {
            $desc .= sprintf( "Original audio: <a href=\"%s\">%s</a>\n",
                $aurl, $aurl );
        }
    }

    #
    # Count the lines so we can add <br/> tags at the end to make the header
    # a standard size of 4 lines high to accomodate the media widget.
    # TODO: Rationalise this since there is no media widget any more.
    # Temporarily boosted the height to 6 lines now the Original audio has
    # been added.
    # NOTE(review): if the header ever exceeds 6 lines the 'x' repeat count
    # goes negative, which simply yields an empty string - no padding added.
    #
    $lines = $desc =~ tr/\n//;
    $desc =~ s#\n#<br />#g;
    $desc .= ( "<br />" x ( 6 - $lines ) );

    #
    # Prepare the notes and add to the header. Actions are:
    # 1. Make all <pre> nodes compatible by adding <br/>
    # 2. Generate entities for all non-ASCII
    # 3. Do the silly thing of double-encoding entities to counteract the IA
    #    bug
    # 4. Remove all newlines ('ia' doesn't seem to know what to do with them)
    # 5. Trim trailing spaces
    #
    # ----
    # NOTE: Removed the double-encoding on 2017-10-07 since (unannounced) it
    # looks as if the IA have fixed this bug. There is some debate over
    # whether we should be encoding non-ASCII also.
    # ----
    # NOTE: 2017-11-25 restored the double-encoding since the bug bit again at
    # the last upload. This is not the whole story though. Added a step to
    # decode the notes before re-encoding the non-ASCII bits. This is to
    # counteract what seems to be being done by Pandoc if we process the notes
    # from plain text.
    # ----
    # NOTE: 2021-02-20 found a problem with the strategy of decoding
    # everything and then encoding it again (selectively). The issue was seen
    # in HPR show 3284 where the HTML notes contains a <pre> section with C++
    # code in it which uses characters like '<' and '>'. These were encoded as
    # HTML entities at shownote preparation time since they were flagged as
    # bad HTML by my validator script. The original algorithm then decoded
    # these characters but couldn't encode them with encode_entities because
    # that would have encoded all the HTML tags. The decode/encode stuff is
    # therefore HTML context dependent, and we don't have the means of
    # handling this with the current script. So, the solution seems to be not
    # to decode and encode at all. After running flatten_pre, simply run the
    # re_encode_entities and trust that there's nothing in the notes that need
    # special action here. Anyway, the preparation stage should have handled
    # whatever's necessary.
    # ----
    #
    #$notes = flatten_pre( $h->{notes} );
    $notes = flatten_pre($tree);
    ## $notes = decode_entities($notes);
    ## $notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
    ## $notes = re_encode_entities($notes);
    # Strip remaining newlines so the whole description is one line
    $notes =~ s/\n//g;

    $desc .= $notes;
    $desc =~ s/\s*$//;

    return $desc;
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: db_tags | 
					
						
							|  |  |  | #      PURPOSE: Return the episode search tags as an array for populating | 
					
						
							|  |  |  | #               'subject' fields on Archive.org | 
					
						
							|  |  |  | #   PARAMETERS: $h      - hashref returned from the database | 
					
						
							|  |  |  | #      RETURNS: A reference to an array of tags or undef if there are none | 
					
						
							|  |  |  | #  DESCRIPTION: In the database the tags are held as a comma-delimited string. | 
					
						
							|  |  |  | #               We need to turn this into an array to populate 'subject' | 
					
						
							|  |  |  | #               fields in the CSV file. If there are no tags the caller needs | 
					
						
							|  |  |  | #               an undef to be returned to indicate that a blank field needs | 
					
						
							|  |  |  | #               to be constructed in the CSV. The format of the database | 
					
						
							|  |  |  | #               string may be messy since we rely on the show's host to submit | 
					
						
							|  |  |  | #               this information. We split the string taking this into | 
					
						
							|  |  |  | #               consideration. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub db_tags { | 
					
						
							|  |  |  |     my ($h) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return $h->{tags} ? [ split( /\s*,\s*/, $h->{tags} ) ] : undef; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: flatten_pre | 
					
						
							|  |  |  | #      PURPOSE: Process notes "flattening" <pre> contents | 
					
						
							|  |  |  | #   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and | 
					
						
							|  |  |  | #                       partially processed notes | 
					
						
							|  |  |  | #      RETURNS: Processed notes | 
					
						
							|  |  |  | #  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also | 
					
						
							|  |  |  | #               contain some formatting tags like <em> and <code>, but spaces | 
					
						
							|  |  |  | #               and newlines are significant. The Internet Archive upload API | 
					
						
							|  |  |  | #               uses HTTP headers which are text strings without newlines, so | 
					
						
							|  |  |  | #               when these tags are uploaded through this route some | 
					
						
							|  |  |  | #               formatting is lost. What this routine does is parse the | 
					
						
							|  |  |  | #               contents of all <pre> sections in $notes, adding <br/> tags | 
					
						
							|  |  |  | #               to replace newlines. It has to perform a full parse | 
					
						
							|  |  |  | #               since the contents may include HTML tags and these need to be | 
					
						
							|  |  |  | #               passed through intact. It calls the subroutine 'flatten_item' to | 
					
						
							|  |  |  | #               deal with the recursive nature of HTML tags. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
#
# "Flatten" every <pre> section in the parsed notes tree by rebuilding its
# contents with <br/> tags in place of newlines (via flatten_item), then
# return the body of the tree as a single HTML string.
#
sub flatten_pre {
    my ($tree) = @_;

    #
    # Find all the <pre> tags
    #
    my @pre_tags = $tree->look_down( _tag => 'pre', );

    #
    # Walk the various <pre> elements in the document
    #
    foreach my $tag (@pre_tags) {
        #
        # Save the tag and empty the original (we rebuild it in place below)
        #
        my $saved = $tag->clone();
        $tag->delete_content();

        #
        # Walk the saved content and rebuild the tag into $atag using the
        # nested arrayref structure permitted by HTML::Element for
        # convenience (the alternative is a little nasty). See the
        # documentation for 'new_from_lol' in HTML::Element.
        #
        my $atag;
        foreach my $item ( @{ $saved->content_array_ref } ) {
            push( @$atag, flatten_item($item) );
        }

        #
        # Rebuild the tag from the arrayref we built. We treat the arrayref
        # structure we just built as an array because otherwise the top level
        # is interpreted as a spurious <null> tag.
        #
        $tag->push_content(@$atag);
    }

    #
    # Trim out the original notes from the enclosing tags we added earlier:
    # serialise the <body> element and strip the <body>...</body> wrapper
    #
    my $body = $tree->look_down( _tag => 'body' );
    ( my $result = $body->as_HTML( undef, ' ', {} ) )
        =~ s{(^<body[^>]*>|</body>$)}{}gi;

    return $result;

}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: flatten_item | 
					
						
							|  |  |  | #      PURPOSE: Recursively "flatten" items within the enclosing <pre> | 
					
						
							|  |  |  | #   PARAMETERS: $item   an HTML::Element item parsed from the original | 
					
						
							|  |  |  | #                       <pre> section | 
					
						
							|  |  |  | #      RETURNS: An arrayref if the last seen item was a tag, otherwise a list | 
					
						
							|  |  |  | #  DESCRIPTION: Since <pre> sections can contain inline elements which change | 
					
						
							|  |  |  | #               the rendering of the text we need to parse these as we add | 
					
						
							|  |  |  | #               <br/> tags. This routine does this by recursively descending | 
					
						
							|  |  |  | #               through the contents. A common tag sequence is <pre><code> for | 
					
						
							|  |  |  | #               scripts and the like. This routine deals with such sequences. | 
					
						
							|  |  |  | #               It expects to receive the contents in sequence and builds the | 
					
						
							|  |  |  | #               result as a nested arrayref structure. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub flatten_item { | 
					
						
							|  |  |  |     my ($item) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return unless defined($item); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my ( @result, %attr ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Is it a sub-tag or non-tag content? | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if ( ref($item) ) { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # It's a tag. Save the tag name and any attributes and recurse into | 
					
						
							|  |  |  |         # it. Return an arrayref | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         push( @result, $item->tag() ); | 
					
						
							|  |  |  |         %attr = $item->all_external_attr(); | 
					
						
							|  |  |  |         push( @result, \%attr ) if %attr; | 
					
						
							|  |  |  |         for my $child ( $item->content_list() ) { | 
					
						
							|  |  |  |             push( @result, flatten_item($child) ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         return \@result; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # It's non-tag content. Join the lines with <br/> tags.  Return an | 
					
						
							|  |  |  |         # array (since this is a simple list). | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Note that we split with a LIMIT of -1 which causes any trailing list | 
					
						
							|  |  |  |         # items to be returned; default behaviour is to drop them. | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         $item =~ s/\r//g; | 
					
						
							|  |  |  |         my @content = split( /\n/, $item, -1 ); | 
					
						
							|  |  |  |         if (@content) { | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Remove a leading blank line - usually the result of | 
					
						
							|  |  |  |             # a "<pre>'NL'text" sequence | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             shift(@content) if ( $content[0] =~ /^\s*$/ ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Join back the lines with <br/> tags between them. | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             foreach my $txt (@content) { | 
					
						
							|  |  |  |                 push( @result, $txt, ['br'] ); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Remove the <br/> at the end, it's spurious | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             pop(@result); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return (@result); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: re_encode_entities | 
					
						
							|  |  |  | #      PURPOSE: Find all encoded entities and encode them all over again | 
					
						
							|  |  |  | #   PARAMETERS: $notes  - string containing unprocessed notes | 
					
						
							|  |  |  | #      RETURNS: Processed notes | 
					
						
							|  |  |  | #  DESCRIPTION: Uses a brutal regular expression substitution approach, but | 
					
						
							|  |  |  | #               since this is a very unusual requirement brought about by what | 
					
						
							|  |  |  | #               is essentially a bug in the way the Internet Archive stores | 
					
						
							|  |  |  | #               and processes metadata, we have no choice. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: It looks as if the software on archive.org saves HTML metadata | 
					
						
							|  |  |  | #               in an XML file by encoding it as entities. So HTML tags are | 
					
						
							|  |  |  | #               turned into <TAG> sequences. Then when HTML is required | 
					
						
							|  |  |  | #               the XML is decoded back to <TAG>. Unfortunately any existing | 
					
						
							|  |  |  | #               entities in the HTML like '<' will also be decoded in this | 
					
						
							|  |  |  | #               phase which may result in invalid HTML sequences. In this | 
					
						
							|  |  |  | #               routine we are converting existing entities so that the | 
					
						
							|  |  |  | #               decoding phase turns them into valid entities. The archive.org | 
					
						
							|  |  |  | #               software should be doing this, but it isn't, and any messages | 
					
						
							|  |  |  | #               sent to the staff there are ignored. Of course, there may be | 
					
						
							|  |  |  | #               a point at which this bug is corrected and the double encoding | 
					
						
							|  |  |  | #               process here becomes redundant. Then the call to this routine | 
					
						
							|  |  |  | #               can be omitted. | 
					
						
							|  |  |  | #               Note also that the IA editor corrupts HTML containing entities | 
					
						
							|  |  |  | #               so should not be used. To make a change, edit the notes on the | 
					
						
							|  |  |  | #               HPR database and generate new metadata with this script. Use | 
					
						
							|  |  |  | #               the -meta_only option to avoid the need for the media (unless | 
					
						
							|  |  |  | #               it too is being updated of course) and re-submit the CSV file | 
					
						
							|  |  |  | #               to ias3upload.pl or ia. | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub re_encode_entities { | 
					
						
							|  |  |  |     my ($notes) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Replace any '&xxx;' sequence by '&xxx;' | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $notes =~ s#\&([^;]+;)#&$1#g; | 
					
						
							|  |  |  |     return $notes; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
#===  FUNCTION  ================================================================
#         NAME: find_links_in_notes
#      PURPOSE: Finds HPR links in show notes
#   PARAMETERS: $episode        episode number we're dealing with
#               $notes          a string containing the shownotes
#               $rtree          reference to a scalar which receives the
#                               HTML::TreeBuilder object built from the notes
#               $rlinks         hashref receiving the links found (emptied on
#                               entry, see TODO below)
#               $rconfig        hashref containing config data (uses keys
#                               'baseURL', 'uploads', 'IAURLtemplate')
#               $verbose        setting controlling what reports are generated
#               $silent         setting controlling what reports are generated
#      RETURNS: Count of the number of links
#  DESCRIPTION: Parses the notes passed as an argument then scans these notes
#               looking for links which relate to items on the HPR server.
#               Normally these are files and images but could be anchor
#               references as well. If the latter then the URL has to be
#               parsed to remove the anchor. Unique links are saved in a hash
#               as the path to the file that will be saved on the VPS and as
#               the link to the item in the new notes. This hash will be
#               passed back to the caller so it can get the file(s) and
#               prepare them for upload to the IA. Finally, the link in the
#               notes is modified to refer to the file that will be uploaded
#               to the IA. The function returns the number of links it has
#               recorded (not the number it has changed), as well as passing
#               back the parsed tree and the link hash it has constructed.
#       THROWS: No exceptions (but dies if the notes fail to parse)
#     COMMENTS: None
#     SEE ALSO: find_links_in_file (recursive companion for linked HTML files)
#===============================================================================
sub find_links_in_notes {
    my ( $episode, $notes, $rtree, $rlinks, $rconfig, $verbose, $silent )
        = @_;

    my ( $linkre,  $epstr,    $uri,     $slink );
    my ( $oldfile, $newfile, $newURL );

    _debug( $DEBUG >= 3, "Entered find_links_in_notes\n" );

    #
    # Initialise links. NOTE: this clears the caller's hash, so any links
    # accumulated from a previous call are discarded.
    #
    undef %$rlinks; # TODO: Consider whether we should do this?

    #
    # Create a tree object that can be shared downstream (written through the
    # scalar ref $rtree so the caller keeps access to the modified tree).
    # Parser options: keep unknown tags, don't expand entities, strict <p>
    # closing, keep comments, warn on oddities.
    #
    $$rtree = HTML::TreeBuilder->new;
    $$rtree->ignore_unknown(0);
    $$rtree->no_expand_entities(1);
    $$rtree->p_strict(1);
    $$rtree->store_comments(1);
    $$rtree->warn(1);

    #
    # Parse the notes. Die if we fail because then we know this show needs
    # some urgent attention.
    #
    $$rtree->parse_content($notes)
        or die "HTML::TreeBuilder failed to parse notes: $!\n";

    #
    # NOTE: No longer filtering out links not relating to the episode.
    # Experience with 'upload_manager' has shown that there are some weird
    # links in the HPR database which will cause problems.
    #
    # Regular expression to match links to the HPR server. Examples:
    #   http://hackerpublicradio.org/eps/hpr2153.png
    #   http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
    # Also things like this (**Why Ken?**)
    #   ../eps/hpr2945/IMG_20191018_122746Z.jpg
    # Don't match things like this when *not* processing 1986:
    #   http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
    # ----------------------------------------------------------------------
    # NOTE: 2024-07-08
    #
    # It used to be that we added a top-level hprXXXX directory to URLs
    # because there wasn't one on the HPR server. This was because the
    # majority of shows without assets had no files; the notes were taken from
    # the database and displayed dynamically.
    #
    # Now all HPR shows have a top-level directory for holding the index.html
    # with the pre-created notes page. So we DO NOT want to create that
    # top-level part. The RE below matches but doesn't store it or we'd get
    # one too many directory levels.
    # ----------------------------------------------------------------------
    #
    # NOTE(review): the dots in '(?:www.)?' and 'publicradio.org' are
    # unescaped, so they match any character — harmless in practice but
    # worth confirming if the RE is ever tightened.
    #
    $epstr = sprintf( "hpr%04d", $episode );
#   my $re
#       = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www.)?
        (?:hacker|hobby)publicradio.org/eps/
        $epstr/
        (.+)$
    }x;

    #
    # Scan the HTML tree for links we care about ('a' href and 'img' src)
    #
    for ( @{ $$rtree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Standardise the link (expands relative URLs, removes any fragment).
        # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
        # URIs are removed. ('local' keeps the override scoped to this loop
        # body.)
        #
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink\n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match ($+ is the last capture group
            # that matched), without any 'fragment' if there is one (we want
            # this not to be URL-related)
            # NOTE: Will we ever have a fragment here?
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it (it belongs to
            # a different episode).
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink    .= 'index.html';
                $oldfile .= 'index.html';
            }

            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL  = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink}           = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new}    = $newURL;

            }

            #
            # Simply change the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well (re-attach it to the rewritten URL).
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
        }
    }

    #
    # If we found any links the notes will have been changed, but since the
    # tree is shared with the caller this will result in changes to the
    # metadata. However, we want to look at any HTML files in those links
    #
    if ( scalar( keys(%$rlinks) ) > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents. This updates $rlinks
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( ! -e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose,  $silent
                );
            }
        }
    }

    #
    # Return the link count after all recursion through supplementary files
    # and so on
    #
    return scalar( keys(%$rlinks) );

}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: find_links_in_file | 
					
						
							|  |  |  | #      PURPOSE: Finds HPR links in related files | 
					
						
							|  |  |  | #   PARAMETERS: $episode        episode number we're dealing with | 
					
						
							|  |  |  | #               $filename       file being examined | 
					
						
							|  |  |  | #               $rlinks         hashref containing the links found | 
					
						
							|  |  |  | #               $rconfig        hashref containing config data | 
					
						
							|  |  |  | #               $verbose        setting controlling what reports are generated | 
					
						
							|  |  |  | #               $silent         setting controlling what reports are generated | 
					
						
							|  |  |  | #      RETURNS: The number of new links found and downloaded | 
					
						
							|  |  |  | #  DESCRIPTION: The routine is related to 'find_links_in_notes' but doesn't work | 
					
						
							|  |  |  | #               quite the same way. It is given the name of an HTML which has | 
					
						
							|  |  |  | #               already been downloaded. It parses this with HTML::TreeBuilder | 
					
						
							|  |  |  | #               because it neds to be scanned and possibly edited. The HTML | 
					
						
							|  |  |  | #               tree is scanned for <a> and <img> tags. If the URL returned is | 
					
						
							|  |  |  | #               an HPR URL then we will want to change it to an IA one so the | 
					
						
							|  |  |  | #               show can reference its components on archive.org and be | 
					
						
							|  |  |  | #               independent of HPR. If a link is found which hasn't been seen | 
					
						
							|  |  |  | #               before it is saved in the $rlinks hash keyed by the original | 
					
						
							|  |  |  | #               URL. The link URL is replaced by the IA version in the HTML | 
					
						
							|  |  |  | #               tree. Once the scan is complete if URL edits have been counted | 
					
						
							|  |  |  | #               the file is rewritten from the modified tree. Then if any new | 
					
						
							|  |  |  | #               links have been found these are checked, and if any are HTML | 
					
						
							|  |  |  | #               they are downloaded ready to be parsed. Parsing is done by | 
					
						
							|  |  |  | #               recursively calling this routine. Other links, which are not | 
					
						
							|  |  |  | #               HTML exist in the $rlinks hash and are downloaded later in the | 
					
						
							|  |  |  | #               script since they do not need to be edited. | 
					
						
							|  |  |  | #               NOTE: As of 2017-12-10 this code has been used in a live | 
					
						
							|  |  |  | #               situation but the recursive capabilities have not been | 
					
						
							|  |  |  | #               invoked. This is because we have shows with the usual short | 
					
						
							|  |  |  | #               notes which reference longer notes, and sometimes the latter | 
					
						
							|  |  |  | #               have links to other files. However, so far none of these cases | 
					
						
							|  |  |  | #               have referenced further HTML files needing scanning and | 
					
						
							|  |  |  | #               editing. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
sub find_links_in_file {
    my ( $episode, $filename, $rlinks, $rconfig, $verbose, $silent ) = @_;

    my ( $linkre, $epstr, $tree, $uri, $slink );
    my ( $oldfile, $newfile, $newURL );
    # NOTE(review): $encoding is declared but never used in this sub
    my ( $encoding, $linkcount, $linkedits );

    _debug( $DEBUG >= 3, "Entered find_links_in_file\n" );

    #
    # Create a tree object. The settings make the parse as faithful as
    # possible: keep unknown tags and comments, don't expand entities, and
    # close <p> elements strictly.
    #
    $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->no_expand_entities(1);
    $tree->p_strict(1);
    $tree->store_comments(1);
    $tree->warn(1);

    #
    # Parse the file using IO::HTML to grab it. Die if we fail because then we
    # know this stuff needs some urgent attention.
    #
    $tree->parse_file( html_file($filename) )
        or die "HTML::TreeBuilder failed to process $filename: $!\n";

    #
    # NOTE: No longer filtering out links not relating to the episode.
    # Experience with 'upload_manager' has shown that there are some weird
    # links in the HPR database which will cause problems.
    #
    # Regular expression to match links to the HPR server. Examples:
    #   http://hackerpublicradio.org/eps/hpr2153.png
    #   http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
    # Also things like this (**Why Ken?**)
    #   ../eps/hpr2945/IMG_20191018_122746Z.jpg
    # Don't match things like this when *not* processing 1986:
    #   http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
    #
    # NOTE(review): the dots in '(?:www.)?' and 'publicradio.org' are
    # unescaped, so they match any character; harmless in practice but
    # slightly looser than intended.
    #
    $epstr = sprintf( "hpr%04d", $episode );
#   my $re
#       = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www.)?
        (?:hacker|hobby)publicradio.org/eps/
        (.+)$
    }x;

    #
    # Counting new links found and stashed as well as edits made
    #
    $linkcount = 0;
    $linkedits = 0;

    #
    # Scan the HTML tree for links we care about: 'href' on <a> and 'src' on
    # <img>. Each element returned is [ link, element, attribute, tag ].
    #
    for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Standardise the link (expands relative URLs, removes any fragment)
        # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
        # URIs are removed.
        #
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink, \n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match, without any 'fragment' if there
            # is one (we want this not to be URL-related).
            # NOTE(review): '$+' is the highest-numbered capture that
            # matched; with the single-capture $linkre above this is the
            # '/eps/' tail, but the idiom is fragile if the regex changes.
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it (it belongs to
            # a different episode).
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink    .= 'index.html';
                $oldfile .= 'index.html';
            }

            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL  = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink}           = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new}    = $newURL;

                $linkcount++;

            }

            #
            # Simply change the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well (re-append it after the rewritten URL).
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
            $linkedits++;
        }
    }

    #
    # If we found HPR links then we'll have changed them in the tree, so we
    # need to update the file. We use 'as_HTML' with no entities, indentation
    # by one space and no optional end tags (see HTML::Element). If the file
    # gets examined after the edits there should be no eligible HPR links, so
    # no action.
    #
    if ( $linkedits > 0 ) {
        open( my $out, ">:encoding(UTF-8)", $filename )
            or die "Unable to open $filename for writing: $!\n";
        print $out $tree->as_HTML( undef, ' ', {} );
        close($out);
        print STDERR "Altered links in $filename\n" unless $silent;
    }

    #
    # If we found new links then $linkcount will be non-zero and we need to
    # download and parse them, otherwise nothing else to do.
    #
    if ( $linkcount > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents.
        # NOTE(review): the recursion shares $rlinks, so already-seen links
        # are not re-added; termination relies on that de-duplication.
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( !-e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose,  $silent
                );
            }
        }
    }

    #
    # Return the count of new links found (not the number of edits made)
    #
    return $linkcount;

}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: download_url | 
					
						
							|  |  |  | #      PURPOSE: Download a file from an URL | 
					
						
							|  |  |  | #   PARAMETERS: $from           the URL the file is to be collected from | 
					
						
							|  |  |  | #               $to             the path to which the file is to be written | 
					
						
							|  |  |  | #               $upload         the eventual IA URL (for information) | 
					
						
							|  |  |  | #               $verbose        setting controlling what reports are generated | 
					
						
							|  |  |  | #               $silent         setting controlling what reports are generated | 
					
						
							|  |  |  | #      RETURNS: The status value from the download | 
					
						
							|  |  |  | #  DESCRIPTION: If 'verbose' is true the details of the download are reported. | 
					
						
							|  |  |  | #               We are to download from the URL specified as $from, and the | 
					
						
							|  |  |  | #               destination is the path in $to which is in a temporary cache | 
					
						
							|  |  |  | #               area. This may result in the original file being renamed. The | 
					
						
							|  |  |  | #               value in $upload shows the URL the file will be available at | 
					
						
							|  |  |  | #               on the IA once the metadata has been used for uploads. A brief | 
					
						
							|  |  |  | #               message is written to STDERR by default then the HTTP download | 
					
						
							|  |  |  | #               is initiated with a warning generated if the download is not | 
					
						
							|  |  |  | #               successful. | 
					
						
							|  |  |  | #               Moved from LWP::Simple to LWP::UserAgent to get more control | 
					
						
							|  |  |  | #               over the download. The former failed when an HTTPS URL was | 
					
						
							|  |  |  | #               used and the server didn't offer this type of connection, | 
					
						
							|  |  |  | #               whereas the latter does not. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub download_url { | 
					
						
							|  |  |  |     my ( $from, $to, $upload, $verbose, $silent ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my ( $dirname, $status ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ($verbose) { | 
					
						
							|  |  |  |         print "Link to be downloaded: $from\n"; | 
					
						
							|  |  |  |         print "  to ",              $to,     "\n"; | 
					
						
							|  |  |  |         print "  and uploaded as ", $upload, "\n"; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Extract the directory from the path. If the directory doesn't exist then | 
					
						
							|  |  |  |     # make it. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     ( $dirname = $to ) =~ s|/?[^/]*$||mx; | 
					
						
							|  |  |  |     make_path($dirname) unless -d $dirname; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Collect the file and save it | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     print STDERR "Downloading $from\n" unless $silent; | 
					
						
							|  |  |  |     $status = getstore( $from, $to, $verbose ); | 
					
						
							|  |  |  |     $status == 200 or warn "Download failed: $status : $from ($to)\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     print '~' x 80, "\n" if $verbose; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return $status; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: getstore | 
					
						
							|  |  |  | #      PURPOSE: Get an URL and store the result in a file | 
					
						
							|  |  |  | #   PARAMETERS: $from           the URL to download | 
					
						
							|  |  |  | #               $to             where to put the result | 
					
						
							|  |  |  | #               $verbose        setting controlling what reports are generated | 
					
						
							|  |  |  | #      RETURNS: The status code from the 'get' | 
					
						
							|  |  |  | #  DESCRIPTION: When using LWP::Simple an attempt to fetch an URL with an | 
					
						
							|  |  |  | #               'https' method failed since the server doesn't offer this | 
					
						
							|  |  |  | #               service. Using LWP::UserAgent this is not a problem, so we are | 
					
						
							|  |  |  | #               effectively emulating LWP::Simple in a more complex way! | 
					
						
							|  |  |  | #               However, we now have a means of taking action when the | 
					
						
							|  |  |  | #               download fails in some predictable way. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub getstore { | 
					
						
							|  |  |  |     my ( $from, $to, $verbose ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Create the agent, and identify it (a bit) | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     my $ua = LWP::UserAgent->new; | 
					
						
							|  |  |  |     $ua->agent("make_metadata/$VERSION "); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Get the URL and store it to the requested file | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     my $res = $ua->get( $from, ':content_file' => $to ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Report what happened | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     print $res->status_line, "\n" if $verbose; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Return status | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     return $res->code; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: expand_template | 
					
						
							|  |  |  | #      PURPOSE: Turns a filename template into a filename or a default | 
					
						
							|  |  |  | #   PARAMETERS: $option         The input from the -output or -script option | 
					
						
							|  |  |  | #               $prefix         Filename prefix | 
					
						
							|  |  |  | #               $suffix         Filename suffix | 
					
						
							|  |  |  | #               $lbound         Lower bound of the episode range | 
					
						
							|  |  |  | #               $ubound         Upper bound of the episode range | 
					
						
							|  |  |  | #      RETURNS: The expanded template or the default | 
					
						
							|  |  |  | #  DESCRIPTION: The -output or -script options both take an optional template, | 
					
						
							|  |  |  | #               so we need to expand it with episode numbers. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub expand_template { | 
					
						
							|  |  |  |     my ( $option, $prefix, $suffix, $lbound, $ubound ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( defined($option) ) { | 
					
						
							|  |  |  |         if ( $option =~ /^$/ ) { | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Build a default template depending on the episode range | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             if ( $lbound == $ubound ) { | 
					
						
							|  |  |  |                 $option = $prefix . '_%04d.' . $suffix; | 
					
						
							|  |  |  |                 $option = sprintf( $option, $lbound ); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             else { | 
					
						
							|  |  |  |                 $option = $prefix . '_%04d-%04d.' . $suffix; | 
					
						
							|  |  |  |                 $option = sprintf( $option, $lbound, $ubound ); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         elsif ( $option =~ /%(\d*)d/ ) { | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Caller specified a template. We need to check it | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             my $count = () = $option =~ /%(\d*)d/g; | 
					
						
							|  |  |  |             die "Invalid - too many '%d' sequences in '$option'\n" | 
					
						
							|  |  |  |                 if ( $count > ( $lbound == $ubound ? 1 : 2 ) ); | 
					
						
							|  |  |  |             die "Invalid - too few '%d' sequences in '$option'\n" | 
					
						
							|  |  |  |                 if ( $count < ( $lbound == $ubound ? 1 : 2 ) ); | 
					
						
							|  |  |  |             $option =~ s/%(\d*)d/%04d/g; | 
					
						
							|  |  |  |             $option = sprintf( $option, $lbound, $ubound ); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # The default | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         $option = "$prefix.$suffix" | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return $option; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: coalesce | 
					
						
							|  |  |  | #      PURPOSE: To find the first defined argument and return it | 
					
						
							|  |  |  | #   PARAMETERS: Arbitrary number of arguments | 
					
						
							|  |  |  | #      RETURNS: The first defined argument or undef if there are none | 
					
						
							|  |  |  | #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never | 
					
						
							|  |  |  | #               returned when doing so might be a problem. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub coalesce { | 
					
						
							|  |  |  |     foreach (@_) { | 
					
						
							|  |  |  |         return $_ if defined($_); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return undef;    ## no critic | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: _debug | 
					
						
							|  |  |  | #      PURPOSE: Prints debug reports | 
					
						
							|  |  |  | #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print | 
					
						
							|  |  |  | #               $message        Message to print | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing | 
					
						
							|  |  |  | #               newline and then adds one in the 'print' to the caller doesn't | 
					
						
							|  |  |  | #               have to bother. Prepends the message with 'D> ' to show it's | 
					
						
							|  |  |  | #               a debug message. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub _debug { | 
					
						
							|  |  |  |     my ( $active, $message ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     chomp($message); | 
					
						
							|  |  |  |     print "D> $message\n" if $active; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: Options | 
					
						
							|  |  |  | #      PURPOSE: Processes command-line options | 
					
						
							|  |  |  | #   PARAMETERS: $optref     Hash reference to hold the options | 
					
						
							|  |  |  | #      RETURNS: Undef | 
					
						
							|  |  |  | #  DESCRIPTION: Process the options we want to offer. See the documentation | 
					
						
							|  |  |  | #               for details | 
					
						
							|  |  |  | #       THROWS: no exceptions | 
					
						
							|  |  |  | #     COMMENTS: none | 
					
						
							|  |  |  | #     SEE ALSO: n/a | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub Options { | 
					
						
							|  |  |  |     my ($optref) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my @options = ( | 
					
						
							|  |  |  |         "help",               "documentation|man", | 
					
						
							|  |  |  |         "debug=i",            "verbose!", | 
					
						
							|  |  |  |         "silent!",            "test!", | 
					
						
							|  |  |  |         "meta_only|noaudio!", "from=i", | 
					
						
							|  |  |  |         "to=i",               "count=i", | 
					
						
							|  |  |  |         "list=s",             "output:s", | 
					
						
							|  |  |  |         "fetch!",             "ignore_missing|im!", | 
					
						
							|  |  |  |         "assets!",            "script:s", | 
					
						
							|  |  |  |         "a_count=s",          "dbconfig=s", | 
					
						
							|  |  |  |         "config=s", | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( !GetOptions( $optref, @options ) ) { | 
					
						
							|  |  |  |         pod2usage( | 
					
						
							|  |  |  |             -msg     => "$PROG version $VERSION\n", | 
					
						
							|  |  |  |             -exitval => 1, | 
					
						
							|  |  |  |             -verbose => 0 | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | __END__ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #  Application Documentation | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #{{{ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 NAME | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | make_metadata - Generate metadata from the HPR database for Archive.org | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 VERSION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This documentation refers to make_metadata version 0.4.14 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 USAGE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata [-help] [-documentation] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=FROM [-to=TO] [-count=COUNT] [-output[=FILE]] | 
					
						
							|  |  |  |         [-script[=FILE]] [-a_count=FILE] [-[no]meta_only] [-[no]fetch] | 
					
						
							|  |  |  |         [-[no]assets] [-[no]silent] [-[no]verbose] [-[no]test] | 
					
						
							|  |  |  |         [-[no]ignore_missing] [-config=FILE] [-dbconfig=FILE] [-debug=N] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -list=LIST [-output[=FILE]] [-script[=FILE]] | 
					
						
							|  |  |  |         [-[no]meta_only] [-[no]fetch] [-[no]assets] [-[no]silent] | 
					
						
							|  |  |  |         [-[no]verbose] [-[no]test] [-[no]ignore_missing] [-config=FILE] | 
					
						
							|  |  |  |         [-dbconfig=FILE] [-debug=N] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Examples: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1234 -nofetch | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1234 -to=1235 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1234 -count=10 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1 -to=3 -output=metadata_1-3.csv | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1500 -to=1510 -out=metadata_1500-1510.csv -verbose | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1500 -to=1510 -out=metadata_%d-%d.csv -verbose | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=500 -to=510 -out=metadata_%04d-%04d.csv -verbose | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1500 -to=1510 -out -verbose | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1500 -to=1510 -out | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1675 -to=1680 -out=metadata_%d-%d.csv -meta_only | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=1450 -test | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -list='1234,2134,2314' -out -meta_only | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -list="931,932,933,935,938,939,940" -out -meta -ignore | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -dbconf=.hpr_livedb.cfg -from=1234 -to=1235 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     make_metadata -from=3004 -out -meta_only -noassets | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 OPTIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-help> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Reports brief information about how to use the script and exits. To see the | 
					
						
							|  |  |  | full documentation use the option B<-documentation> or B<-man>. Alternatively, | 
					
						
							|  |  |  | to generate a PDF version use the I<pod2pdf> tool from | 
					
						
							|  |  |  | I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be | 
					
						
							|  |  |  | installed with the cpan tool as App::pod2pdf. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-documentation> or B<-man> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Reports full information about how to use the script and exits. Alternatively, | 
					
						
							|  |  |  | to generate a PDF version use the I<pod2pdf> tool from | 
					
						
							|  |  |  | I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be | 
					
						
							|  |  |  | installed with the cpan tool as App::pod2pdf. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-debug=N> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Run in debug mode at the level specified by I<N>. Possible values are: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<0> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | No debugging (the default). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<1> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<2> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<3> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<4 and above> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The metadata hash is dumped. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Each call of the function I<find_links_in_notes> is reported. On finding an | 
					
						
							|  |  |  | <a> or <img> tag the I<uri> value is shown, as is any fragment and the related | 
					
						
							|  |  |  | link. The original file is reported here. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Each call of the function I<find_links_in_file> is reported. On finding an | 
					
						
							|  |  |  | <a> or <img> tag the I<uri> value is shown, as is any fragment and the related | 
					
						
							|  |  |  | link. The original file is reported here, and if a link is to be ignored this | 
					
						
							|  |  |  | is reported. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-from=NUMBER> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option defines the starting episode number of a group. It is mandatory to | 
					
						
							|  |  |  | provide either the B<-from=NUMBER> option or the B<-list=LIST> option (see | 
					
						
							|  |  |  | below). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-to=NUMBER> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option specifies the final episode number of a group. If not given the | 
					
						
							|  |  |  | script generates metadata for the single episode indicated by B<-from>. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The value given here must be greater than or equal to that given in the | 
					
						
							|  |  |  | B<-from> option. The option must not be present with the B<-count> option. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The difference between the episode numbers given by the B<-from> and B<-to> | 
					
						
							|  |  |  | options must not be greater than 20. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-count=NUMBER> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option specifies the number of episodes to process (starting from the | 
					
						
							|  |  |  | episode number specified by the B<-from>) option. The option must not be | 
					
						
							|  |  |  | present with the B<-to> option. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The number of episodes specified must not be greater than 20. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-list=LIST> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option is an alternative to B<-from=NUMBER> and its associated modifying | 
					
						
							|  |  |  | options. The B<LIST> is a comma-separated list of not necessarily consecutive | 
					
						
							|  |  |  | episode numbers, and must consist of at least one and no more than 20 numbers. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option is useful for the case when non-sequential episode numbers are to | 
					
						
							|  |  |  | be uploaded, and is particularly useful when repairing elements of particular | 
					
						
							|  |  |  | episodes (such as adding summary fields and tags) where they have already | 
					
						
							|  |  |  | been uploaded. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | For example, the following shows have no summary and/or tags, but the shows | 
					
						
							|  |  |  | are already in the IA. The missing items have been provided, so we wish to | 
					
						
							|  |  |  | update the HTML part of the upload: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     $ ./make_metadata -list='2022,2027,2028,2029,2030,2033' -out -meta | 
					
						
							|  |  |  |     Output file: metadata_2022-2033.csv | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-output[=FILE]> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option specifies the file to receive the generated CSV data. If omitted | 
					
						
							|  |  |  | the output is written to B<metadata.csv> in the current directory. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The file name may contain one or two instances of the characters '%d', with | 
					
						
							|  |  |  | a leading width specification if desired (such as '%04d'). These will be | 
					
						
							|  |  |  | substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if | 
					
						
							|  |  |  | B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the | 
					
						
							|  |  |  | appropriate endpoint (adding the count to the starting number). If neither of | 
					
						
							|  |  |  | the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only | 
					
						
							|  |  |  | be one instance of '%d' or the script will abort. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If no value is provided to B<-output> then a suitable template will be | 
					
						
							|  |  |  | generated. It will be 'metadata_%04d.csv' if one episode is being processed, and | 
					
						
							|  |  |  | 'metadata_%04d-%04d.csv' if a range has been specified. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Example: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  ./make_metadata -from=1430 -out=metadata_%04d.csv | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | the output file name will be B<metadata_1430.csv>. The same effect can be | 
					
						
							|  |  |  | achieved with: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  ./make_metadata -from=1430 -out= | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | or | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  ./make_metadata -from=1430 -out | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-script[=FILE]> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option specifies the file to receive commands required to upload certain | 
					
						
							|  |  |  | files relating to a show. If omitted the commands are written to B<script.sh> | 
					
						
							|  |  |  | in the current directory. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The file name may contain one or two instances of the characters '%d', with | 
					
						
							|  |  |  | a leading width specification if desired (such as '%04d'). These will be | 
					
						
							|  |  |  | substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if | 
					
						
							|  |  |  | B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the | 
					
						
							|  |  |  | appropriate endpoint (adding the count to the starting number). If neither of | 
					
						
							|  |  |  | the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only | 
					
						
							|  |  |  | be one instance of '%d' or the script will abort. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If no value is provided to B<-script> then a suitable template will be | 
					
						
							|  |  |  | generated. It will be 'script_%04d.sh' if one episode is being processed, and | 
					
						
							|  |  |  | 'script_%04d-%04d.sh' if a range has been specified. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Example: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  ./make_metadata -from=1430 -script=script_%04d.sh | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | the output file name will be B<script_1430.sh>. The same effect can be | 
					
						
							|  |  |  | achieved with: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  ./make_metadata -from=1430 -script= | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | or | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  ./make_metadata -from=1430 -script | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-a_count=FILE> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Defines a file into which the script writes details of assets downloaded | 
					
						
							|  |  |  | during the analysis of notes (and other HTML files associated with a show). | 
					
						
							|  |  |  | The listing consists of the show identifier (e.g. 'hpr3901') followed by the | 
					
						
							|  |  |  | number of links followed to collect the files for this show. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This feature was added to allow other scripts to perform tasks with these | 
					
						
							|  |  |  | assets, but is now deprecated. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The feature will probably be removed in a later release of this script. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]fetch> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option controls whether the script attempts to fetch the MP3 audio file | 
					
						
							|  |  |  | from the HPR website should there be no WAV file in the upload area. The | 
					
						
							|  |  |  | default setting is B<-fetch>. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Normally the script is run as part of the workflow to upload the metadata and | 
					
						
							|  |  |  | audio to archive.org. The audio is expected to be a WAV file and to be in the | 
					
						
							|  |  |  | location referenced in the configuration file under the 'uploads' label. | 
					
						
							|  |  |  | However, not all of the WAV files exist for older shows. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | When the WAV file is missing and B<-fetch> is selected or defaulted, the | 
					
						
							|  |  |  | script will attempt to download the MP3 version of the audio and will store it | 
					
						
							|  |  |  | in the 'uploads' area for the upload script (B<ias3upload.pl> or B<ia>) to | 
					
						
							|  |  |  | send to archive.org. If the MP3 file is not found then the script will abort. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If B<-fetch> is specified (or defaulted) as well as B<-nometa_only> (see | 
					
						
							|  |  |  | below) then the audio file fetching process will not be carried out. This is | 
					
						
							|  |  |  | because it makes no sense to fetch this file if it's not going to be | 
					
						
							|  |  |  | referenced in the metadata. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]assets> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option controls the downloading of any assets that may be associated with | 
					
						
							|  |  |  | a show. Assets are the files held on the HPR server which are referenced by | 
					
						
							|  |  |  | the show. Examples might be photographs, scripts, and supplementary notes. | 
					
						
							|  |  |  | Normally all such assets are collected and stored in the upload area and are | 
					
						
							|  |  |  | then sent to the archive via the script. The notes sent to the archive are | 
					
						
							|  |  |  | adjusted to refer to these assets on archive.org, making the HPR episode | 
					
						
							|  |  |  | completely self-contained. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]meta_only> (alias B<-[no]noaudio>) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option controls whether the output file will contain a reference to the | 
					
						
							|  |  |  | audio file(s) or only the metadata. The default is B<-nometa_only> meaning that | 
					
						
							|  |  |  | the file reference(s) and the metadata are present. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Omitting the file(s) allows the metadata to be regenerated, perhaps due to | 
					
						
							|  |  |  | edits and corrections in the database, and the changes to be propagated to | 
					
						
							|  |  |  | archive.org. If the file reference(s) exist(s) in the metadata file then the | 
					
						
							|  |  |  | file(s) must be available at the time the uploader is run. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Note that making changes this way is highly preferable to editing the entry on | 
					
						
							|  |  |  | archive.org using the web-based editor. This is because there is a problem | 
					
						
							|  |  |  | with the way HTML entities are treated and this can cause the HTML to be | 
					
						
							|  |  |  | corrupted. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]silent> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The option enables (B<-silent>) and disables (B<-nosilent>) I<silent mode>. | 
					
						
							|  |  |  | When enabled the script reports nothing on STDOUT. If the script cannot find | 
					
						
							|  |  |  | the audio files and downloads the MP3 version from the HPR site for upload to | 
					
						
							|  |  |  | archive.org then the downloads are reported on STDERR. This cannot be | 
					
						
							|  |  |  | disabled, though the STDERR output could be redirected to a file or to | 
					
						
							|  |  |  | /dev/null. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If B<-silent> is specified with B<-verbose> then the latter "wins". | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script runs with silent mode disabled by default. When B<-nosilent> is | 
					
						
							|  |  |  | used with B<-noverbose> the script reports the output file name and nothing | 
					
						
							|  |  |  | else. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]verbose> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option enables (B<-verbose>) and disables (B<-noverbose>) | 
					
						
							|  |  |  | I<verbose mode>. When enabled the script reports the metadata it has collected | 
					
						
							|  |  |  | from the database before writing it to the output file. The data is reported | 
					
						
							|  |  |  | in a more readable mode than examining the CSV file, although another script | 
					
						
							|  |  |  | B<show_metadata> is also available to help with this. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If B<-verbose> is specified with B<-silent> then the former "wins". | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script runs with verbose mode disabled by default. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]ignore_missing> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script checks each episode to ensure it has a summary and tags. If either | 
					
						
							|  |  |  | of these fields is missing then a warning message is printed for that episode | 
					
						
							|  |  |  | (unless B<-silent> has been chosen), and if any episodes are lacking this | 
					
						
							|  |  |  | information the script aborts without producing metadata. If the option | 
					
						
							|  |  |  | B<-ignore_missing> is selected then the warnings are produced (dependent on | 
					
						
							|  |  |  | B<-silent>) but the script runs to completion. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The default setting is B<-noignore_missing>; the script checks and aborts if | 
					
						
							|  |  |  | any summaries or tags are missing. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]test> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | DO NOT USE! | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option enables (B<-test>) and disables (B<-notest>) | 
					
						
							|  |  |  | I<test mode>. When enabled the script generates metadata containing various | 
					
						
							|  |  |  | test values. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | In test mode the following changes are made: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item . | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The item names, which normally contain 'hprnnnn', built from the episode | 
					
						
							|  |  |  | number, have 'test_' prepended to them. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item . | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The collection, which is normally a list containing 'hackerpublicradio' and | 
					
						
							|  |  |  | 'podcasts', is changed to 'test_collection'. Items in this collection are | 
					
						
							|  |  |  | normally deleted by Archive.org after 30 days. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item . | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The contributor, which is normally 'HackerPublicRadio' is changed to | 
					
						
							|  |  |  | 'perlist'. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | B<NOTE> The test mode only works for the author! | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-config=FILE> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option allows an alternative script configuration file to be used. This | 
					
						
							|  |  |  | file defines various settings relating to the running of the script - things | 
					
						
							|  |  |  | like the place to look for the files to be uploaded. It is rare to need to use | 
					
						
							|  |  |  | any other file than the default since these are specific to the environment | 
					
						
							|  |  |  | in which the script runs. However, this has been added at the same time as an | 
					
						
							|  |  |  | alternative database configuration option was added. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | See the CONFIGURATION AND ENVIRONMENT section below for the file format. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If the option is omitted the default file is used: B<.make_metadata.cfg> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-dbconfig=FILE> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option allows an alternative database configuration file to be used. This | 
					
						
							|  |  |  | file defines the location of the database, its port, its name and the username | 
					
						
							|  |  |  | and password to be used to access it. This feature was added to allow the | 
					
						
							|  |  |  | script to access alternative databases or the live database over an SSH | 
					
						
							|  |  |  | tunnel. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | See the CONFIGURATION AND ENVIRONMENT section below for the file format. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If the option is omitted the default file is used: B<.hpr_db.cfg> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DESCRIPTION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This script generates metadata suitable for uploading Hacker Public Radio | 
					
						
							|  |  |  | episodes to the Internet Archive (archive.org). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The metadata is in comma-separated variable (CSV) format suitable for | 
					
						
							|  |  |  | processing with an upload script. The original upload script was called | 
					
						
							|  |  |  | B<ias3upload.pl>, and could be obtained from | 
					
						
							|  |  |  | I<https://github.com/kngenie/ias3upload>. This script is no longer supported | 
					
						
							|  |  |  | and B<make_metadata> no longer generates output suitable for it (though it is | 
					
						
							|  |  |  | simple to make it compatible if necessary). The replacement script is called | 
					
						
							|  |  |  | B<internetarchive> which is a Python tool which can also be run from the | 
					
						
							|  |  |  | command line. It can be found at I<https://github.com/jjjake/internetarchive>. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The B<make_metadata> script generates CSV from the HPR database. It looks up | 
					
						
							|  |  |  | details for each episode selected by the options, and performs various | 
					
						
							|  |  |  | conversions and concatenations. The goal is to prepare items for the Internet | 
					
						
							|  |  |  | Archive with as much detail as the format can support. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The resulting CSV file contains a header line listing the field names required | 
					
						
							|  |  |  | by archive.org followed by as many CSV lines of episode data as requested (up | 
					
						
							|  |  |  | to a limit of 20). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Since the upload method uses the HTTP protocol with fields stored in headers, | 
					
						
							|  |  |  | there are restrictions on the way HTML can be formatted in the B<Details> | 
					
						
							|  |  |  | field. The script converts newlines, which are not allowed, into I<< <br/> >> tags | 
					
						
							|  |  |  | where necessary. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | HPR shows often have associated files, such as pictures, examples, long-form | 
					
						
							|  |  |  | notes and so forth. The script finds these and downloads them to the cache | 
					
						
							|  |  |  | area where the audio is kept and writes the necessary lines to the CSV file to | 
					
						
							|  |  |  | ensure they are uploaded with the show. It modifies any HTML which links to | 
					
						
							|  |  |  | these files to link to the archive.org copies in order to make the complete | 
					
						
							|  |  |  | show self-contained. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DIAGNOSTICS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Configuration file ... not found> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | One or more of the configuration files has not been found. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Path ... not found> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The path specified in the B<uploads> definition in the configuration file | 
					
						
							|  |  |  | B<.make_metadata.cfg> does not exist. Check the configuration file. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Configuration data missing> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | While checking the configuration file(s) the script has detected that settings | 
					
						
							|  |  |  | are missing. Check the details specified below and provide the missing | 
					
						
							|  |  |  | elements. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Mis-match between @fields and %dispatch!> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | An internal error in the script has been detected where the elements of the | 
					
						
							|  |  |  | @fields array do not match the keys of the %dispatch hash. This is probably the | 
					
						
							|  |  |  | result of a failed attempt to edit either of these components. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Correct the error and run the script again. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid list; no elements> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There are no list elements in the B<-list=LIST> option. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid list; too many elements> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There are more than the allowed 20 elements in the list specified by the | 
					
						
							|  |  |  | B<-list=LIST> option. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Failed to parse -list=...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list was specified that did not contain a CSV list of numbers. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid starting episode number (...)> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The value used in the B<-from> option must be greater than 0. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Do not combine -to and -count> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Using both the B<-to> and B<-count> is not permitted (and makes no sense). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid range; ... is greater than ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The B<-from> episode number must be less than or equal to the B<-to> number. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid range; range is too big (E<gt>20)> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The difference between the starting and ending episode number is greater than | 
					
						
							|  |  |  | 20. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid - too many '%d' sequences in '...'> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There were more than two '%d' sequences in the name of the output file if | 
					
						
							|  |  |  | a range of episodes is being processed, or more than one if a single episode | 
					
						
							|  |  |  | has been specified. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid - too few '%d' sequences in '...'> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There were fewer than two '%d' sequences in the name of the output file | 
					
						
							|  |  |  | when a range of episodes was being processed. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Unable to open ... for output: ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script was unable to open the requested output file. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Unable to find or download ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script has not found a I<.WAV> file in the cache area so has attempted to | 
					
						
							|  |  |  | download the I<MP3> copy of the audio from the HPR website. This process has | 
					
						
							|  |  |  | failed. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Failed to find requested episode> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | An episode number could not be found in the database. This error is not fatal. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Nothing to do> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | After processing the range of episodes specified the script could not find | 
					
						
							|  |  |  | anything to do. This is most often caused by all of the episodes in the range | 
					
						
							|  |  |  | being invalid. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Aborted due to missing summaries and/or tags> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | One or more of the shows being processed does not have a summary or tags. The | 
					
						
							|  |  |  | script has been told not to ignore this so has aborted before generating | 
					
						
							|  |  |  | metadata. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<HTML::TreeBuilder failed to parse notes: ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script failed to parse the HTML in the notes of one of the episodes. This | 
					
						
							|  |  |  | indicates a serious problem with these notes and is fatal since these notes | 
					
						
							|  |  |  | need to be corrected before the episode is uploaded to the Internet Archive. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<HTML::TreeBuilder failed to process ...: ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | While parsing the HTML in a related file the parse has failed. The file being | 
					
						
							|  |  |  | parsed is reported as well as the error that was encountered. This is likely | 
					
						
							|  |  |  | due to bad HTML. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Unable to open ... for writing: ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script is attempting to open an HTML file which it has downloaded to | 
					
						
							|  |  |  | write back edited HTML, yet the open has failed. The filename is in the error | 
					
						
							|  |  |  | message as is the cause of the error. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 CONFIGURATION AND ENVIRONMENT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This script reads two configuration files in B<Config::General> format | 
					
						
							|  |  |  | (similar to Apache configuration files) for the path to the files to be | 
					
						
							|  |  |  | uploaded and for credentials to access the HPR database. Two files are used | 
					
						
							|  |  |  | because the database configuration file is used by several other scripts. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head2 SCRIPT CONFIGURATION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The general configuration file is B<.make_metadata.cfg> (although this can be | 
					
						
							|  |  |  | overridden through the B<-config=FILE> option) and contains the following | 
					
						
							|  |  |  | lines: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  uploads = "<path to files>" | 
					
						
							|  |  |  |  filetemplate = "hpr%04d.%s" | 
					
						
							|  |  |  |  baseURL = "http://hackerpublicradio.org" | 
					
						
							|  |  |  |  sourceURLtemplate = "https://hackerpublicradio.org/eps/%s/index.html" | 
					
						
							|  |  |  |  IAURLtemplate = "http://archive.org/download/%s/%s" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The I<uploads> line defines where the WAV files are to be found (currently | 
					
						
							|  |  |  | I</var/IA/uploads> on the VPS). The same area is used to store downloaded MP3 | 
					
						
							|  |  |  | files and any supplementary files associated with the episode. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The I<filetemplate> line defines the format of an audio file such as | 
					
						
							|  |  |  | I<hpr1234.wav>. This should not be changed. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The I<baseURL> line defines the common base for download URLs. It is used when | 
					
						
							|  |  |  | parsing and standardising URLs relating to files on the HPR server. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The I<sourceURLtemplate> line defines the format of the URL required to access the | 
					
						
							|  |  |  | show on the HPR site. This should not be changed except in the unlikely event that | 
					
						
							|  |  |  | these URLs change. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The I<IAURLtemplate> line defines the format of URLs on archive.org which is | 
					
						
							|  |  |  | used when generating new links in HTML notes or supplementary files. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head2 DATABASE CONFIGURATION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The database configuration file is B<.hpr_db.cfg> (although this can be | 
					
						
							|  |  |  | overridden through the B<-dbconfig=FILE> option). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The layout of the file should be as follows: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  <database> | 
					
						
							|  |  |  |      host = 127.0.0.1 | 
					
						
							|  |  |  |      port = PORT | 
					
						
							|  |  |  |      name = DATABASE | 
					
						
							|  |  |  |      user = USERNAME | 
					
						
							|  |  |  |      password = PASSWORD | 
					
						
							|  |  |  |  </database> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DEPENDENCIES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Carp | 
					
						
							|  |  |  |     Config::General | 
					
						
							|  |  |  |     DBI | 
					
						
							|  |  |  |     Data::Dumper | 
					
						
							|  |  |  |     File::Find::Rule | 
					
						
							|  |  |  |     File::Path | 
					
						
							|  |  |  |     Getopt::Long | 
					
						
							|  |  |  |     HTML::Entities | 
					
						
							|  |  |  |     HTML::TreeBuilder | 
					
						
							|  |  |  |     IO::HTML | 
					
						
							|  |  |  |     LWP::Simple | 
					
						
							|  |  |  |     List::MoreUtils | 
					
						
							|  |  |  |     List::Util | 
					
						
							|  |  |  |     Pod::Usage | 
					
						
							|  |  |  |     Text::CSV_XS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 BUGS AND LIMITATIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There are no known bugs in this module. | 
					
						
							|  |  |  | Please report problems to Dave Morriss (Dave.Morriss@gmail.com) | 
					
						
							|  |  |  | Patches are welcome. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 AUTHOR | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Dave Morriss  (Dave.Morriss@gmail.com) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 LICENCE AND COPYRIGHT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Copyright (c) 2014-2019 Dave Morriss (Dave.Morriss@gmail.com). | 
					
						
							|  |  |  | All rights reserved. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This module is free software; you can redistribute it and/or | 
					
						
							|  |  |  | modify it under the same terms as Perl itself. See perldoc perlartistic. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =cut | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #}}} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # [zo to open fold, zc to close] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker |