forked from HPR/hpr-tools
		
	
		
			
	
	
		
			2801 lines
		
	
	
		
			101 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			2801 lines
		
	
	
		
			101 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | #!/usr/bin/env perl | ||
|  | #=============================================================================== | ||
|  | # | ||
|  | #         FILE: make_metadata | ||
|  | # | ||
|  | #        USAGE: ./make_metadata -from=FROM [-to=TO] [-count=COUNT] | ||
|  | #               [-output=FILE] [-[no]fetch] [-[no]meta_only] [-[no]verbose] | ||
|  | #               [-[no]silent] [-[no]test] [-help] [-{documentation|man}] | ||
|  | # | ||
|  | #  DESCRIPTION: Make metadata for the uploading of a range of HPR episodes to | ||
|  | #               the Internet Archive by collecting fields from the database. | ||
|  | # | ||
|  | #      OPTIONS: See the POD documentation for full details (./make_metadata | ||
|  | #               -man) or use 'pod2pdf' to generate a PDF version in the | ||
|  | #               directory holding this script. | ||
|  | # REQUIREMENTS: --- | ||
|  | #         BUGS: --- | ||
|  | #        NOTES: 2023-07-06: A version in development 0.4.14 has been put aside | ||
|  | #               and this version (0.4.12) made into the main line version | ||
|  | #               because 4.14 was developing in a direction that doesn't fit | ||
|  | #               with the changes made to the HPR system in June/July 2023. | ||
|  | #               Will now move forward with version numbers. | ||
|  | #               2024-01-23: Added the 'open' pragma for UTF-8 | ||
|  | #               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||
|  | #      VERSION: 0.4.14 | ||
|  | #      CREATED: 2014-06-13 12:51:04 | ||
|  | #     REVISION: 2024-01-23 16:28:59 | ||
|  | # | ||
|  | #=============================================================================== | ||
|  | 
 | ||
|  | use 5.010; | ||
|  | use strict; | ||
|  | use warnings; | ||
|  | use open ':encoding(UTF-8)'; | ||
|  | #use utf8; | ||
|  | 
 | ||
|  | use Carp; | ||
|  | use Getopt::Long; | ||
|  | use Pod::Usage; | ||
|  | 
 | ||
|  | use List::Util qw{max}; | ||
|  | use List::MoreUtils qw{uniq apply}; | ||
|  | use Config::General; | ||
|  | use Text::CSV_XS; | ||
|  | 
 | ||
|  | use IO::HTML; | ||
|  | use HTML::TreeBuilder 5 -weak; | ||
|  | use HTML::Entities; | ||
|  | 
 | ||
|  | use File::Find::Rule; | ||
|  | use File::Path qw{make_path}; | ||
|  | #use LWP::Simple; | ||
|  | use LWP::UserAgent; | ||
|  | use HTTP::Status qw(status_message); | ||
|  | 
 | ||
|  | use DBI; | ||
|  | 
 | ||
|  | use Data::Dumper; | ||
|  | 
 | ||
#
# Version number (manually incremented)
#
our $VERSION = '0.4.14';

#
# Script name with the leading path stripped; used in messages and to build
# the default configuration file name below
#
( my $PROG = $0 ) =~ s|.*/||mx;

#-------------------------------------------------------------------------------
# Constants and defaults
#-------------------------------------------------------------------------------
# Base directory of the IA work area. The two configuration files default to
# hidden files named after this script, but both can be overridden on the
# command line (-config and -dbconfig).
my $basedir      = "$ENV{HOME}/HPR/IA";
my $configfile   = "$basedir/.$PROG.cfg";
my $dbconfigfile = "$basedir/.hpr_db.cfg";

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants used in the metadata (fixed values for the IA 'mediatype',
# 'collection', 'language' and 'contributor' fields)
#
my $mediatype   = 'audio';
my $collection  = 'hackerpublicradio';
my $language    = 'eng';
my $contributor = 'HackerPublicRadio';

#
# Variables, arrays and hashes. These are file-scoped because the %dispatch
# closures below and the main processing loop both refer to them.
#
my ( $dbh, $sth1, $h1 );
my ( $DEBUG, $verbose, $silent, $test, $from, $to, $count );
my ( $list, $meta_only, $outfile, $scriptfile, $fetch, $assets);
my ( $acountfile, $ignore );
my ( @range, $tree, %links, %meta, @source, @transcripts, @counts, @head, @data );
my ( $ep_name, $ft, $file, $URL, $status, $incomplete );
my ( $filepath, $max_epno, $filetemplate, $sourceURLtemplate );
my ( $iauploadtemplate, $iauploadoptions );

#
# File types we'll look for when working out (and collecting) the main audio
# file and their order (preferred format first)
#
my @filetypes = (qw{ wav mp3 ogg });

#
# Additional filetypes we'll look for if we're working round the lack of tags
# on the derived audio files. See the discussion at
# https://gitlab.anhonesthost.com/HPR/HPR_Public_Code/issues/34
#
my @additional_audio = (qw{ flac mp3 ogg opus spx });

#
# Templates for finding the transcripts.
#
# We expect these to be in $uploads/$episode/ where we expect to find:
#     $episode.json
#     $episode.srt
#     $episode.tsv
#     $episode.txt
#     $episode.vtt
# The regular expression below should find the transcripts given that the
# search is directed to the sub-directory in $uploads.
#
my $transcript_re = qr{hpr\d{4}\.(json|srt|tsv|txt|vtt)};

#
# Names of CSV fields and their order. Every name here must have a matching
# key in %dispatch (validated after %dispatch is defined).
#
my @fields = (
    qw{
        identifier file mediatype collection title creator language
        description contributor date subject licenseurl
        }
);

#
# Maximum episodes per run (applies to both the -list form and the
# -from/-to/-count form of episode selection)
#
my $max_episodes = 20;
|  | 
 | ||
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A dispatch table to define how to build metadata. The keys are actually
# header names to be used in the metadata file and must relate to the @fields
# array above. Two types of values are expected here: 1) anonymous subroutine
# references which return the value for the field or 2) scalars (or strings)
# to be placed as-is. Some of the anonymous subroutines are written as
# closures (see
# http://en.wikipedia.org/wiki/Closure_%28computer_programming%29), which make
# it easier to build the calling code since we can pass in a variety of
# arguments internally while offering a simple no-argument call externally.
# NOTE: the closures capture $h1 (current episode row), $test, $meta_only,
# $filepath, $filetemplate, $sourceURLtemplate, $tree and @source, all of
# which are (re)populated per episode in the main loop before the table is
# invoked.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my %dispatch = (
    identifier => sub { return make_item( $h1, $test ) },
    file       => sub {
        # No audio file is named in 'metadata only' mode
        return (
            $meta_only
            ? ''
            : make_filename( $filepath, $filetemplate, \@filetypes, $h1 )
        );
    },
    mediatype   => $mediatype,
    collection  => $collection,
    #title       => sub { return decode_entities( db_title($h1) ) },
    title       => sub { return db_title($h1) },
    creator     => sub { return $h1->{host} },
    language    => $language,
    description => sub { return db_notes( $sourceURLtemplate, $h1, $tree, \@source ) },
    contributor => $contributor,
    date       => sub { return $h1->{date} },
    subject    => sub { return db_tags($h1) },
    licenseurl => sub { return $h1->{license_url} }
);

#
# Validate the data structures above to ensure there have been no mis-edits:
# every CSV field name in @fields must have a generator (or constant) in
# %dispatch, otherwise we abort before doing any work.
#
foreach (@fields) {
    die "Mis-match between \@fields and \%dispatch!\n"
        unless ( defined( $dispatch{$_} ) );
}
|  | 
 | ||
#
# The main query on the database. We want episode details, the host's name,
# the series name and the Creative Commons licence the episode is under.
# Note that the 'LEFT JOIN' for 'miniseries' is for the case where the
# 'series' column of 'eps' is 'NULL'. This is not the case for the live
# database - yet!
# The single placeholder is the episode number; one row is expected back.
#
my $sql = q{
    SELECT
        e.id AS eps_id,
        e.date,
        e.title,
        sec_to_time(e.duration) AS duration,
        e.summary,
        e.notes,
        e.explicit,
        e.license,
        l.url as license_url,
        e.tags,
        e.valid,
        h.host,
        s.name AS s_name,
        s.description AS s_desc
    FROM eps e
        JOIN hosts h ON e.hostid = h.hostid
        LEFT JOIN miniseries s ON e.series = s.id
        JOIN licenses l ON e.license = l.short_name
    WHERE e.id = ?
    };

#
# Enable Unicode mode on the standard streams (database fields and notes may
# contain non-ASCII characters)
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
|  | 
 | ||
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

# Parse the command line; Options() (defined later in this file) fills
# %options via Getopt::Long
my %options;
Options( \%options );

#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );

#
# Collect options, applying defaults for the boolean flags. Note the
# defaults: -fetch and -assets are on unless negated, everything else is off.
#
$DEBUG   = ( defined( $options{debug} )   ? $options{debug}   : $DEFDEBUG );
$verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
$silent  = ( defined( $options{silent} )  ? $options{silent}  : 0 );
$test    = ( defined( $options{test} )    ? $options{test}    : 0 );
$from    = $options{from};
$to      = $options{to};
$count   = $options{count};
$list    = $options{list};
$meta_only = ( defined( $options{meta_only} ) ? $options{meta_only} : 0 );
$fetch     = ( defined( $options{fetch} )     ? $options{fetch}     : 1 );
$assets    = ( defined( $options{assets} )    ? $options{assets}    : 1 );
$ignore
    = ( defined( $options{ignore_missing} ) ? $options{ignore_missing} : 0 );
$outfile    = $options{output};
$scriptfile = $options{script};
$acountfile = $options{a_count};

#
# The two config files can be overridden or default to the original
# declarations
#
my $cfgfile
    = ( defined( $options{config} ) ? $options{config} : $configfile );
my $dbcfgfile
    = ( defined( $options{dbconfig} ) ? $options{dbconfig} : $dbconfigfile );
|  | 
 | ||
#-------------------------------------------------------------------------------
# Load and validate program configuration data
#-------------------------------------------------------------------------------
die "Configuration file $cfgfile not found\n" unless ( -e $cfgfile );

#
# Parse the script configuration file. We load $cfgfile — the file selected
# above, which honours the -config command line option — not the built-in
# default $configfile. (Previously the existence check used $cfgfile but the
# load used $configfile, so a -config override was silently ignored.)
#
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %config = $conf->getall();
|  | 
 | ||
#
# Check the config settings. The 'uploads' path must exist, and all of the
# template/option settings are mandatory.
#
$filepath = $config{uploads};

die "Path $filepath not found\n" unless ( -e $filepath );

$max_epno          = $config{max_epno};
$filetemplate      = $config{filetemplate};
$sourceURLtemplate = $config{sourceURLtemplate};
$iauploadtemplate  = $config{iauploadtemplate};
$iauploadoptions   = $config{iauploadoptions};

#
# Validate BEFORE appending the newline to the upload template. (Appending
# first turned a missing 'iauploadtemplate' setting into the truthy string
# "\n", which defeated this check.)
#
die "Configuration data missing\n"
    unless $max_epno
    && $filetemplate
    && $sourceURLtemplate
    && $iauploadtemplate
    && $iauploadoptions;

#
# The template is used with printf to write lines to the Bash script, so it
# needs a trailing newline
#
$iauploadtemplate .= "\n";
 | ||
#-------------------------------------------------------------------------------
# Load database configuration data (credentials and connection details used
# for the DBI connection further down)
#-------------------------------------------------------------------------------
die "Configuration file $dbcfgfile not found\n"  unless ( -e $dbcfgfile );

my $dbconf = Config::General->new(
    -ConfigFile      => $dbcfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %dbcfg = $dbconf->getall();
|  | 
 | ||
#-------------------------------------------------------------------------------
# Validate options: exactly one of -from and -list must be given (they are
# mutually exclusive ways of selecting episodes)
#-------------------------------------------------------------------------------
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( ( !defined($from) and !defined($list) )
    or ( defined($from) and defined($list) ) );
 | ||
#
# Deal with the values specified by one of:
#     -from=N1 -to=N2
#     -from=N3 -count=N4
#     -list="N1,N2,N3"
#
# End by populating @range with a list of episode numbers
#
if ( defined($list) ) {
    #
    # We have a list which we'll parse, trim, validate, sort, make unique
    # and filter
    #
    my $lcsv = Text::CSV_XS->new( { binary => 1, } );
    if ( $lcsv->parse($list) ) {
        #
        # Trim whitespace from both ends first (the old trailing 'apply'
        # with an un-/g substitution only trimmed one end), then keep only
        # wholly numeric elements so the numeric sort cannot warn on junk,
        # then sort, de-duplicate and drop out-of-range episode numbers.
        #
        @range = apply { $_ =~ s/(^\s*|\s*$)//g } $lcsv->fields();
        @range = grep {/^\d+$/} @range;
        @range = uniq( sort { $a <=> $b } @range );
        @range = grep { $_ > 0 && $_ <= $max_epno } @range;

        die "Invalid list; no elements\n" if scalar(@range) == 0;
        die "Invalid list; too many elements\n"
            if scalar(@range) > $max_episodes;
    }
    else {
        die "Failed to parse -list='$list'\n" . $lcsv->error_diag() . "\n";
    }
}
else {
    #
    # We have -from=N, -to=M or -count=X
    #
    die "Invalid starting episode number ($from)\n" unless $from > 0;
    die "Do not combine -to and -count\n"
        if ( defined($to) && defined($count) );

    if ( defined($count) ) {
        $to = $from + $count - 1;
    }
    elsif ( !defined($to) ) {
        $to = $from;
    }

    die "Invalid range; $from is greater than $to\n" unless $from <= $to;

    #
    # Count episodes inclusively ($to - $from + 1) so this cap matches the
    # '-list' branch above, which limits the number of elements. (The old
    # test on the bare difference allowed one episode too many.)
    #
    die "Invalid range; range is too big (>$max_episodes): "
        . ( $to - $from + 1 )
        . " episodes requested\n"
        if ( $to - $from + 1 ) > $max_episodes;

    @range = ( $from .. $to );
}
|  | 
 | ||
#
# Check we got a sensible range (the '-list' filters above may have removed
# every element)
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    unless scalar(@range) > 0;

#
# You can't be silent and verbose at the same time; verbose wins
#
$silent = 0 if $verbose;

#
# Fetching audio files is not relevant in 'metadata only' mode
# TODO: -nofetch with -meta_only incorrectly results in the download of files;
# see the Journal in regard to show 3004 in April 2020.
# TODO: 2023-07-06 Time to drop '-[no]fetch' completely?
#
$fetch = 0 if $meta_only;

#
# If there's a defined output filename then there are three options: a null
# string, a plain filename and a substitution string with '%d' sequences. The
# null string means the user used '-output' without a value, so we want to
# generate a substitution string. A string with '%d' requires a check to
# ensure there's the right number, one for $ubound and one for $lbound. The
# plain filename needs no more work.
#
# If no defined output filename we'll provide a default.
#
# expand_template() (defined later in this file) handles all three cases;
# the bounds come from the first/last list elements or from -from/-to.
#
$outfile = expand_template( $outfile, 'metadata', 'csv',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );

#
# Check the -script option. Like -output the choices are: a null string,
# a plain filename or a substitution string with '%d' sequences.
#
$scriptfile = expand_template( $scriptfile, 'script', 'sh',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );
|  | 
 | ||
#-------------------------------------------------------------------------------
# In verbose mode report the options. The first group of rows is only shown
# when the corresponding option was given on the command line; the second
# group is always shown.
#-------------------------------------------------------------------------------
if ($verbose) {
    my $fmt = "%18s: %s\n";

    #
    # [ label, value, option key ] — printed only if the option was supplied
    #
    my @conditional = (
        [ 'From',  $from,  'from'  ],
        [ 'To',    $to,    'to'    ],
        [ 'Count', $count, 'count' ],
        [ 'List',  $list,  'list'  ],
    );
    for my $row (@conditional) {
        printf $fmt, $row->[0], $row->[1]
            if defined( $options{ $row->[2] } );
    }

    #
    # [ label, value ] — always printed, in this order
    #
    my @unconditional = (
        [ 'Output',             $outfile ],
        [ 'Output script',      $scriptfile ],
        [ 'Output asset count', ( $acountfile ? $acountfile : '--' ) ],
        [ 'Meta only',          ( $meta_only ? 'Yes' : 'No' ) ],
        [ 'Fetch',              ( $fetch     ? 'Yes' : 'No' ) ],
        [ 'Silent',             ( $silent    ? 'Yes' : 'No' ) ],
        [ 'Verbose',            ( $verbose   ? 'Yes' : 'No' ) ],
        [ 'Debug',              $DEBUG ],
        [ 'Script config',      $cfgfile ],
        [ 'Database config',    $dbcfgfile ],
    );
    printf $fmt, @{$_} for @unconditional;

    print '-' x 80, "\n";
}
|  | 
 | ||
|  | 
 | ||
#-------------------------------------------------------------------------------
# Set up output and script file
#-------------------------------------------------------------------------------
# The CSV metadata goes to $outfile, UTF-8 encoded
open( my $out, '>:encoding(UTF-8)', $outfile )
    or die "Unable to open $outfile for output: $!\n";

#
# Open the script file and add a header. Set execute permission on the
# file for the owner, and nobody else. We keep a count of lines (other than
# the header stuff) written to this file.
# The header defines a Bash 'Upload' function wrapping 'ia upload', taking
# identifier, local file, optional remote name and optional extra options.
#
my $script_lines = 0;
open( my $script, '>:encoding(UTF-8)', $scriptfile )
    or die "Unable to open $scriptfile for output: $!\n";
print $script "#!/usr/bin/env bash\n\n";
print $script <<'EndHead';
Upload () {
    local id=${1}
    local file=${2}
    local remote=${3:-}
    local options=${4:-}

    if [[ -e $file ]]; then
        if [[ -z $remote ]]; then
            ia upload ${id} ${file} ${options}
        else
            ia upload ${id} ${file} --remote-name=${remote} ${options}
        fi
    else
        echo "File missing: $file"
    fi
}

EndHead

# rwxr--r--: owner can execute the generated script
chmod(0744,$scriptfile);
|  | 
 | ||
#-------------------------------------------------------------------------------
# Make changes for test mode
#-------------------------------------------------------------------------------
# NOTE: Commented out 2023-07-06 since this is dangerous
#if ($test) {
#    $dispatch{collection}  = 'test_collection';
#    $dispatch{contributor} = 'perlist';
#}

#
# Prepare to turn the data into CSV data. Since we're allowing embedded
# newlines (binary mode) add an explicit end of line.
#
# TODO: The new tool 'ia' does not accept newlines in the CSV, so no point in
# defining them here. They have been removed elsewhere in the code.
#
my $csv = Text::CSV_XS->new( { binary => 1, eol => "\r\n" } );
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Copy audio files from the website if requested and if needed. We look for | ||
|  | # the WAV file in the 'uploads' area but if it's not found we get the MP3 | ||
|  | # (which is all we have for older shows). | ||
|  | # | ||
|  | # TODO: Given we're redirecting requests for audio files on the HPR site to | ||
|  | # archive.org there's potential for a nasty paradox here if the rewrite rules | ||
|  | # aren't perfect. We do the MP3 thing only for old shows, so we need to make | ||
|  | # sure that it works when we need it. | ||
|  | # TODO: Are there any logic flaws here? This was designed before we started | ||
|  | # creating all the audio formats we have now (partly to avoid the 'derive' | ||
|  | # process on archive.org from making copies without tags). | ||
|  | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|  | # NOTE: Commented out 2023-07-06 because (a) this situation should not occur | ||
|  | # again now that all audio versions are on the IA and we get everything for | ||
|  | # new shows, (b) the URLs used for downloads are obsolete. | ||
|  | #------------------------------------------------------------------------------- | ||
|  | #if ( $fetch && !$meta_only ) { | ||
|  | # | ||
|  | #    foreach my $episode (@range) { | ||
|  | #        $ft = 0; | ||
|  | # | ||
|  | #        # | ||
|  | #        # Does the WAV version already exist? | ||
|  | #        # | ||
|  | #        $file = sprintf( $filetemplate, $episode, $filetypes[$ft] ); | ||
|  | #        if ( ! -e "$filepath/$file" ) { | ||
|  | #            # | ||
|  | #            # Doesn't exist, try the alternative version | ||
|  | #            # | ||
|  | #            $ft++; | ||
|  | #            $file = sprintf( $filetemplate, $episode, $filetypes[$ft] ); | ||
|  | #            if ( !-e "$filepath/$file" ) { | ||
|  | #                # | ||
|  | #                # We need to download the file | ||
|  | #                # | ||
|  | #                $URL = sprintf( $URLtemplate, $file ); | ||
|  | #                print STDERR "Downloading $URL\n" unless $silent; | ||
|  | #                $status = getstore( $URL, "$filepath/$file" ); | ||
|  | #                $status == 200 or warn "$status : $URL ($file)\n"; | ||
|  | #            } | ||
|  | #        } | ||
|  | # | ||
|  | #        die "Unable to find or download $file\n" | ||
|  | #            unless ( -e "$filepath/$file" ); | ||
|  | #    } | ||
|  | # | ||
|  | #} | ||
|  | 
 | ||
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
# Connection details come from the database configuration file loaded above;
# host and port fall back to local defaults when not specified
my $dbhost = $dbcfg{database}->{host} // '127.0.0.1';
my $dbport = $dbcfg{database}->{port} // 3306;
my $dbname = $dbcfg{database}->{name};
my $dbuser = $dbcfg{database}->{user};
my $dbpwd  = $dbcfg{database}->{password};
$dbh = DBI->connect( "dbi:mysql:database=$dbname;host=$dbhost;port=$dbport",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or croak $DBI::errstr;

#
# Enable client-side UTF8 (both attributes set for compatibility across
# DBD::mysql versions; utf8mb4 covers 4-byte characters such as emoji)
#
$dbh->{mysql_enable_utf8} = 1;
$dbh->{mysql_enable_utf8mb4} = 1;

#-------------------------------------------------------------------------------
# Collect the necessary information for constructing the metadata for each of
# the selected episodes.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Each key of the %meta hash will contain an arrayref with an element per
# episode in the @range array. Some of these elements will be objects
# (arrayrefs or hashrefs) containing multi-part items such as 'subject'
# strings from the 'tags' field in the database. The %meta keys correspond to
# the strings of the @fields array which correspond to IA metadata fields and
# are also used by the %dispatch hash to contain actions to perform to make
# the contents. A few of the %meta keys correspond to other items relating to
# the episode such as supplementary audio and supplementary files like
# pictures.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: See the dump of the %meta hash in the Journal that goes with this
# project (Example 1). It should give a better representation of this
# structure.
#-------------------------------------------------------------------------------
# Prepare the per-episode query once; executed with each episode number in
# the main loop below
$sth1 = $dbh->prepare($sql);
|  | 
 | ||
|  | $incomplete = 0; | ||
|  | foreach my $episode (@range) { | ||
|  |     # | ||
|  |     # Get the episode from the database | ||
|  |     # | ||
|  |     $sth1->execute($episode); | ||
|  |     if ( $dbh->err ) { | ||
|  |         croak $dbh->errstr; | ||
|  |     } | ||
|  | 
 | ||
|  |     $h1 = $sth1->fetchrow_hashref; | ||
|  |     unless ($h1) { | ||
|  |         # | ||
|  |         # Somehow this episode doesn't exist in the database. No idea why, but | ||
|  |         # skip this one and keep going anyway. | ||
|  |         # | ||
|  |         carp "Failed to find requested episode number $episode\n"; | ||
|  |         next; | ||
|  |     } | ||
|  | 
 | ||
|  |     print STDERR "D> ", $h1->{title}, "\n"; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Make the episode name with leading zeroes | ||
|  |     # | ||
|  |     $ep_name = sprintf( 'hpr%04d', $episode ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Check whether this episode has a summary and tags. Count up the number | ||
|  |     # that are missing either of these. | ||
|  |     # TODO: Time to remove this check? | ||
|  |     # | ||
|  |     if ( length( $h1->{summary} ) == 0 || length( $h1->{tags} ) == 0 ) { | ||
|  |         print STDERR "Episode $episode is missing summary and/or tags\n" | ||
|  |             unless ($silent); | ||
|  |         $incomplete++; | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Handle supplementary audio | ||
|  |     # | ||
|  |     unless ( defined( $meta{audio} ) ) { | ||
|  |         $meta{audio} = []; | ||
|  |     } | ||
|  | 
 | ||
|  |     my @audio; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Look for other audio files for upload. | ||
|  |     # TODO: Consider catering for hpr\d{4}(\.\d)?\.(flac|mp3|ogg|opus|spx) | ||
|  |     # e.g. hpr2781.1.ogg which could happen if the audio has had to be | ||
|  |     # re-released. | ||
|  |     # | ||
|  |     foreach my $ext ( @additional_audio ) { | ||
|  |         my $audio | ||
|  |             = sprintf( "%s/hpr%04d.%s", $config{uploads}, $episode, $ext ); | ||
|  |         if ( -e $audio ) { | ||
|  |             push( @audio, $audio ); | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Look for the source file if provided. It will be called | ||
|  |     # 'hpr9999_source.{wav,flac,mp3,ogg,opus}' (though it's doubtful if people | ||
|  |     # are sending in opus). For safety we'll accept any extension. | ||
|  |     # | ||
|  |     @source = File::Find::Rule->file() | ||
|  |         ->name("hpr${episode}_source.*") | ||
|  |         ->in($filepath); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Generate the upload command(s) for the source file(s). We upload to the | ||
|  |     # main directory for the item and request that no 'derive' process be run | ||
|  |     # on the file(s). If we're just generating metadata it's not really | ||
|  |     # necessary to do this. Rather than having a different format here we now | ||
|  |     # use a generic format from the configuration file but pass in a null | ||
|  |     # remote filename. We also add default options, also from the | ||
|  |     # configuration file in all cases so we don't derive anything and don't | ||
|  |     # generate an archive of old stuff. | ||
|  |     # | ||
|  |     # TODO: What if the source file(s) had been omitted in the first pass, and | ||
|  |     # we need to upload them now? Could just do it by hand of course. | ||
|  |     # | ||
|  |     unless ($meta_only) { | ||
|  |         for my $sou (@source) { | ||
|  |             printf $script $iauploadtemplate, | ||
|  |                 'hpr' . $episode, | ||
|  |                 $sou, | ||
|  |                 '', | ||
|  |                 $iauploadoptions; | ||
|  |             $script_lines++; | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Turn all source audio files into URLs pointing to the IA item | ||
|  |     # | ||
|  |     for (my $i = 0; $i <= $#source; $i++) { | ||
|  |         ( my $bn = $source[$i] ) =~ s|.*/||mx; | ||
|  |         $source[$i] = sprintf( $config{IAURLtemplate}, "hpr$episode", $bn ); | ||
|  |     } | ||
|  | 
 | ||
|  |     #push( @audio, @source ) if (@source); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Save whatever was found, whether an arrayref of audio or undef | ||
|  |     # | ||
|  |     push( @{ $meta{audio} }, \@audio ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Most shows have transcripts since Q2 2023. They aren't referred to in | ||
|  |     # the show notes or database (not at present anyway). They get added to | ||
|  |     # the static page on the HPR site with a footer template on every page. | ||
|  |     # The files themselves arrive in a sub-directory of $uploads (usually | ||
|  |     # `/data/IA/uploads/hpr1234/`) as part of the show components. For now | ||
|  |     # we'll search for them and upload them using the Bash script we're | ||
|  |     # building which is run after the initial 'ia upload'. | ||
|  |     # | ||
|  |     unless ($meta_only) { | ||
|  |         @transcripts = File::Find::Rule->file() | ||
|  |             ->name($transcript_re) | ||
|  |             ->in("$filepath/$ep_name"); | ||
|  | 
 | ||
|  |         # | ||
|  |         # Write 'Upload' function calls to the script for all of the transcripts | ||
|  |         # | ||
|  |         for my $ts (@transcripts) { | ||
|  |             ( my $bn = $ts ) =~ s|.*/||mx; | ||
|  |             printf $script $iauploadtemplate, $ep_name,    # identifier | ||
|  |                 $ts,                                       # local file path | ||
|  |                 "$ep_name/$bn",                            # Remote file path | ||
|  |                 $iauploadoptions; | ||
|  |             $script_lines++; | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Initialise the hash element for holding links to files in the notes per | ||
|  |     # show | ||
|  |     # | ||
|  |     unless ( defined( $meta{links} ) ) { | ||
|  |         $meta{links} = []; | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Scan the database-stored notes looking for links to files on the HPR | ||
|  |     # server. For any HTML files found in that pass, download them and perform | ||
|  |     # another scan. If further HTML files are found download and scan them and | ||
|  |     # so on recursively. Any non-HTML files are left for downloading later. | ||
|  |     # | ||
|  |     #<<<                                 (Stop perltidy reformatting oddly) | ||
|  |     if (find_links_in_notes($episode, $h1->{notes}, \$tree, \%links, \%config, | ||
|  |          $verbose, $silent) > 0) { | ||
|  |     #>>> | ||
|  |         # | ||
|  |         # Save the links we found. Do this by copying the %links hash into the | ||
|  |         # %meta hash | ||
|  |         # | ||
|  |         push( @{ $meta{links} }, {%links} ); | ||
|  |     } | ||
|  |     else { | ||
|  |         push( @{ $meta{links} }, undef ); | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Build the metadata hash using the dispatch table. For each field we will | ||
|  |     # have an array of data. This may be a scalar for single-valued items or | ||
|  |     # an array for multi-valued ones. | ||
|  |     # | ||
|  |     foreach my $fld (@fields) { | ||
|  |         unless ( defined( $meta{$fld} ) ) { | ||
|  |             $meta{$fld} = []; | ||
|  |         } | ||
|  | 
 | ||
|  |         # | ||
|  |         # If it's a code reference then call the code and save the result | ||
|  |         # otherwise save the data | ||
|  |         # | ||
|  |         if ( ref( $dispatch{$fld} ) eq 'CODE' ) { | ||
|  |             push( @{ $meta{$fld} }, $dispatch{$fld}->() ); | ||
|  |         } | ||
|  |         else { | ||
|  |             push( @{ $meta{$fld} }, $dispatch{$fld} ); | ||
|  |         } | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
_debug( $DEBUG >= 3, Dumper(\%meta) );

die "Nothing to do\n" unless %meta;

#
# If any summaries and/or tags were missing, abort unless the -ignore option
# was given, in which case just report (unless silent).
# (Rewritten from 'unless/else' to a positive 'if/else' for readability.)
# TODO: 2023-07-06 Not needed any more?
#
if ( $incomplete > 0 ) {
    if ($ignore) {
        say "Missing summaries and/or tags - ignored" unless ($silent);
    }
    else {
        die "Aborted due to missing summaries and/or tags\n";
    }
}

#
# ~~ Explanation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Considering '%meta' as a "sideways spreadsheet" where the keys are row
# headings and the elements within the arrayrefs held as the hash values, the
# "cells" (array elements) may be arrayrefs with multiple elements (i.e. the
# spreadsheet is 3-dimensional!). We need to know the maximum number of array
# elements per cell because we have to expand the resulting CSV "spreadsheet"
# we'll generate for the 'ia upload' command by providing headers like
# "subject[1]" and "subject[2]". Hard to explain without a diagram!
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Compute the maximum number of sub-fields when a field can be multi-valued
# (a scalar cell counts as 1). All CSV rows must have the same number of
# fields - the maximum, of course.
#
@counts = map {
    max( map { ref($_) eq 'ARRAY' ? scalar(@$_) : 1 } @{ $meta{$_} } )
} @fields;
|  | 
 | ||
#-------------------------------------------------------------------------------
# Report on the collected data if requested.
# See 'Example 1' in the Journal for this project for what the '%meta' hash
# looks like at this stage.
#-------------------------------------------------------------------------------
if ($verbose) {
    foreach my $i ( 0 .. $#range ) {
        my $ind = 0;
        foreach my $fld (@fields) {
            #
            # Render the cell for this field/episode: empty string when
            # undefined, comma-joined list when it's an arrayref, otherwise
            # the scalar value itself.
            #
            my $cell = $meta{$fld}->[$i];
            my $shown;
            if ( !defined($cell) ) {
                $shown = '';
            }
            elsif ( ref($cell) eq 'ARRAY' ) {
                $shown = join( ",", @$cell );
            }
            else {
                $shown = $cell;
            }
            printf "%20s[%d]: %s\n", $fld, $counts[$ind], $shown;
            $ind++;
        }

        #
        # Each show will (most likely) have an array of audio types in an
        # array of arrays; report how many were added (0 when undefined).
        #
        my $audio_total
            = defined( $meta{audio}->[$i] )
            ? scalar( @{ $meta{audio}->[$i] } )
            : 0;
        printf "%23s: %s\n", 'added audio', $audio_total;

        #
        # Each show may have links, stored as an array of hashes (one
        # sub-hash per "asset"). An undef entry means no assets.
        #
        my $link_total
            = defined( $meta{links}->[$i] )
            ? scalar( keys( %{ $meta{links}->[$i] } ) )
            : 0;
        printf "%23s: %s\n", 'links', $link_total;

        print '-' x 80, "\n";
    }
}
|  | 
 | ||
#-------------------------------------------------------------------------------
# Output asset counts if requested.
# Writes one "identifier count" line per episode that has links/assets.
# TODO: Used in 'past_upload', which is no longer used (since that project is
# complete).
#-------------------------------------------------------------------------------
if ($acountfile) {
    my $acount_lines = 0;
    open( my $acount, '>:encoding(UTF-8)', $acountfile )
        or die "Unable to open $acountfile for output: $!\n";

    foreach my $i ( 0 .. $#range ) {
        if ( defined( $meta{links}->[$i] ) ) {
            printf $acount "%s %d\n", $meta{identifier}->[$i],
                scalar( keys( %{ $meta{links}->[$i] } ) );
            $acount_lines++;
        }
    }

    #
    # Check the close: buffered write errors on an output handle only
    # surface here.
    #
    close($acount)
        or die "Unable to close $acountfile after writing: $!\n";

    #
    # If we never wrote anything to the asset count file then delete it.
    # (Numeric comparison: '==' rather than the string operator 'eq'.)
    #
    if ( $acount_lines == 0 ) {
        print "Deleting empty '$acountfile'\n" if $verbose;
        unlink($acountfile);
    }

}
|  | 
 | ||
#-------------------------------------------------------------------------------
# Download any linked files unless the -noassets option was used.
# Report the details in verbose mode. Only download if the file is not already
# downloaded.
# NOTE: HTML downloads will have already happened since we need to recursively
# parse and modify them. So if an asset is HTML on the HPR site an earlier
# stage will have downloaded it so that it can be parsed for any further HTML
# or assets.
#-------------------------------------------------------------------------------
if ($assets) {
    foreach my $i ( 0 .. $#range ) {
        my $linkset = $meta{links}->[$i];
        next unless defined($linkset);

        #
        # Each key is the asset's URL; the sub-hash holds the local cache
        # path ('cached') and the rewritten URL ('new').
        #
        foreach my $url ( keys(%$linkset) ) {
            my $cached = $linkset->{$url}->{cached};
            unless ( -e $cached ) {
                download_url( $url, $cached, $linkset->{$url}->{new},
                    $verbose, $silent );
            }
        }
    }
}
|  | 
 | ||
#-------------------------------------------------------------------------------
# Generate the completed CSV
#-------------------------------------------------------------------------------
#
# First build and output the header fields. A single-valued field contributes
# its bare name; a multi-valued field contributes one indexed header per slot
# (like "subject[0]", "subject[1]", ...), up to the maximum found earlier.
#
for my $ind ( 0 .. $#fields ) {
    if ( $counts[$ind] == 1 ) {
        push( @head, $fields[$ind] );
    }
    else {
        push( @head, map { $fields[$ind] . "[$_]" } 0 .. $counts[$ind] - 1 );
    }
}

$csv->print( $out, \@head );
|  | 
 | ||
#
# Build and output the data rows. The structure we have is a hash (%meta)
# where each element is an array with one entry per episode (0 .. $#range).
# The per-episode entries may be scalars (single-value fields) or arrayrefs
# (multi-value). In the latter case we must pad to the full header width:
# e.g. with 10 "subject" headers but only 1 subject for this episode, the
# remaining 9 slots are filled with empty (undef) items.
#
foreach my $i ( 0 .. $#range ) {
    @data = ();
    my $ind = 0;
    foreach my $fld (@fields) {
        # $count tracks how many cells this field contributed, so we know
        # how much padding (if any) is needed below.
        my $count = 0;
        if ( ref( $meta{$fld}->[$i] ) eq 'ARRAY' ) {
            foreach my $elem ( @{ $meta{$fld}->[$i] } ) {
                push( @data, $elem );
                $count++;
            }
        }
        else {
            push( @data, $meta{$fld}->[$i] );
            $count++;
        }

        #
        # Cater for any blank slots (pad with undef up to the per-field
        # maximum computed in @counts)
        #
        if ( $count < $counts[$ind] ) {
            push( @data, undef ) for 1 .. ( $counts[$ind] - $count );
        }

        $ind++;
    }

    #print Dumper( \@data ), "\n";
    $csv->print( $out, \@data );

    #
    # If there are additional audio files or supplementary files for the
    # current show write a CSV row for each of them here. Each such row
    # repeats the identifier in column 1, carries the file path in column 2,
    # and pads the rest of the row to the header width with undef.
    # NOTE: originally the CSV format did not require the second and
    # subsequent row belonging to an IA identifier to have an identifier in it
    # (maybe it was even illegal in the early days). Now the ia tool seems to
    # have made it mandatory - surprisingly.
    #
    if ( defined( $meta{audio}->[$i] ) ) {
        for my $audio (@{$meta{audio}->[$i]}) {
            #
            # Each row is a file path which we'll add to the CSV. We use the
            # length of the @head array to get the number of CSV fields
            #
            @data = ();
#           push( @data, undef, $audio );
            push( @data, $meta{identifier}->[$i], $audio );
            push( @data, undef ) for 1 .. ( scalar(@head) - 2 );
            $csv->print( $out, \@data );
        }
    }

    #
    # The 'links' key holds an array of hashes containing details of the
    # 'assets' relating to a show (files other than audio files, but including
    # the original "source" audio). In make_metadata V0.3.x we used these
    # hashes to add rows to the CSV file which results in the upload of these
    # files.  However, this mechanism does not cater for (a) the uploading of
    # directories to the IA, and (b) the ability to disable the "derive"
    # mechanism for a given file. Therefore we build a Bash script containing
    # a function that calls 'ia' commands which can be used instead, but only
    # if $meta_only is not set.
    # TODO: 2023-07-06 We want to upload the transcripts created by 'whisper'
    # and placed in the 'uploads' directory. We will add these to the 'script'
    # file.
    #
    if ( ! $meta_only && defined( $meta{links}->[$i] ) ) {
        for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
            #
            # Build a call to the 'Upload' function being explicit about the
            # remote name: the remote path is the cached path with the local
            # uploads directory prefix stripped off.
            #
            ( my $rem = $meta{links}->[$i]->{$key}->{cached} )
                =~ s|$config{uploads}/||;
            printf $script $iauploadtemplate,
                $meta{identifier}->[$i],
                $meta{links}->[$i]->{$key}->{cached},
                $rem,
                $iauploadoptions;
            $script_lines++;
        }
    }
}
|  | 
 | ||
#
# Report the output and script file names (the script file only if anything
# was actually written to it).
#
unless ($silent) {
    print "Output file: $outfile\n";
    print "Script file: $scriptfile\n" unless ( $script_lines == 0 );
}

#
# Check the closes: buffered write errors on output handles only surface at
# close time.
#
close($out)
    or die "Unable to close $outfile after writing: $!\n";
close($script)
    or die "Unable to close $scriptfile after writing: $!\n";

#
# If we never wrote anything to the script file then delete it.
# (Numeric comparison: '==' rather than the string operator 'eq'.)
#
if ($script_lines == 0) {
    print "Deleting empty '$scriptfile'\n" if $verbose;
    unlink($scriptfile);
}

exit;
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: make_item
#      PURPOSE: Build an Archive.org item name for this item
#   PARAMETERS: $h              - hashref returned from the database
#               $testmode       - Boolean denoting test mode
#      RETURNS: The newly fashioned item string
#  DESCRIPTION: Generates the unique Archive.org item identifier for an
#               episode: 'hpr' plus the zero-padded 4-digit episode number,
#               with a 'test_' prefix when running in test mode.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_item {
    my ( $h, $testmode ) = @_;

    my $prefix = $testmode ? 'test_' : '';
    return sprintf( "%shpr%04d", $prefix, $h->{eps_id} );
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: make_filename
#      PURPOSE: Determine the filename path for the metadata
#   PARAMETERS: $path           - relative path to the files
#               $template       - template for forming the file name
#                                 (a sprintf format taking the episode number
#                                 and a file type/extension)
#               $ftypes         - arrayref holding ordered file types
#               $h              - hashref returned from the database
#      RETURNS: The path to the first file found, or undef (with a warning)
#               when none of the candidate types exists
#  DESCRIPTION: Forms a candidate file path for each file type in $ftypes, in
#               order of preference (WAV first by convention), and returns the
#               first one that exists on disc. Previously only the first two
#               entries of $ftypes were ever consulted; the loop now honours
#               the whole list, which is backward compatible with all existing
#               two-element calls.
#               TODO The design of this code is not good. The earlier fetch
#               stage will have determined what is available. We are
#               potentially repeating these checks here, though they are
#               necessary if -nofetch was specified. Needs some further
#               thought.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_filename {
    my ( $path, $template, $ftypes, $h ) = @_;

    #
    # Try each file type in preference order and return the first that
    # exists.
    #
    foreach my $ftype (@$ftypes) {
        my $file = sprintf( "%s/$template", $path, $h->{eps_id}, $ftype );
        return $file if -e $file;
    }

    #
    # Nothing matched; warn and return undef to the caller
    #
    warn "No file found for hpr" . $h->{eps_id} . "\n";
    return;
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: db_title
#      PURPOSE: Return a title for the Archive.org item
#   PARAMETERS: $h      - hashref returned from the database
#      RETURNS: The title
#  DESCRIPTION: Builds the item title as the episode identifier ('hpr' plus
#               the zero-padded episode number), a ' :: ' separator, and the
#               episode title from the database.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_title {
    my ($h) = @_;

    my $label = sprintf( "hpr%04d", $h->{eps_id} );
    return "$label :: " . $h->{title};
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: db_notes
#      PURPOSE: Return the description for an episode
#   PARAMETERS: $sourceURLtpl   - printf template for making the HPR link to
#                                 the original show (from config file)
#               $h              - hashref returned from the database
#               $tree           - parsed notes (HTML::TreeBuilder tree)
#               $asource        - arrayref containing source audio file URLs
#      RETURNS: The description built from the database for Archive.org
#  DESCRIPTION: This routine generates the item description for Archive.org as
#               HTML by concatenating the summary, series name (if there is
#               one), link to the episode on the HPR site, links to the
#               original ("source") audio, and the notes into a long string.
#               The header portion is built with newlines first (so the lines
#               can be counted), then the newlines are converted to <br /> and
#               the header is padded to a fixed height. The main work is in
#               the formatting of the notes element, which is stored as HTML
#               in the database. Since we are making a single string from the
#               multi-line data we have to take special action to preserve the
#               contents of <pre> tags. See the comments in 'flatten_pre' for
#               what is being done to achieve this.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_notes {
    my ( $sourceURLtpl, $h, $tree, $asource ) = @_;

    my ( $lines, $episode, $desc, $sourceurl, $notes );

    #
    # Ensure leading zeroes on the episode number and prepend 'hpr'
    #
    $episode = sprintf('hpr%04d', $h->{eps_id});

    #
    # Link back to the show page on the HPR site (template from config;
    # the old hard-coded form is kept below for reference)
    #
#    $sourceurl = sprintf( "http://hackerpublicradio.org/eps.php?id=%04d",
#        $h->{eps_id} );
    $sourceurl = sprintf( $sourceURLtpl, $episode );

    #
    # Make the header with newlines so we can count them.
    # TODO: Should we use the series number 0 which denotes "no series" rather
    # than the name "general"? Need to adjust the query to return this if so.
    # FIXME: The "not series" 'general' is being removed and the logic will
    # use NULL for an episode that is not in a series. The local copy of the
    # database has had this change made, but not the live one. We have to
    # cater for both (hence the check for undef OR 'general' below). When it
    # goes we can drop references to 'general'.
    #
    $desc = sprintf( "Summary: %s\n", $h->{summary} );
    unless ( !defined( $h->{s_name} ) or $h->{s_name} eq 'general' ) {
        $desc .= sprintf( "Series: %s\n", coalesce( $h->{s_name}, '' ) );
    }
    #$desc .= sprintf( "Duration: %s\n", $h->{duration} );
    $desc .= sprintf( "Source: <a href=\"%s\">%s</a>\n", $sourceurl, $sourceurl );
    #
    # Add a pointer to the source audio if there is any (cater for multiple,
    # but it's not likely we'll get more than one)
    #
    if (@{$asource}) {
        foreach my $aurl (@{$asource}) {
            $desc .= sprintf( "Original audio: <a href=\"%s\">%s</a>\n",
                $aurl, $aurl );
        }
    }

    #
    # Count the lines so we can add <br/> tags at the end to make the header
    # a standard size to accommodate the media widget. (If the header is
    # already 6 or more lines, the 'x' repeat count is <= 0 and adds nothing.)
    # TODO: Rationalise this since there is no media widget any more.
    # Temporarily boosted the height to 6 lines now the Original audio has
    # been added.
    #
    $lines = $desc =~ tr/\n//;
    $desc =~ s#\n#<br />#g;
    $desc .= ( "<br />" x ( 6 - $lines ) );

    #
    # Prepare the notes and add to the header. Actions are:
    # 1. Make all <pre> nodes compatible by adding <br/>
    # 2. Generate entities for all non-ASCII
    # 3. Do the silly thing of double-encoding entities to counteract the IA
    #    bug
    # 4. Remove all newlines ('ia' doesn't seem to know what to do with them)
    # 5. Trim trailing spaces
    #
    # ----
    # NOTE: Removed the double-encoding on 2017-10-07 since (unannounced) it
    # looks as if the IA have fixed this bug. There is some debate over
    # whether we should be encoding non-ASCII also.
    # ----
    # NOTE: 2017-11-25 restored the double-encoding since the bug bit again at
    # the last upload. This is not the whole story though. Added a step to
    # decode the notes before re-encoding the non-ASCII bits. This is to
    # counteract what seems to be being done by Pandoc if we process the notes
    # from plain text.
    # ----
    # NOTE: 2021-02-20 found a problem with the strategy of decoding
    # everything and then encoding it again (selectively). The issue was seen
    # in HPR show 3284 where the HTML notes contains a <pre> section with C++
    # code in it which uses characters like '<' and '>'. These were encoded as
    # HTML entities at shownote preparation time since they were flagged as
    # bad HTML by my validator script. The original algorithm then decoded
    # these characters but couldn't encode them with encode_entities because
    # that would have encoded all the HTML tags. The decode/encode stuff is
    # therefore HTML context dependent, and we don't have the means of
    # handling this with the current script. So, the solution seems to be not
    # to decode and encode at all. After running flatten_pre, simply run the
    # re_encode_entities and trust that there's nothing in the notes that need
    # special action here. Anyway, the preparation stage should have handled
    # whatever's necessary. (The retired steps are kept below, commented out.)
    # ----
    #
    #$notes = flatten_pre( $h->{notes} );
    $notes = flatten_pre($tree);
    ## $notes = decode_entities($notes);
    ## $notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
    ## $notes = re_encode_entities($notes);
    $notes =~ s/\n//g;

    $desc .= $notes;
    $desc =~ s/\s*$//;

    return $desc;
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: db_tags
#      PURPOSE: Return the episode search tags as an array for populating
#               'subject' fields on Archive.org
#   PARAMETERS: $h      - hashref returned from the database
#      RETURNS: A reference to an array of tags or undef if there are none
#  DESCRIPTION: In the database the tags are held as a comma-delimited string.
#               We need to turn this into an array to populate 'subject'
#               fields in the CSV file. If there are no tags the caller needs
#               an undef to be returned to indicate that a blank field needs
#               to be constructed in the CSV. The format of the database
#               string may be messy since we rely on the show's host to submit
#               this information, so whitespace around the commas is absorbed
#               by the split pattern.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_tags {
    my ($h) = @_;

    my $tag_string = $h->{tags};
    if ($tag_string) {
        my @tags = split( /\s*,\s*/, $tag_string );
        return \@tags;
    }

    # Deliberately 'return undef' (not bare 'return'): the caller pushes the
    # result in list context and relies on getting a one-element placeholder.
    return undef;
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: flatten_pre
#      PURPOSE: Process notes "flattening" <pre> contents
#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
#                       partially processed notes
#      RETURNS: Processed notes
#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
#               contain some formatting tags like <em> and <code>, but spaces
#               and newlines are significant. The Internet Archive upload API
#               uses HTTP headers which are text strings without newlines, so
#               when these tags are uploaded through this route some
#               formatting is lost. What this routine does is rewrite the
#               contents of every <pre> section in the tree, adding <br/>
#               tags to replace newlines. It has to walk the full parsed
#               structure since the contents may include HTML tags which need
#               to pass through intact; the recursive descent is delegated to
#               'flatten_item'. Finally the <body> wrapper added when the
#               notes were parsed is stripped from the serialised result.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_pre {
    my ($tree) = @_;

    #
    # Walk every <pre> element in the document
    #
    foreach my $pre ( $tree->look_down( _tag => 'pre', ) ) {
        #
        # Keep a copy of the tag's content, then empty the original so it
        # can be rebuilt in place
        #
        my $copy = $pre->clone();
        $pre->delete_content();

        #
        # Rebuild each child via flatten_item, which returns the nested
        # arrayref structure permitted by HTML::Element (see the
        # documentation for 'new_from_lol' in HTML::Element). We accumulate
        # into a flat array because passing the top level as an arrayref
        # would be interpreted as a spurious <null> tag.
        #
        my @rebuilt;
        foreach my $child ( @{ $copy->content_array_ref } ) {
            push( @rebuilt, flatten_item($child) );
        }

        $pre->push_content(@rebuilt);
    }

    #
    # Serialise and trim off the enclosing <body> tags we added earlier
    #
    my $body = $tree->look_down( _tag => 'body' );
    ( my $result = $body->as_HTML( undef, ' ', {} ) )
        =~ s{(^<body[^>]*>|</body>$)}{}gi;

    return $result;
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: flatten_item
#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
#   PARAMETERS: $item   an HTML::Element item parsed from the original
#                       <pre> section, or plain text content
#      RETURNS: An arrayref if the item was a tag, otherwise a list of text
#               fragments interleaved with ['br'] markers
#  DESCRIPTION: Since <pre> sections can contain inline elements which change
#               the rendering of the text we need to parse these as we add
#               <br/> tags. This routine recursively descends through the
#               contents (a common sequence is <pre><code> for scripts and
#               the like), building the result in the nested-arrayref form
#               accepted by HTML::Element. Text content has carriage returns
#               removed and is split on newlines, with a ['br'] marker
#               between successive lines.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_item {
    my ($item) = @_;

    return unless defined($item);

    #
    # Non-tag (text) content: join the lines with <br/> markers and return
    # them as a plain list.
    #
    unless ( ref($item) ) {
        ( my $text = $item ) =~ s/\r//g;

        #
        # LIMIT of -1 makes split keep trailing empty fields, which are
        # dropped by default.
        #
        my @lines = split( /\n/, $text, -1 );
        return () unless @lines;

        #
        # Remove a leading blank line - usually the result of
        # a "<pre>'NL'text" sequence
        #
        shift(@lines) if ( $lines[0] =~ /^\s*$/ );

        #
        # Interleave ['br'] markers between the lines, then drop the
        # spurious one after the final line
        #
        my @flat = map { ( $_, ['br'] ) } @lines;
        pop(@flat);

        return @flat;
    }

    #
    # It's a tag: rebuild it as [ tagname, {attributes}, children... ] and
    # recurse into its content. Return an arrayref.
    #
    my @node = ( $item->tag() );
    my %attr = $item->all_external_attr();
    push( @node, \%attr ) if %attr;
    foreach my $child ( $item->content_list() ) {
        push( @node, flatten_item($child) );
    }
    return \@node;
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: re_encode_entities
#      PURPOSE: Find all encoded entities and encode them all over again
#   PARAMETERS: $notes  - string containing unprocessed notes
#      RETURNS: Processed notes
#  DESCRIPTION: Uses a brutal regular expression substitution approach, but
#               since this is a very unusual requirement brought about by what
#               is essentially a bug in the way the Internet Archive stores
#               and processes metadata, we have no choice.
#       THROWS: No exceptions
#     COMMENTS: It looks as if the software on archive.org saves HTML metadata
#               in an XML file by encoding it as entities. So HTML tags are
#               turned into &lt;TAG&gt; sequences. Then when HTML is required
#               the XML is decoded back to <TAG>. Unfortunately any existing
#               entities in the HTML like '&lt;' will also be decoded in this
#               phase which may result in invalid HTML sequences. In this
#               routine we are converting existing entities so that the
#               decoding phase turns them into valid entities. The archive.org
#               software should be doing this, but it isn't, and any messages
#               sent to the staff there are ignored. Of course, there may be
#               a point at which this bug is corrected and the double encoding
#               process here becomes redundant. Then the call to this routine
#               can be omitted.
#               Note also that the IA editor corrupts HTML containing entities
#               so should not be used. To make a change, edit the notes on the
#               HPR database and generate new metadata with this script. Use
#               the -meta_only option to avoid the need for the media (unless
#               it too is being updated of course) and re-submit the CSV file
#               to ias3upload.pl or ia.
#     SEE ALSO: N/A
#===============================================================================
sub re_encode_entities {
    my ($notes) = @_;

    #
    # Replace the leading '&' of any '&xxx;' sequence with '&amp;' so that
    # one round of entity decoding yields the original entity again
    # ('&amp;lt;' decodes to '&lt;' rather than to a bare '<').
    #
    $notes =~ s#\&([^;]+;)#&amp;$1#g;
    return $notes;
}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: find_links_in_notes
#      PURPOSE: Finds HPR links in show notes
#   PARAMETERS: $episode        episode number we're dealing with
#               $notes          a string containing the shownotes
#               $rtree          reference to an HTML::TreeBuilder object built
#                               from the notes (written through; shared with
#                               the caller)
#               $rlinks         hashref containing the links found (cleared on
#                               entry, filled in here)
#               $rconfig        hashref containing config data (uses keys
#                               'baseURL', 'uploads' and 'IAURLtemplate')
#               $verbose        setting controlling what reports are generated
#               $silent         setting controlling what reports are generated
#      RETURNS: Count of the number of links
#  DESCRIPTION: Parses the notes passed as an argument then scans these notes
#               looking for links which relate to items on the HPR server.
#               Normally these are files and images but could be anchor
#               references as well. If the latter then the URL has to be
#               parsed to remove the anchor. Unique links are saved in a hash
#               as the path to the file that will be saved on the VPS and as
#               the link to the item in the new notes. This hash will be
#               passed back to the caller so it can get the file(s) and
#               prepare them for upload to the IA. Finally, the link in the
#               notes is modified to refer to the file that will be uploaded
#               to the IA. The function returns the number of links it has
#               recorded (not the number it has changed), as well as passing
#               back the parsed tree and the link hash it has constructed.
#       THROWS: No exceptions
#     COMMENTS: Closely parallels find_links_in_file, which handles the same
#               link rewriting for downloaded supplementary HTML files.
#     SEE ALSO: N/A
#===============================================================================
sub find_links_in_notes {
    my ( $episode, $notes, $rtree, $rlinks, $rconfig, $verbose, $silent )
        = @_;

    my ( $linkre,  $epstr,    $uri,     $slink );
    my ( $oldfile, $newfile, $newURL );

    # NOTE(review): _debug() and $DEBUG are file-level helpers/globals
    # defined elsewhere in this script
    _debug( $DEBUG >= 3, "Entered find_links_in_notes\n" );

    #
    # Initialise links (empties the caller's hash before scanning)
    #
    undef %$rlinks; # TODO: Consider whether we should do this?

    #
    # Create an object that can be shared downstream
    #
    $$rtree = HTML::TreeBuilder->new;
    $$rtree->ignore_unknown(0);
    $$rtree->no_expand_entities(1);
    $$rtree->p_strict(1);
    $$rtree->store_comments(1);
    $$rtree->warn(1);

    #
    # Parse the notes. Die if we fail because then we know this show needs
    # some urgent attention.
    #
    $$rtree->parse_content($notes)
        or die "HTML::TreeBuilder failed to parse notes: $!\n";

    #
    # NOTE: No longer filtering out links not relating to the episode.
    # Experience with 'upload_manager' has shown that there are some weird
    # links in the HPR database which will cause problems.
    #
    # Regular expression to match links to the HPR server. Examples:
    #   http://hackerpublicradio.org/eps/hpr2153.png
    #   http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
    # Also things like this (**Why Ken?**)
    #   ../eps/hpr2945/IMG_20191018_122746Z.jpg
    # Don't match things like when *not* processing 1986:
    #   http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
    #
    $epstr = sprintf( "hpr%04d", $episode );
#   my $re
#       = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www.)?
        (?:hacker|hobby)publicradio.org/eps/
        (.+)$
    }x;

    #
    # Scan the HTML tree for links we care about
    #
    for ( @{ $$rtree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Standardise the link (expands relative URLs, removes any fragment).
        # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
        # URIs are removed.
        #
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        # Scheme plus the scheme-specific ('opaque') part: rebuilds the URL
        # without any '#fragment' suffix
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink, \n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match ($+), without any 'fragment' if
            # there is one (we want this not to be URL-related)
            # NOTE: Will we ever have a fragment here?
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it.
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink    .= 'index.html';
                $oldfile .= 'index.html';
            }

            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL  = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink}           = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new}    = $newURL;

            }

            #
            # Simply change the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well.
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
        }
    }

    #
    # If we found any links the notes will have been changed, but since the
    # tree is shared with the caller this will result in changes to the
    # metadata. However, we want to look at any HTML files in those links
    #
    if ( scalar( keys(%$rlinks) ) > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents. This updates $rlinks
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( ! -e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose,  $silent
                );
            }
        }
    }

    #
    # Return the link count after all recursion through supplementary files
    # and so on
    #
    return scalar( keys(%$rlinks) );

}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: find_links_in_file
#      PURPOSE: Finds HPR links in related files
#   PARAMETERS: $episode        episode number we're dealing with
#               $filename       file being examined
#               $rlinks         hashref containing the links found (shared
#                               with find_links_in_notes; new links added)
#               $rconfig        hashref containing config data (uses keys
#                               'baseURL', 'uploads' and 'IAURLtemplate')
#               $verbose        setting controlling what reports are generated
#               $silent         setting controlling what reports are generated
#      RETURNS: The number of new links found and downloaded
#  DESCRIPTION: The routine is related to 'find_links_in_notes' but doesn't work
#               quite the same way. It is given the name of an HTML file which
#               has already been downloaded. It parses this with
#               HTML::TreeBuilder because it needs to be scanned and possibly
#               edited. The HTML
#               tree is scanned for <a> and <img> tags. If the URL returned is
#               an HPR URL then we will want to change it to an IA one so the
#               show can reference its components on archive.org and be
#               independent of HPR. If a link is found which hasn't been seen
#               before it is saved in the $rlinks hash keyed by the original
#               URL. The link URL is replaced by the IA version in the HTML
#               tree. Once the scan is complete if URL edits have been counted
#               the file is rewritten from the modified tree. Then if any new
#               links have been found these are checked, and if any are HTML
#               they are downloaded ready to be parsed. Parsing is done by
#               recursively calling this routine. Other links, which are not
#               HTML exist in the $rlinks hash and are downloaded later in the
#               script since they do not need to be edited.
#               NOTE: As of 2017-12-10 this code has been used in a live
#               situation but the recursive capabilities have not been
#               invoked. This is because we have shows with the usual short
#               notes which reference longer notes, and sometimes the latter
#               have links to other files. However, so far none of these cases
#               have referenced further HTML files needing scanning and
#               editing.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub find_links_in_file {
    my ( $episode, $filename, $rlinks, $rconfig, $verbose, $silent ) = @_;

    my ( $linkre, $epstr, $tree, $uri, $slink );
    my ( $oldfile, $newfile, $newURL );
    # NOTE(review): $encoding appears to be unused in this routine
    my ( $encoding, $linkcount, $linkedits );

    # NOTE(review): _debug() and $DEBUG are file-level helpers/globals
    # defined elsewhere in this script
    _debug( $DEBUG >= 3, "Entered find_links_in_file\n" );

    #
    # Create a tree object
    #
    $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->no_expand_entities(1);
    $tree->p_strict(1);
    $tree->store_comments(1);
    $tree->warn(1);

    #
    # Parse the file using IO::HTML to grab it. Die if we fail because then we
    # know this stuff needs some urgent attention.
    #
    $tree->parse_file( html_file($filename) )
        or die "HTML::TreeBuilder failed to process $filename: $!\n";

    #
    # NOTE: No longer filtering out links not relating to the episode.
    # Experience with 'upload_manager' has shown that there are some weird
    # links in the HPR database which will cause problems.
    #
    # Regular expression to match links to the HPR server. Examples:
    #   http://hackerpublicradio.org/eps/hpr2153.png
    #   http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
    # Also things like this (**Why Ken?**)
    #   ../eps/hpr2945/IMG_20191018_122746Z.jpg
    # Don't match things like when *not* processing 1986:
    #   http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
    #
    $epstr = sprintf( "hpr%04d", $episode );
#   my $re
#       = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www.)?
        (?:hacker|hobby)publicradio.org/eps/
        (.+)$
    }x;

    #
    # Counting new links found and stashed as well as edits made
    #
    $linkcount = 0;
    $linkedits = 0;

    #
    # Scan the HTML tree for links we care about
    #
    for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Standardise the link (expands relative URLs, removes any fragment)
        # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
        # URIs are removed.
        #
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        # Scheme plus the scheme-specific ('opaque') part: rebuilds the URL
        # without any '#fragment' suffix
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink, \n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match ($+), without any 'fragment' if
            # there is one (we want this not to be URL-related)
            # NOTE: Will we ever have a fragment here?
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it.
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink    .= 'index.html';
                $oldfile .= 'index.html';
            }

            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL  = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink}           = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new}    = $newURL;

                $linkcount++;

            }

            #
            # Simply change the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well.
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
            $linkedits++;
        }
    }

    #
    # If we found HPR links then we'll have changed them in the tree, so we
    # need to update the file. We use 'as_HTML' with no entities, indentation
    # by one space and no optional end tags (see HTML::Element). If the file
    # gets examined after the edits there should be no eligible HPR links, so
    # no action.
    #
    if ( $linkedits > 0 ) {
        open( my $out, ">:encoding(UTF-8)", $filename )
            or die "Unable to open $filename for writing: $!\n";
        print $out $tree->as_HTML( undef, ' ', {} );
        close($out);
        print STDERR "Altered links in $filename\n" unless $silent;
    }

    #
    # If we found new links then $linkcount will be non-zero and we need to
    # download and parse them, otherwise nothing else to do.
    #
    if ( $linkcount > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( !-e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose,  $silent
                );
            }
        }
    }

    #
    # Return the link count
    #
    return $linkcount;

}
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: download_url
#      PURPOSE: Download a file from an URL
#   PARAMETERS: $from           the URL the file is to be collected from
#               $to             the path to which the file is to be written
#               $upload         the eventual IA URL (for information)
#               $verbose        setting controlling what reports are generated
#               $silent         setting controlling what reports are generated
#      RETURNS: The status value from the download
#  DESCRIPTION: Downloads $from into the cache path $to, creating the target
#               directory first if it doesn't already exist. The $upload
#               value is purely informational - it is the URL the file will
#               have on the IA once uploaded. A brief progress message goes
#               to STDERR unless -silent; the full transfer plan is printed
#               when -verbose. A warning is issued if the HTTP status from
#               the download is not 200.
#               Moved from LWP::Simple to LWP::UserAgent (via the local
#               'getstore' routine) to get more control over the download.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: getstore
#===============================================================================
sub download_url {
    my ( $from, $to, $upload, $verbose, $silent ) = @_;

    #
    # In verbose mode report the whole transfer plan
    #
    if ($verbose) {
        print "Link to be downloaded: $from\n",
            "  to ", $to, "\n",
            "  and uploaded as ", $upload, "\n";
    }

    #
    # Make sure the directory part of the target path exists, creating it
    # when necessary (the substitution strips the trailing file name)
    #
    my $dirname = $to;
    $dirname =~ s|/?[^/]*$||mx;
    make_path($dirname) unless -d $dirname;

    #
    # Fetch the URL into the cache file and warn on any non-200 result
    #
    print STDERR "Downloading $from\n" unless $silent;
    my $status = getstore( $from, $to, $verbose );
    warn "Download failed: $status : $from ($to)\n" unless $status == 200;

    print '~' x 80, "\n" if $verbose;

    return $status;
}
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: getstore | ||
|  | #      PURPOSE: Get an URL and store the result in a file | ||
|  | #   PARAMETERS: $from           the URL to download | ||
|  | #               $to             where to put the result | ||
|  | #               $verbose        setting controlling what reports are generated | ||
|  | #      RETURNS: The status code from the 'get' | ||
|  | #  DESCRIPTION: When using LWP::Simple an attempt to fetch an URL with an | ||
|  | #               'https' method failed since the server doesn't offer this | ||
|  | #               service. Using LWP::UserAgent this is not a problem, so we are | ||
|  | #               effectively emulating LWP::Simple in a more complex way! | ||
|  | #               However, we now have a means of taking action when the | ||
|  | #               download fails in some predictable way. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
sub getstore {
    my ( $from, $to, $verbose ) = @_;

    #
    # Build a user agent carrying a minimal identification string
    #
    my $agent = LWP::UserAgent->new;
    $agent->agent("make_metadata/$VERSION ");

    #
    # Fetch the URL, streaming the body straight into the target file
    #
    my $response = $agent->get( $from, ':content_file' => $to );

    #
    # Report the outcome when running verbosely
    #
    print $response->status_line, "\n" if $verbose;

    #
    # Hand back the numeric HTTP status code
    #
    return $response->code;
}
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: expand_template | ||
|  | #      PURPOSE: Turns a filename template into a filename or a default | ||
|  | #   PARAMETERS: $option         The input from the -output or -script option | ||
|  | #               $prefix         Filename prefix | ||
|  | #               $suffix         Filename suffix | ||
|  | #               $lbound         Lower bound of the episode range | ||
|  | #               $ubound         Upper bound of the episode range | ||
|  | #      RETURNS: The expanded template or the default | ||
|  | #  DESCRIPTION: The -output or -script options both take an optional template, | ||
|  | #               so we need to expand it with episode numbers. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
sub expand_template {
    my ( $option, $prefix, $suffix, $lbound, $ubound ) = @_;

    #
    # Number of '%d' sequences the template needs: one when a single episode
    # is being processed, two for a range
    #
    my $needed = ( $lbound == $ubound ? 1 : 2 );

    if ( defined($option) ) {
        if ( $option =~ /^$/ ) {
            #
            # Empty value: build a default template depending on the episode
            # range and expand it
            #
            if ( $needed == 1 ) {
                $option = sprintf( $prefix . '_%04d.' . $suffix, $lbound );
            }
            else {
                $option = sprintf( $prefix . '_%04d-%04d.' . $suffix,
                    $lbound, $ubound );
            }
        }
        elsif ( $option =~ /%(\d*)d/ ) {
            #
            # Caller specified a template. Check it has exactly the required
            # number of '%d' sequences before expanding it
            #
            my $count = () = $option =~ /%(\d*)d/g;
            die "Invalid - too many '%d' sequences in '$option'\n"
                if ( $count > $needed );
            die "Invalid - too few '%d' sequences in '$option'\n"
                if ( $count < $needed );

            #
            # Normalise every '%d' to a fixed width, then pass sprintf only
            # as many values as there are slots; passing a spare $ubound for
            # a one-slot template raises a 'Redundant argument in sprintf'
            # warning under 'use warnings' (Perl >= 5.22)
            #
            $option =~ s/%(\d*)d/%04d/g;
            $option
                = ( $needed == 1 )
                ? sprintf( $option, $lbound )
                : sprintf( $option, $lbound, $ubound );
        }

        #
        # A defined, non-empty value with no '%d' is a literal filename and
        # is returned unchanged
        #
    }
    else {
        #
        # Option omitted entirely: fall back to the simple default name
        #
        $option = "$prefix.$suffix";
    }

    return $option;
}
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: coalesce | ||
|  | #      PURPOSE: To find the first defined argument and return it | ||
|  | #   PARAMETERS: Arbitrary number of arguments | ||
|  | #      RETURNS: The first defined argument or undef if there are none | ||
|  | #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never | ||
|  | #               returned when doing so might be a problem. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
sub coalesce {
    # Walk the argument list and hand back the first value that is defined
    my @candidates = @_;
    for my $candidate (@candidates) {
        return $candidate if defined($candidate);
    }

    # Nothing defined: an explicit undef (kept so list-context callers get
    # a single undef element, as before)
    return undef;    ## no critic
}
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: _debug | ||
|  | #      PURPOSE: Prints debug reports | ||
|  | #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print | ||
|  | #               $message        Message to print | ||
|  | #      RETURNS: Nothing | ||
|  | #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing | ||
#               newline and then adds one in the 'print' so the caller doesn't
|  | #               have to bother. Prepends the message with 'D> ' to show it's | ||
|  | #               a debug message. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
sub _debug {
    my ( $active, $message ) = @_;

    # Nothing to do unless debugging is switched on
    return unless $active;

    # Strip any trailing newline so the caller need not care, then emit the
    # message with the 'D> ' debug prefix
    chomp($message);
    print "D> $message\n";
}
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: Options | ||
|  | #      PURPOSE: Processes command-line options | ||
|  | #   PARAMETERS: $optref     Hash reference to hold the options | ||
|  | #      RETURNS: Undef | ||
|  | #  DESCRIPTION: Process the options we want to offer. See the documentation | ||
|  | #               for details | ||
|  | #       THROWS: no exceptions | ||
|  | #     COMMENTS: none | ||
|  | #     SEE ALSO: n/a | ||
|  | #=============================================================================== | ||
sub Options {
    my ($optref) = @_;

    #
    # The full set of option specifications handed to Getopt::Long
    #
    my @optspec = (
        "help",
        "documentation|man",
        "debug=i",
        "verbose!",
        "silent!",
        "test!",
        "meta_only|noaudio!",
        "from=i",
        "to=i",
        "count=i",
        "list=s",
        "output:s",
        "fetch!",
        "ignore_missing|im!",
        "assets!",
        "script:s",
        "a_count=s",
        "dbconfig=s",
        "config=s",
    );

    #
    # On a parse failure show brief usage and exit with status 1
    #
    unless ( GetOptions( $optref, @optspec ) ) {
        pod2usage(
            -msg     => "$PROG version $VERSION\n",
            -exitval => 1,
            -verbose => 0
        );
    }

    return;
}
|  | 
 | ||
|  | __END__ | ||
|  | 
 | ||
|  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
|  | #  Application Documentation | ||
|  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
|  | #{{{ | ||
|  | 
 | ||
|  | =head1 NAME | ||
|  | 
 | ||
|  | make_metadata - Generate metadata from the HPR database for Archive.org | ||
|  | 
 | ||
|  | =head1 VERSION | ||
|  | 
 | ||
|  | This documentation refers to make_metadata version 0.4.14 | ||
|  | 
 | ||
|  | 
 | ||
|  | =head1 USAGE | ||
|  | 
 | ||
|  |     make_metadata [-help] [-documentation] | ||
|  | 
 | ||
|  |     make_metadata -from=FROM [-to=TO] [-count=COUNT] [-output[=FILE]] | ||
|  |         [-script[=FILE]] [-a_count=FILE] [-[no]meta_only] [-[no]fetch] | ||
|  |         [-[no]assets] [-[no]silent] [-[no]verbose] [-[no]test] | ||
|  |         [-[no]ignore_missing] [-config=FILE] [-dbconfig=FILE] [-debug=N] | ||
|  | 
 | ||
|  |     make_metadata -list=LIST [-output[=FILE]] [-script[=FILE]] | ||
|  |         [-[no]meta_only] [-[no]fetch] [-[no]assets] [-[no]silent] | ||
|  |         [-[no]verbose] [-[no]test] [-[no]ignore_missing] [-config=FILE] | ||
|  |         [-dbconfig=FILE] [-debug=N] | ||
|  | 
 | ||
|  |     Examples: | ||
|  | 
 | ||
|  |     make_metadata -from=1234 -nofetch | ||
|  | 
 | ||
|  |     make_metadata -from=1234 -to=1235 | ||
|  | 
 | ||
|  |     make_metadata -from=1234 -count=10 | ||
|  | 
 | ||
|  |     make_metadata -from=1 -to=3 -output=metadata_1-3.csv | ||
|  | 
 | ||
|  |     make_metadata -from=1500 -to=1510 -out=metadata_1500-1510.csv -verbose | ||
|  | 
 | ||
|  |     make_metadata -from=1500 -to=1510 -out=metadata_%d-%d.csv -verbose | ||
|  | 
 | ||
|  |     make_metadata -from=500 -to=510 -out=metadata_%04d-%04d.csv -verbose | ||
|  | 
 | ||
|  |     make_metadata -from=1500 -to=1510 -out -verbose | ||
|  | 
 | ||
|  |     make_metadata -from=1500 -to=1510 -out | ||
|  | 
 | ||
|  |     make_metadata -from=1675 -to=1680 -out=metadata_%d-%d.csv -meta_only | ||
|  | 
 | ||
|  |     make_metadata -from=1450 -test | ||
|  | 
 | ||
|  |     make_metadata -list='1234,2134,2314' -out -meta_only | ||
|  | 
 | ||
|  |     make_metadata -list="931,932,933,935,938,939,940" -out -meta -ignore | ||
|  | 
 | ||
|  |     make_metadata -dbconf=.hpr_livedb.cfg -from=1234 -to=1235 | ||
|  | 
 | ||
|  |     make_metadata -from=3004 -out -meta_only -noassets | ||
|  | 
 | ||
|  | 
 | ||
|  | =head1 OPTIONS | ||
|  | 
 | ||
|  | =over 8 | ||
|  | 
 | ||
|  | =item B<-help> | ||
|  | 
 | ||
|  | Reports brief information about how to use the script and exits. To see the | ||
|  | full documentation use the option B<-documentation> or B<-man>. Alternatively, | ||
|  | to generate a PDF version use the I<pod2pdf> tool from | ||
|  | I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be | ||
|  | installed with the cpan tool as App::pod2pdf. | ||
|  | 
 | ||
|  | =item B<-documentation> or B<-man> | ||
|  | 
 | ||
|  | Reports full information about how to use the script and exits. Alternatively, | ||
|  | to generate a PDF version use the I<pod2pdf> tool from | ||
|  | I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be | ||
|  | installed with the cpan tool as App::pod2pdf. | ||
|  | 
 | ||
|  | =item B<-debug=N> | ||
|  | 
 | ||
|  | Run in debug mode at the level specified by I<N>. Possible values are: | ||
|  | 
 | ||
|  | =over 4 | ||
|  | 
 | ||
|  | =item B<0> | ||
|  | 
 | ||
|  | No debugging (the default). | ||
|  | 
 | ||
|  | =item B<1> | ||
|  | 
 | ||
|  | TBA | ||
|  | 
 | ||
|  | =item B<2> | ||
|  | 
 | ||
|  | TBA | ||
|  | 
 | ||
|  | =item B<3> | ||
|  | 
 | ||
|  | TBA | ||
|  | 
 | ||
|  | =item B<4 and above> | ||
|  | 
 | ||
|  | The metadata hash is dumped. | ||
|  | 
 | ||
|  | Each call of the function I<find_links_in_notes> is reported. On finding an | ||
|  | <a> or <img> tag the I<uri> value is shown, as is any fragment and the related | ||
|  | link. The original file is reported here. | ||
|  | 
 | ||
|  | Each call of the function I<find_links_in_file> is reported. On finding an | ||
|  | <a> or <img> tag the I<uri> value is shown, as is any fragment and the related | ||
|  | link. The original file is reported here, and if a link is to be ignored this | ||
|  | is reported. | ||
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | =item B<-from=NUMBER> | ||
|  | 
 | ||
|  | This option defines the starting episode number of a group. It is mandatory to | ||
|  | provide either the B<-from=NUMBER> option or the B<-list=LIST> option (see | ||
|  | below). | ||
|  | 
 | ||
|  | =item B<-to=NUMBER> | ||
|  | 
 | ||
|  | This option specifies the final episode number of a group. If not given the | ||
|  | script generates metadata for the single episode indicated by B<-from>. | ||
|  | 
 | ||
|  | The value given here must be greater than or equal to that given in the | ||
|  | B<-from> option. The option must not be present with the B<-count> option. | ||
|  | 
 | ||
|  | The difference between the episode numbers given by the B<-from> and B<-to> | ||
|  | options must not be greater than 20. | ||
|  | 
 | ||
|  | =item B<-count=NUMBER> | ||
|  | 
 | ||
|  | This option specifies the number of episodes to process (starting from the | ||
|  | episode number specified by the B<-from>) option. The option must not be | ||
|  | present with the B<-to> option. | ||
|  | 
 | ||
|  | The number of episodes specified must not be greater than 20. | ||
|  | 
 | ||
|  | =item B<-list=LIST> | ||
|  | 
 | ||
|  | This option is an alternative to B<-from=NUMBER> and its associated modifying | ||
|  | options. The B<LIST> is a comma-separated list of not necessarily consecutive | ||
|  | episode numbers, and must consist of at least one and no more than 20 numbers. | ||
|  | 
 | ||
|  | This option is useful for the case when non-sequential episode numbers are to | ||
|  | be uploaded, and is particularly useful when repairing elements of particular | ||
|  | episodes (such as adding summary fields and tags) where they have already | ||
|  | been uploaded. | ||
|  | 
 | ||
|  | For example, the following shows have no summary and/or tags, but the shows | ||
|  | are already in the IA. The missing items have been provided, so we wish to | ||
|  | update the HTML part of the upload: | ||
|  | 
 | ||
|  |     $ ./make_metadata -list='2022,2027,2028,2029,2030,2033' -out -meta | ||
|  |     Output file: metadata_2022-2033.csv | ||
|  | 
 | ||
|  | =item B<-output[=FILE]> | ||
|  | 
 | ||
|  | This option specifies the file to receive the generated CSV data. If omitted | ||
|  | the output is written to B<metadata.csv> in the current directory. | ||
|  | 
 | ||
|  | The file name may contain one or two instances of the characters '%d', with | ||
|  | a leading width specification if desired (such as '%04d'). These will be | ||
|  | substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if | ||
|  | B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the | ||
|  | appropriate endpoint (adding the count to the starting number). If neither of | ||
|  | the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only | ||
|  | be one instance of '%d' or the script will abort. | ||
|  | 
 | ||
|  | If no value is provided to B<-output> then a suitable template will be | ||
|  | generated. It will be 'metadata_%04d.csv' if one episode is being processed, and | ||
|  | 'metadata_%04d-%04d.csv' if a range has been specified. | ||
|  | 
 | ||
|  | Example: | ||
|  | 
 | ||
|  |  ./make_metadata -from=1430 -out=metadata_%04d.csv | ||
|  | 
 | ||
|  | the output file name will be B<metadata_1430.csv>. The same effect can be | ||
|  | achieved with: | ||
|  | 
 | ||
|  |  ./make_metadata -from=1430 -out= | ||
|  | 
 | ||
|  | or | ||
|  | 
 | ||
|  |  ./make_metadata -from=1430 -out | ||
|  | 
 | ||
|  | =item B<-script[=FILE]> | ||
|  | 
 | ||
|  | This option specifies the file to receive commands required to upload certain | ||
|  | files relating to a show. If omitted the commands are written to B<script.sh> | ||
|  | in the current directory. | ||
|  | 
 | ||
|  | The file name may contain one or two instances of the characters '%d', with | ||
|  | a leading width specification if desired (such as '%04d'). These will be | ||
|  | substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if | ||
|  | B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the | ||
|  | appropriate endpoint (adding the count to the starting number). If neither of | ||
|  | the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only | ||
|  | be one instance of '%d' or the script will abort. | ||
|  | 
 | ||
|  | If no value is provided to B<-script> then a suitable template will be | ||
|  | generated. It will be 'script_%04d.sh' if one episode is being processed, and | ||
|  | 'script_%04d-%04d.sh' if a range has been specified. | ||
|  | 
 | ||
|  | Example: | ||
|  | 
 | ||
|  |  ./make_metadata -from=1430 -script=script_%04d.sh | ||
|  | 
 | ||
|  | the output file name will be B<script_1430.sh>. The same effect can be | ||
|  | achieved with: | ||
|  | 
 | ||
|  |  ./make_metadata -from=1430 -script= | ||
|  | 
 | ||
|  | or | ||
|  | 
 | ||
|  |  ./make_metadata -from=1430 -script | ||
|  | 
 | ||
|  | =item B<-a_count=FILE> | ||
|  | 
 | ||
|  | Defines a file into which the script writes details of assets downloaded | ||
|  | during the analysis of notes (and other HTML files associated with a show). | ||
|  | The listing consists of the show identifier (e.g. 'hpr3901') followed by the | ||
|  | number of links followed to collect the files for this show. | ||
|  | 
 | ||
|  | This feature was added to allow other scripts to perform tasks with these | ||
|  | assets, but is now deprecated. | ||
|  | 
 | ||
|  | The feature will probably be removed in a later release of this script. | ||
|  | 
 | ||
|  | =item B<-[no]fetch> | ||
|  | 
 | ||
|  | This option controls whether the script attempts to fetch the MP3 audio file | ||
|  | from the HPR website should there be no WAV file in the upload area. The | ||
|  | default setting is B<-fetch>. | ||
|  | 
 | ||
|  | Normally the script is run as part of the workflow to upload the metadata and | ||
|  | audio to archive.org. The audio is expected to be a WAV file and to be in the | ||
|  | location referenced in the configuration file under the 'uploads' label. | ||
|  | However, not all of the WAV files exist for older shows. | ||
|  | 
 | ||
|  | When the WAV file is missing and B<-fetch> is selected or defaulted, the | ||
|  | script will attempt to download the MP3 version of the audio and will store it | ||
|  | in the 'uploads' area for the upload script (B<ias3upload.pl> or B<ia>) to | ||
|  | send to archive.org. If the MP3 file is not found then the script will abort. | ||
|  | 
 | ||
|  | If B<-fetch> is specified (or defaulted) as well as B<-nometa_only> (see | ||
|  | below) then the audio file fetching process will not be carried out. This is | ||
|  | because it makes no sense to fetch this file if it's not going to be | ||
|  | referenced in the metadata. | ||
|  | 
 | ||
|  | =item B<-[no]assets> | ||
|  | 
 | ||
|  | This option controls the downloading of any assets that may be associated with | ||
|  | a show. Assets are the files held on the HPR server which are referenced by | ||
|  | the show. Examples might be photographs, scripts, and supplementary notes. | ||
|  | Normally all such assets are collected and stored in the upload area and are | ||
|  | then sent to the archive via the script. The notes sent to the archive are | ||
adjusted to refer to these assets on archive.org, making the HPR episode
|  | completely self-contained. | ||
|  | 
 | ||
|  | =item B<-[no]meta_only> (alias B<-[no]noaudio>) | ||
|  | 
 | ||
|  | This option controls whether the output file will contain a reference to the | ||
|  | audio file(s) or only the metadata. The default is B<-nometa_only> meaning that | ||
|  | the file reference(s) and the metadata are present. | ||
|  | 
 | ||
|  | Omitting the file(s) allows the metadata to be regenerated, perhaps due to | ||
|  | edits and corrections in the database, and the changes to be propagated to | ||
|  | archive.org. If the file reference(s) exist(s) in the metadata file then the | ||
|  | file(s) must be available at the time the uploader is run. | ||
|  | 
 | ||
|  | Note that making changes this way is highly preferable to editing the entry on | ||
|  | archive.org using the web-based editor. This is because there is a problem | ||
|  | with the way HTML entities are treated and this can cause the HTML to be | ||
|  | corrupted. | ||
|  | 
 | ||
|  | =item B<-[no]silent> | ||
|  | 
 | ||
|  | The option enables (B<-silent>) and disables (B<-nosilent>) I<silent mode>. | ||
|  | When enabled the script reports nothing on STDOUT. If the script cannot find | ||
|  | the audio files and downloads the MP3 version from the HPR site for upload to | ||
|  | archive.org then the downloads are reported on STDERR. This cannot be | ||
|  | disabled, though the STDERR output could be redirected to a file or to | ||
|  | /dev/null. | ||
|  | 
 | ||
|  | If B<-silent> is specified with B<-verbose> then the latter "wins". | ||
|  | 
 | ||
|  | The script runs with silent mode disabled by default. When B<-nosilent> is | ||
|  | used with B<-noverbose> the script reports the output file name and nothing | ||
|  | else. | ||
|  | 
 | ||
|  | =item B<-[no]verbose> | ||
|  | 
 | ||
|  | This option enables (B<-verbose>) and disables (B<-noverbose>) | ||
|  | I<verbose mode>. When enabled the script reports the metadata it has collected | ||
|  | from the database before writing it to the output file. The data is reported | ||
|  | in a more readable mode than examining the CSV file, although another script | ||
|  | B<show_metadata> is also available to help with this. | ||
|  | 
 | ||
|  | If B<-verbose> is specified with B<-silent> then the former "wins". | ||
|  | 
 | ||
|  | The script runs with verbose mode disabled by default. | ||
|  | 
 | ||
|  | =item B<-[no]ignore_missing> | ||
|  | 
 | ||
|  | The script checks each episode to ensure it has a summary and tags. If either | ||
|  | of these fields is missing then a warning message is printed for that episode | ||
|  | (unless B<-silent> has been chosen), and if any episodes are lacking this | ||
|  | information the script aborts without producing metadata. If the option | ||
|  | B<-ignore_missing> is selected then the warnings are produced (dependent on | ||
|  | B<-silent>) but the script runs to completion. | ||
|  | 
 | ||
|  | The default setting is B<-noignore_missing>; the script checks and aborts if | ||
|  | any summaries or tags are missing. | ||
|  | 
 | ||
|  | =item B<-[no]test> | ||
|  | 
 | ||
|  | DO NOT USE! | ||
|  | 
 | ||
|  | This option enables (B<-test>) and disables (B<-notest>) | ||
|  | I<test mode>. When enabled the script generates metadata containing various | ||
|  | test values. | ||
|  | 
 | ||
|  | In test mode the following changes are made: | ||
|  | 
 | ||
|  | =over 4 | ||
|  | 
 | ||
|  | =item . | ||
|  | 
 | ||
|  | The item names, which normally contain 'hprnnnn', built from the episode | ||
|  | number, have 'test_' prepended to them. | ||
|  | 
 | ||
|  | =item . | ||
|  | 
 | ||
|  | The collection, which is normally a list containing 'hackerpublicradio' and | ||
|  | 'podcasts', is changed to 'test_collection'. Items in this collection are | ||
|  | normally deleted by Archive.org after 30 days. | ||
|  | 
 | ||
|  | =item . | ||
|  | 
 | ||
|  | The contributor, which is normally 'HackerPublicRadio' is changed to | ||
|  | 'perlist'. | ||
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | B<NOTE> The test mode only works for the author! | ||
|  | 
 | ||
|  | =item B<-config=FILE> | ||
|  | 
 | ||
|  | This option allows an alternative script configuration file to be used. This | ||
|  | file defines various settings relating to the running of the script - things | ||
|  | like the place to look for the files to be uploaded. It is rare to need to use | ||
any other file than the default since these are specific to the environment
|  | in which the script runs. However, this has been added at the same time as an | ||
|  | alternative database configuration option was added. | ||
|  | 
 | ||
|  | See the CONFIGURATION AND ENVIRONMENT section below for the file format. | ||
|  | 
 | ||
|  | If the option is omitted the default file is used: B<.make_metadata.cfg> | ||
|  | 
 | ||
|  | =item B<-dbconfig=FILE> | ||
|  | 
 | ||
|  | This option allows an alternative database configuration file to be used. This | ||
|  | file defines the location of the database, its port, its name and the username | ||
|  | and password to be used to access it. This feature was added to allow the | ||
|  | script to access alternative databases or the live database over an SSH | ||
|  | tunnel. | ||
|  | 
 | ||
|  | See the CONFIGURATION AND ENVIRONMENT section below for the file format. | ||
|  | 
 | ||
|  | If the option is omitted the default file is used: B<.hpr_db.cfg> | ||
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | =head1 DESCRIPTION | ||
|  | 
 | ||
|  | This script generates metadata suitable for uploading Hacker Public Radio | ||
|  | episodes to the Internet Archive (archive.org). | ||
|  | 
 | ||
|  | The metadata is in comma-separated variable (CSV) format suitable for | ||
|  | processing with an upload script. The original upload script was called | ||
|  | B<ias3upload.pl>, and could be obtained from | ||
|  | I<https://github.com/kngenie/ias3upload>. This script is no longer supported | ||
|  | and B<make_metadata> no longer generates output suitable for it (though it is | ||
|  | simple to make it compatible if necessary). The replacement script is called | ||
|  | B<internetarchive> which is a Python tool which can also be run from the | ||
|  | command line. It can be found at I<https://github.com/jjjake/internetarchive>. | ||
|  | 
 | ||
|  | The B<make_metadata> script generates CSV from the HPR database. It looks up | ||
|  | details for each episode selected by the options, and performs various | ||
|  | conversions and concatenations. The goal is to prepare items for the Internet | ||
|  | Archive with as much detail as the format can support. | ||
|  | 
 | ||
|  | The resulting CSV file contains a header line listing the field names required | ||
|  | by archive.org followed by as many CSV lines of episode data as requested (up | ||
|  | to a limit of 20). | ||
|  | 
 | ||
|  | Since the upload method uses the HTTP protocol with fields stored in headers, | ||
|  | there are restrictions on the way HTML can be formatted in the B<Details> | ||
field. The script converts newlines, which are not allowed, into I<<br/>> tags
|  | where necessary. | ||
|  | 
 | ||
|  | HPR shows often have associated files, such as pictures, examples, long-form | ||
|  | notes and so forth. The script finds these and downloads them to the cache | ||
|  | area where the audio is kept and writes the necessary lines to the CSV file to | ||
|  | ensure they are uploaded with the show. It modifies any HTML which links to | ||
|  | these files to link to the archive.org copies in order to make the complete | ||
|  | show self-contained. | ||
|  | 
 | ||
|  | =head1 DIAGNOSTICS | ||
|  | 
 | ||
|  | =over 8 | ||
|  | 
 | ||
|  | =item B<Configuration file ... not found> | ||
|  | 
 | ||
|  | One or more of the configuration files has not been found. | ||
|  | 
 | ||
|  | =item B<Path ... not found> | ||
|  | 
 | ||
|  | The path specified in the B<uploads> definition in the configuration file | ||
|  | B<.make_metadata.cfg> does not exist. Check the configuration file. | ||
|  | 
 | ||
|  | =item B<Configuration data missing> | ||
|  | 
 | ||
|  | While checking the configuration file(s) the script has detected that settings | ||
|  | are missing. Check the details specified below and provide the missing | ||
|  | elements. | ||
|  | 
 | ||
|  | =item B<Mis-match between @fields and %dispatch!> | ||
|  | 
 | ||
|  | An internal error in the script has been detected where the elements of the | ||
|  | @fields array do not match the keys of the %dispatch hash. This is probably the | ||
|  | result of a failed attempt to edit either of these components. | ||
|  | 
 | ||
|  | Correct the error and run the script again. | ||
|  | 
 | ||
|  | =item B<Invalid list; no elements> | ||
|  | 
 | ||
|  | There are no list elements in the B<-list=LIST> option. | ||
|  | 
 | ||
|  | =item B<Invalid list; too many elements> | ||
|  | 
 | ||
|  | There are more than the allowed 20 elements in the list specified by the | ||
|  | B<-list=LIST> option. | ||
|  | 
 | ||
|  | =item B<Failed to parse -list=...> | ||
|  | 
 | ||
|  | A list was specified that did not contain a CSV list of numbers. | ||
|  | 
 | ||
|  | =item B<Invalid starting episode number (...)> | ||
|  | 
 | ||
|  | The value used in the B<-from> option must be greater than 0. | ||
|  | 
 | ||
|  | =item B<Do not combine -to and -count> | ||
|  | 
 | ||
|  | Using both the B<-to> and B<-count> is not permitted (and makes no sense). | ||
|  | 
 | ||
|  | =item B<Invalid range; ... is greater than ...> | ||
|  | 
 | ||
|  | The B<-from> episode number must be less than or equal to the B<-to> number. | ||
|  | 
 | ||
|  | =item B<Invalid range; range is too big (E<gt>20)> | ||
|  | 
 | ||
|  | The difference between the starting and ending episode number is greater than | ||
|  | 20. | ||
|  | 
 | ||
|  | =item B<Invalid - too many '%d' sequences in '...'> | ||
|  | 
 | ||
There were more than two '%d' sequences in the name of the output file if
|  | a range of episodes is being processed, or more than one if a single episode | ||
|  | has been specified. | ||
|  | 
 | ||
|  | =item B<Invalid - too few '%d' sequences in '...'> | ||
|  | 
 | ||
There were fewer than two '%d' sequences in the name of the output file
|  | when a range of episodes was being processed. | ||
|  | 
 | ||
|  | =item B<Unable to open ... for output: ...> | ||
|  | 
 | ||
|  | The script was unable to open the requested output file. | ||
|  | 
 | ||
|  | =item B<Unable to find or download ...> | ||
|  | 
 | ||
|  | The script has not found a I<.WAV> file in the cache area so has attempted to | ||
|  | download the I<MP3> copy of the audio from the HPR website. This process has | ||
|  | failed. | ||
|  | 
 | ||
|  | =item B<Failed to find requested episode> | ||
|  | 
 | ||
|  | An episode number could not be found in the database. This error is not fatal. | ||
|  | 
 | ||
|  | =item B<Nothing to do> | ||
|  | 
 | ||
|  | After processing the range of episodes specified the script could not find | ||
|  | anything to do. This is most often caused by all of the episodes in the range | ||
|  | being invalid. | ||
|  | 
 | ||
|  | =item B<Aborted due to missing summaries and/or tags> | ||
|  | 
 | ||
|  | One or more of the shows being processed does not have a summary or tags. The | ||
|  | script has been told not to ignore this so has aborted before generating | ||
|  | metadata. | ||
|  | 
 | ||
|  | =item B<HTML::TreeBuilder failed to parse notes: ...> | ||
|  | 
 | ||
|  | The script failed to parse the HTML in the notes of one of the episodes. This | ||
|  | indicates a serious problem with these notes and is fatal since these notes | ||
|  | need to be corrected before the episode is uploaded to the Internet Archive. | ||
|  | 
 | ||
|  | =item B<HTML::TreeBuilder failed to process ...: ...> | ||
|  | 
 | ||
|  | While parsing the HTML in a related file the parse has failed. The file being | ||
|  | parsed is reported as well as the error that was encountered. This is likely | ||
|  | due to bad HTML. | ||
|  | 
 | ||
|  | =item B<Unable to open ... for writing: ...> | ||
|  | 
 | ||
|  | The script is attempting to open an HTML file which it has downloaded to | ||
|  | write back edited HTML, yet the open has failed. The filename is in the error | ||
|  | message as is the cause of the error. | ||
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | =head1 CONFIGURATION AND ENVIRONMENT | ||
|  | 
 | ||
|  | This script reads two configuration files in B<Config::General> format | ||
|  | (similar to Apache configuration files) for the path to the files to be | ||
|  | uploaded and for credentials to access the HPR database. Two files are used | ||
|  | because the database configuration file is used by several other scripts. | ||
|  | 
 | ||
|  | =head2 SCRIPT CONFIGURATION | ||
|  | 
 | ||
|  | The general configuration file is B<.make_metadata.cfg> (although this can be | ||
|  | overridden through the B<-config=FILE> option) and contains the following | ||
|  | lines: | ||
|  | 
 | ||
|  |  uploads = "<path to files>" | ||
|  |  filetemplate = "hpr%04d.%s" | ||
|  |  baseURL = "http://hackerpublicradio.org" | ||
|  |  sourceURLtemplate = "https://hackerpublicradio.org/eps/%s/index.html" | ||
|  |  IAURLtemplate = "http://archive.org/download/%s/%s" | ||
|  | 
 | ||
|  | The I<uploads> line defines where the WAV files are to be found (currently | ||
|  | I</var/IA/uploads> on the VPS). The same area is used to store downloaded MP3 | ||
|  | files and any supplementary files associated with the episode. | ||
|  | 
 | ||
|  | The I<filetemplate> line defines the format of an audio file such as | ||
|  | I<hpr1234.wav>. This should not be changed. | ||
|  | 
 | ||
|  | The I<baseURL> line defines the common base for download URLs. It is used when | ||
|  | parsing and standardising URLs relating to files on the HPR server. | ||
|  | 
 | ||
|  | The I<sourceURLtemplate> line defines the format of the URL required to access | ||
|  | the show on the HPR site. This should not be changed except in the unlikely | ||
|  | event that these URLs change. | ||
|  | 
 | ||
|  | The I<IAURLtemplate> line defines the format of URLs on archive.org which is | ||
|  | used when generating new links in HTML notes or supplementary files. | ||
|  | 
 | ||
|  | =head2 DATABASE CONFIGURATION | ||
|  | 
 | ||
|  | The database configuration file is B<.hpr_db.cfg> (although this can be | ||
|  | overridden through the B<-dbconfig=FILE> option). | ||
|  | 
 | ||
|  | The layout of the file should be as follows: | ||
|  | 
 | ||
|  |  <database> | ||
|  |      host = 127.0.0.1 | ||
|  |      port = PORT | ||
|  |      name = DATABASE | ||
|  |      user = USERNAME | ||
|  |      password = PASSWORD | ||
|  |  </database> | ||
|  | 
 | ||
|  | =head1 DEPENDENCIES | ||
|  | 
 | ||
|  |     Carp | ||
|  |     Config::General | ||
|  |     DBI | ||
|  |     Data::Dumper | ||
|  |     File::Find::Rule | ||
|  |     File::Path | ||
|  |     Getopt::Long | ||
|  |     HTML::Entities | ||
|  |     HTML::TreeBuilder | ||
|  |     IO::HTML | ||
|  |     LWP::Simple | ||
|  |     List::MoreUtils | ||
|  |     List::Util | ||
|  |     Pod::Usage | ||
|  |     Text::CSV_XS | ||
|  | 
 | ||
|  | =head1 BUGS AND LIMITATIONS | ||
|  | 
 | ||
|  | There are no known bugs in this module. | ||
|  | Please report problems to Dave Morriss (Dave.Morriss@gmail.com) | ||
|  | Patches are welcome. | ||
|  | 
 | ||
|  | =head1 AUTHOR | ||
|  | 
 | ||
|  | Dave Morriss  (Dave.Morriss@gmail.com) | ||
|  | 
 | ||
|  | =head1 LICENCE AND COPYRIGHT | ||
|  | 
 | ||
|  | Copyright (c) 2014-2019 Dave Morriss (Dave.Morriss@gmail.com). | ||
|  | All rights reserved. | ||
|  | 
 | ||
|  | This module is free software; you can redistribute it and/or | ||
|  | modify it under the same terms as Perl itself. See perldoc perlartistic. | ||
|  | 
 | ||
|  | This program is distributed in the hope that it will be useful, | ||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
|  | 
 | ||
|  | =cut | ||
|  | 
 | ||
|  | #}}} | ||
|  | 
 | ||
|  | # [zo to open fold, zc to close] | ||
|  | 
 | ||
|  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker |