#!/usr/bin/env perl
#===============================================================================
#
#         FILE: make_metadata
#
#        USAGE: ./make_metadata -from=FROM [-to=TO] [-count=COUNT]
#               [-output=FILE] [-[no]fetch] [-[no]meta_only] [-[no]verbose]
#               [-[no]silent] [-[no]test] [-help] [-{documentation|man}]
#
#  DESCRIPTION: Make metadata for the uploading of a range of HPR episodes to
#               the Internet Archive by collecting fields from the database.
#
#      OPTIONS: See the POD documentation for full details (./make_metadata
#               -man) or use 'pod2pdf' to generate a PDF version in the
#               directory holding this script.
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: 2023-07-06: A version in development 0.4.14 has been put aside
#               and this version (0.4.12) made into the main line version
#               because 0.4.14 was developing in a direction that doesn't fit
#               with the changes made to the HPR system in June/July 2023.
#               Will now move forward with version numbers.
#               2024-01-23: Added the 'open' pragma for UTF-8
#               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.4.14
#      CREATED: 2014-06-13 12:51:04
#     REVISION: 2024-01-23 16:28:59
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use open ':encoding(UTF-8)';
#use utf8;

use Carp;
use Getopt::Long;
use Pod::Usage;

use List::Util qw{max};
use List::MoreUtils qw{uniq apply};
use Config::General;
use Text::CSV_XS;

use IO::HTML;
use HTML::TreeBuilder 5 -weak;
use HTML::Entities;

use File::Find::Rule;
use File::Path qw{make_path};
#use LWP::Simple;
use LWP::UserAgent;
use HTTP::Status qw(status_message);

use DBI;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.4.14';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;

#-------------------------------------------------------------------------------
# Constants and defaults
#-------------------------------------------------------------------------------
my $basedir      = "$ENV{HOME}/HPR/IA";
my $configfile   = "$basedir/.$PROG.cfg";
my $dbconfigfile = "$basedir/.hpr_db.cfg";

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants used in the metadata
#
my $mediatype   = 'audio';
my $collection  = 'hackerpublicradio';
my $language    = 'eng';
my $contributor = 'HackerPublicRadio';

#
# Variables, arrays and hashes
#
my ( $dbh, $sth1, $h1 );
my ( $DEBUG, $verbose, $silent, $test, $from, $to, $count );
my ( $list, $meta_only, $outfile, $scriptfile, $fetch, $assets );
my ( $acountfile, $ignore );
my ( @range, $tree, %links, %meta, @source, @transcripts, @counts, @head, @data );
my ( $ep_name, $ft, $file, $URL, $status, $incomplete );
my ( $filepath, $max_epno, $filetemplate, $sourceURLtemplate );
my ( $iauploadtemplate, $iauploadoptions );

#
# File types we'll look for when working out (and collecting) the main audio
# file and their order
#
my @filetypes = (qw{ wav mp3 ogg });

#
# Additional filetypes we'll look for if we're working round the lack of tags
# on the derived audio files. See the discussion at
# https://gitlab.anhonesthost.com/HPR/HPR_Public_Code/issues/34
#
my @additional_audio = (qw{ flac mp3 ogg opus spx });

#
# Templates for finding the transcripts.
#
# We expect these to be in $uploads/$episode/ where we expect to find:
#     $episode.json
#     $episode.srt
#     $episode.tsv
#     $episode.txt
#     $episode.vtt
# The regular expression below should find the transcripts given that the
# search is directed to the sub-directory in $uploads.
#
my $transcript_re = qr{hpr\d{4}\.(json|srt|tsv|txt|vtt)};
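#
# For example, the pattern above matches transcript files such as
# 'hpr1234.txt' or 'hpr1234.vtt', but not 'hpr1234_source.wav'.
#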

#
# Names of CSV fields and their order
#
my @fields = (
    qw{
        identifier file mediatype collection title creator language
        description contributor date subject licenseurl
        }
);

#
# Maximum episodes per run
#
my $max_episodes = 20;

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A dispatch table to define how to build metadata. The keys are actually
# header names to be used in the metadata file and must relate to the @fields
# array above. Two types of values are expected here: 1) anonymous subroutine
# references which return the value for the field or 2) scalars (or strings)
# to be placed as-is. Some of the anonymous subroutines are written as
# closures (see
# http://en.wikipedia.org/wiki/Closure_%28computer_programming%29), which make
# it easier to build the calling code since we can pass in a variety of
# arguments internally while offering a simple no-argument call externally.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my %dispatch = (
    identifier => sub { return make_item( $h1, $test ) },
    file       => sub {
        return (
            $meta_only
            ? ''
            : make_filename( $filepath, $filetemplate, \@filetypes, $h1 )
        );
    },
    mediatype   => $mediatype,
    collection  => $collection,
    #title       => sub { return decode_entities( db_title($h1) ) },
    title       => sub { return db_title($h1) },
    creator     => sub { return $h1->{host} },
    language    => $language,
    description => sub { return db_notes( $sourceURLtemplate, $h1, $tree, \@source ) },
    contributor => $contributor,
    date       => sub { return $h1->{date} },
    subject    => sub { return db_tags($h1) },
    licenseurl => sub { return $h1->{license_url} }
);
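#
# Illustration (a comment-only sketch, not executed): fields are rendered
# later by checking each dispatch entry's type, so a closure such as 'title'
# is called with no arguments while a plain scalar such as 'mediatype' is
# used verbatim:
#
#     my $value = ref( $dispatch{$fld} ) eq 'CODE'
#                 ? $dispatch{$fld}->()      # e.g. 'hpr1234 :: Some title'
#                 : $dispatch{$fld};         # e.g. 'audio'
#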

#
# Validate the data structures above to ensure there have been no mis-edits
#
foreach (@fields) {
    die "Mis-match between \@fields and \%dispatch!\n"
        unless ( defined( $dispatch{$_} ) );
}

#
# The main query on the database. We want episode details, the host's name,
# the series name and the Creative Commons licence the episode is under.
# Note that the 'LEFT JOIN' for 'miniseries' is for the case where the
# 'series' column of 'eps' is 'NULL'. This is not the case for the live
# database - yet!
#
my $sql = q{
    SELECT
        e.id AS eps_id,
        e.date,
        e.title,
        sec_to_time(e.duration) AS duration,
        e.summary,
        e.notes,
        e.explicit,
        e.license,
        l.url as license_url,
        e.tags,
        e.valid,
        h.host,
        s.name AS s_name,
        s.description AS s_desc
    FROM eps e
        JOIN hosts h ON e.hostid = h.hostid
        LEFT JOIN miniseries s ON e.series = s.id
        JOIN licenses l ON e.license = l.short_name
    WHERE e.id = ?
    };

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

my %options;
Options( \%options );

#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );

#
# Collect options
#
$DEBUG   = ( defined( $options{debug} )   ? $options{debug}   : $DEFDEBUG );
$verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
$silent  = ( defined( $options{silent} )  ? $options{silent}  : 0 );
$test    = ( defined( $options{test} )    ? $options{test}    : 0 );
$from    = $options{from};
$to      = $options{to};
$count   = $options{count};
$list    = $options{list};
$meta_only = ( defined( $options{meta_only} ) ? $options{meta_only} : 0 );
$fetch     = ( defined( $options{fetch} )     ? $options{fetch}     : 1 );
$assets    = ( defined( $options{assets} )    ? $options{assets}    : 1 );
$ignore
    = ( defined( $options{ignore_missing} ) ? $options{ignore_missing} : 0 );
$outfile    = $options{output};
$scriptfile = $options{script};
$acountfile = $options{a_count};

#
# The two config files can be overridden or default to the original
# declarations
#
my $cfgfile
    = ( defined( $options{config} ) ? $options{config} : $configfile );
my $dbcfgfile
    = ( defined( $options{dbconfig} ) ? $options{dbconfig} : $dbconfigfile );

#-------------------------------------------------------------------------------
# Load and validate program configuration data
#-------------------------------------------------------------------------------
die "Configuration file $cfgfile not found\n" unless ( -e $cfgfile );

my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %config = $conf->getall();

#
# Check the config settings
#
$filepath = $config{uploads};

die "Path $filepath not found\n" unless ( -e $filepath );

$max_epno          = $config{max_epno};
$filetemplate      = $config{filetemplate};
$sourceURLtemplate = $config{sourceURLtemplate};
$iauploadtemplate  = $config{iauploadtemplate} . "\n";
$iauploadoptions   = $config{iauploadoptions};

die "Configuration data missing\n"
    unless $max_epno
    && $filetemplate
    && $sourceURLtemplate
    && $iauploadtemplate
    && $iauploadoptions;

#-------------------------------------------------------------------------------
# Load database configuration data
#-------------------------------------------------------------------------------
die "Configuration file $dbcfgfile not found\n" unless ( -e $dbcfgfile );

my $dbconf = Config::General->new(
    -ConfigFile      => $dbcfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %dbcfg = $dbconf->getall();

#-------------------------------------------------------------------------------
# Validate options
#-------------------------------------------------------------------------------
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( ( !defined($from) and !defined($list) )
    or ( defined($from) and defined($list) ) );

#
# Deal with the values specified by one of:
#     -from=N1 -to=N2
#     -from=N3 -count=N4
#     -list="N1,N2,N3"
#
# End by populating @range with a list of episode numbers
#
if ( defined($list) ) {
    #
    # We have a list which we'll parse, validate, sort, make unique and filter
    #
    my $lcsv = Text::CSV_XS->new( { binary => 1, } );
    if ( $lcsv->parse($list) ) {
        @range = uniq( sort { $a <=> $b } $lcsv->fields() );
        @range = grep {/\d+/} @range;
        @range = grep { $_ > 0 && $_ <= $max_epno } @range;
        @range = apply { $_ =~ s/(^\s*|\s*$)// } @range;

        die "Invalid list; no elements\n" if scalar(@range) == 0;
        die "Invalid list; too many elements\n"
            if scalar(@range) > $max_episodes;
    }
    else {
        die "Failed to parse -list='$list'\n" . $lcsv->error_diag() . "\n";
    }
}
else {
    #
    # We have -from=N, -to=M or -count=X
    #
    die "Invalid starting episode number ($from)\n" unless $from > 0;
    die "Do not combine -to and -count\n"
        if ( defined($to) && defined($count) );

    if ( defined($count) ) {
        $to = $from + $count - 1;
    }
    elsif ( !defined($to) ) {
        $to = $from;
    }

    die "Invalid range; $from is greater than $to\n" unless $from <= $to;
    die "Invalid range; range is too big (>$max_episodes): $to - $from = "
        . ( $to - $from ) . "\n" if ( $to - $from ) > $max_episodes;

    @range = ( $from .. $to );
}
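#
# Examples of the resulting @range (illustrative):
#     -from=3000 -count=3        gives (3000, 3001, 3002)
#     -list="3002, 3000, 3000"   gives (3000, 3002) after sorting,
#                                de-duplication and trimming
#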

#
# Check we got a sensible range
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    unless scalar(@range) > 0;

#
# You can't be silent and verbose at the same time
#
$silent = 0 if $verbose;

#
# Fetching audio files is not relevant in 'metadata only' mode
# TODO: -nofetch with -meta_only incorrectly results in the download of files;
# see the Journal in regard to show 3004 in April 2020.
# TODO: 2023-07-06 Time to drop '-[no]fetch' completely?
#
$fetch = 0 if $meta_only;

#
# If there's a defined output filename then there are three options: a null
# string, a plain filename and a substitution string with '%d' sequences. The
# null string means the user used '-output' without a value, so we want to
# generate a substitution string. A string with '%d' requires a check to
# ensure there's the right number, one for $ubound and one for $lbound. The
# plain filename needs no more work.
#
# If there's no defined output filename we'll provide a default.
#
$outfile = expand_template( $outfile, 'metadata', 'csv',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );
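#
# For example (illustrative, following the description above):
# '-output=metadata_%d-%d.csv' with -from=3000 -to=3002 would expand to
# 'metadata_3000-3002.csv'.
#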

#
# Check the -script option. Like -output the choices are: a null string,
# a plain filename or a substitution string with '%d' sequences.
#
$scriptfile = expand_template( $scriptfile, 'script', 'sh',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );

#-------------------------------------------------------------------------------
# In verbose mode report the options
#-------------------------------------------------------------------------------
if ($verbose) {
    my $fmt = "%18s: %s\n";

    printf $fmt, 'From',  $from  if defined( $options{from} );
    printf $fmt, 'To',    $to    if defined( $options{to} );
    printf $fmt, 'Count', $count if defined( $options{count} );
    printf $fmt, 'List',  $list  if defined( $options{list} );
    printf $fmt, 'Output',        $outfile;
    printf $fmt, 'Output script', $scriptfile;
    printf $fmt, 'Output asset count', ( $acountfile ? $acountfile : '--' );
    printf $fmt, 'Meta only', ( $meta_only ? 'Yes' : 'No' );
    printf $fmt, 'Fetch',     ( $fetch     ? 'Yes' : 'No' );
    printf $fmt, 'Silent',    ( $silent    ? 'Yes' : 'No' );
    printf $fmt, 'Verbose',   ( $verbose   ? 'Yes' : 'No' );
    printf $fmt, 'Debug',           $DEBUG;
    printf $fmt, 'Script config',   $cfgfile;
    printf $fmt, 'Database config', $dbcfgfile;

    print '-' x 80, "\n";
}

#-------------------------------------------------------------------------------
# Set up output and script file
#-------------------------------------------------------------------------------
open( my $out, '>:encoding(UTF-8)', $outfile )
    or die "Unable to open $outfile for output: $!\n";

#
# Open the script file and add a header. Set execute permission on the
# file for the owner, and nobody else. We keep a count of lines (other than
# the header stuff) written to this file.
#
my $script_lines = 0;
open( my $script, '>:encoding(UTF-8)', $scriptfile )
    or die "Unable to open $scriptfile for output: $!\n";
print $script "#!/usr/bin/env bash\n\n";
print $script <<'EndHead';
Upload () {
    local id=${1}
    local file=${2}
    local remote=${3:-}
    local options=${4:-}

    if [[ -e $file ]]; then
        if [[ -z $remote ]]; then
            ia upload ${id} ${file} ${options}
        else
            ia upload ${id} ${file} --remote-name=${remote} ${options}
        fi
    else
        echo "File missing: $file"
    fi
}

EndHead
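#
# The generated script then contains calls to 'Upload' along these lines
# (hypothetical values; the real format string and options come from
# 'iauploadtemplate' and 'iauploadoptions' in the configuration file):
#
#     Upload "hpr1234" "uploads/hpr1234/hpr1234.txt" "hpr1234/hpr1234.txt" "--no-derive"
#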

chmod( 0744, $scriptfile );

#-------------------------------------------------------------------------------
# Make changes for test mode
#-------------------------------------------------------------------------------
# NOTE: Commented out 2023-07-06 since this is dangerous
#if ($test) {
#    $dispatch{collection}  = 'test_collection';
#    $dispatch{contributor} = 'perlist';
#}

#
# Prepare to turn the data into CSV data. Since we're allowing embedded
# newlines (binary mode) add an explicit end of line.
#
# TODO: The new tool 'ia' does not accept newlines in the CSV, so no point in
# defining them here. They have been removed elsewhere in the code.
#
my $csv = Text::CSV_XS->new( { binary => 1, eol => "\r\n" } );

#-------------------------------------------------------------------------------
# Copy audio files from the website if requested and if needed. We look for
# the WAV file in the 'uploads' area but if it's not found we get the MP3
# (which is all we have for older shows).
#
# TODO: Given we're redirecting requests for audio files on the HPR site to
# archive.org there's potential for a nasty paradox here if the rewrite rules
# aren't perfect. We do the MP3 thing only for old shows, so we need to make
# sure that it works when we need it.
# TODO: Are there any logic flaws here? This was designed before we started
# creating all the audio formats we have now (partly to avoid the 'derive'
# process on archive.org from making copies without tags).
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: Commented out 2023-07-06 because (a) this situation should not occur
# again now that all audio versions are on the IA and we get everything for
# new shows, (b) the URLs used for downloads are obsolete.
#-------------------------------------------------------------------------------
#if ( $fetch && !$meta_only ) {
#
#    foreach my $episode (@range) {
#        $ft = 0;
#
#        #
#        # Does the WAV version already exist?
#        #
#        $file = sprintf( $filetemplate, $episode, $filetypes[$ft] );
#        if ( ! -e "$filepath/$file" ) {
#            #
#            # Doesn't exist, try the alternative version
#            #
#            $ft++;
#            $file = sprintf( $filetemplate, $episode, $filetypes[$ft] );
#            if ( !-e "$filepath/$file" ) {
#                #
#                # We need to download the file
#                #
#                $URL = sprintf( $URLtemplate, $file );
#                print STDERR "Downloading $URL\n" unless $silent;
#                $status = getstore( $URL, "$filepath/$file" );
#                $status == 200 or warn "$status : $URL ($file)\n";
#            }
#        }
#
#        die "Unable to find or download $file\n"
#            unless ( -e "$filepath/$file" );
#    }
#
#}

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $dbcfg{database}->{host} // '127.0.0.1';
my $dbport = $dbcfg{database}->{port} // 3306;
my $dbname = $dbcfg{database}->{name};
my $dbuser = $dbcfg{database}->{user};
my $dbpwd  = $dbcfg{database}->{password};
$dbh = DBI->connect( "dbi:mysql:database=$dbname;host=$dbhost;port=$dbport",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or croak $DBI::errstr;

#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8}    = 1;
$dbh->{mysql_enable_utf8mb4} = 1;

#-------------------------------------------------------------------------------
# Collect the necessary information for constructing the metadata for each of
# the selected episodes.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Each key of the %meta hash will contain an arrayref with an element per
# episode in the @range array. Some of these elements will be objects
# (arrayrefs or hashrefs) containing multi-part items such as 'subject'
# strings from the 'tags' field in the database. The %meta keys correspond to
# the strings of the @fields array which correspond to IA metadata fields and
# are also used by the %dispatch hash to contain actions to perform to make
# the contents. A few of the %meta keys correspond to other items relating to
# the episode such as supplementary audio and supplementary files like
# pictures.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: See the dump of the %meta hash in the Journal that goes with this
# project (Example 1). It should give a better representation of this
# structure.
#-------------------------------------------------------------------------------
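#
# A sketch of the shape of %meta (illustrative values only):
#
#     %meta = (
#         identifier => [ 'hpr3000', 'hpr3001' ],
#         subject    => [ [ 'tag1', 'tag2' ], undef ],
#         audio      => [ [ '/path/hpr3000.flac' ], [] ],
#         links      => [ { 'http://...' => { ... } }, undef ],
#         ...
#     );
#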
$sth1 = $dbh->prepare($sql);

$incomplete = 0;
foreach my $episode (@range) {
    #
    # Get the episode from the database
    #
    $sth1->execute($episode);
    if ( $dbh->err ) {
        croak $dbh->errstr;
    }

    $h1 = $sth1->fetchrow_hashref;
    unless ($h1) {
        #
        # Somehow this episode doesn't exist in the database. No idea why, but
        # skip this one and keep going anyway.
        #
        carp "Failed to find requested episode number $episode\n";
        next;
    }

    print STDERR "D> ", $h1->{title}, "\n";

    #
    # Make the episode name with leading zeroes
    #
    $ep_name = sprintf( 'hpr%04d', $episode );

    #
    # Check whether this episode has a summary and tags. Count up the number
    # that are missing either of these.
    # TODO: Time to remove this check?
    #
    if ( length( $h1->{summary} ) == 0 || length( $h1->{tags} ) == 0 ) {
        print STDERR "Episode $episode is missing summary and/or tags\n"
            unless ($silent);
        $incomplete++;
    }

    #
    # Handle supplementary audio
    #
    unless ( defined( $meta{audio} ) ) {
        $meta{audio} = [];
    }

    my @audio;

    #
    # Look for other audio files for upload.
    # TODO: Consider catering for hpr\d{4}(\.\d)?\.(flac|mp3|ogg|opus|spx)
    # e.g. hpr2781.1.ogg which could happen if the audio has had to be
    # re-released.
    #
    foreach my $ext ( @additional_audio ) {
        my $audio
            = sprintf( "%s/hpr%04d.%s", $config{uploads}, $episode, $ext );
        if ( -e $audio ) {
            push( @audio, $audio );
        }
    }

    #
    # Look for the source file if provided. It will be called
    # 'hpr9999_source.{wav,flac,mp3,ogg,opus}' (though it's doubtful if people
    # are sending in opus). For safety we'll accept any extension.
    #
    @source = File::Find::Rule->file()
        ->name("hpr${episode}_source.*")
        ->in($filepath);

    #
    # Generate the upload command(s) for the source file(s). We upload to the
    # main directory for the item and request that no 'derive' process be run
    # on the file(s). If we're just generating metadata it's not really
    # necessary to do this. Rather than having a different format here we now
    # use a generic format from the configuration file but pass in a null
    # remote filename. We also add default options, also from the
    # configuration file in all cases so we don't derive anything and don't
    # generate an archive of old stuff.
    #
    # TODO: What if the source file(s) had been omitted in the first pass, and
    # we need to upload them now? Could just do it by hand of course.
    #
    unless ($meta_only) {
        for my $sou (@source) {
            printf $script $iauploadtemplate,
                'hpr' . $episode,
                $sou,
                '',
                $iauploadoptions;
            $script_lines++;
        }
    }

    #
    # Turn all source audio files into URLs pointing to the IA item
    #
    for (my $i = 0; $i <= $#source; $i++) {
        ( my $bn = $source[$i] ) =~ s|.*/||mx;
        $source[$i] = sprintf( $config{IAURLtemplate}, "hpr$episode", $bn );
    }

    #push( @audio, @source ) if (@source);

    #
    # Save whatever was found, whether an arrayref of audio or undef
    #
    push( @{ $meta{audio} }, \@audio );

    #
    # Most shows have transcripts since Q2 2023. They aren't referred to in
    # the show notes or database (not at present anyway). They get added to
    # the static page on the HPR site with a footer template on every page.
    # The files themselves arrive in a sub-directory of $uploads (usually
    # `/data/IA/uploads/hpr1234/`) as part of the show components. For now
    # we'll search for them and upload them using the Bash script we're
    # building which is run after the initial 'ia upload'.
    #
    unless ($meta_only) {
        @transcripts = File::Find::Rule->file()
            ->name($transcript_re)
            ->in("$filepath/$ep_name");

        #
        # Write 'Upload' function calls to the script for all of the transcripts
        #
        for my $ts (@transcripts) {
            ( my $bn = $ts ) =~ s|.*/||mx;
            printf $script $iauploadtemplate, $ep_name,    # identifier
                $ts,                                       # local file path
                "$ep_name/$bn",                            # remote file path
                $iauploadoptions;
            $script_lines++;
        }
    }

    #
    # Initialise the hash element for holding links to files in the notes per
    # show
    #
    unless ( defined( $meta{links} ) ) {
        $meta{links} = [];
    }

    #
    # Scan the database-stored notes looking for links to files on the HPR
    # server. For any HTML files found in that pass, download them and perform
    # another scan. If further HTML files are found download and scan them and
    # so on recursively. Any non-HTML files are left for downloading later.
    #
    #<<<                                 (Stop perltidy reformatting oddly)
    if (find_links_in_notes($episode, $h1->{notes}, \$tree, \%links, \%config,
         $verbose, $silent) > 0) {
    #>>>
        #
        # Save the links we found. Do this by copying the %links hash into the
        # %meta hash
        #
        push( @{ $meta{links} }, {%links} );
    }
    else {
        push( @{ $meta{links} }, undef );
    }

    #
    # Build the metadata hash using the dispatch table. For each field we will
    # have an array of data. This may be a scalar for single-valued items or
    # an array for multi-valued ones.
    #
    foreach my $fld (@fields) {
        unless ( defined( $meta{$fld} ) ) {
            $meta{$fld} = [];
        }

        #
        # If it's a code reference then call the code and save the result,
        # otherwise save the data
        #
        if ( ref( $dispatch{$fld} ) eq 'CODE' ) {
            push( @{ $meta{$fld} }, $dispatch{$fld}->() );
        }
        else {
            push( @{ $meta{$fld} }, $dispatch{$fld} );
        }
    }
}

_debug( $DEBUG >= 3, Dumper(\%meta) );

die "Nothing to do\n" unless %meta;

#
# If we're not ignoring missing summaries and tags, and some were found to be
# missing, then abort with a message.
# TODO: 2023-07-06 Not needed any more?
#
if ( $incomplete > 0 ) {
    unless ($ignore) {
        die "Aborted due to missing summaries and/or tags\n";
    }
    else {
        say "Missing summaries and/or tags - ignored" unless ($silent);
    }
}

#
# ~~ Explanation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Consider '%meta' as a "sideways spreadsheet": the keys are row headings,
# each hash value is an arrayref whose elements are the columns (one per
# episode), and the "cells" (array elements) may themselves be arrayrefs with
# multiple elements (i.e. the spreadsheet is 3-dimensional!). We need to know
# the maximum number of array elements per cell because we have to expand the
# resulting CSV "spreadsheet" we'll generate for the 'ia upload' command by
# providing headers like "subject[1]" and "subject[2]". Hard to explain
# without a diagram!
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Compute the maximum number of sub-fields when a field can be multi-valued.
# This is because all CSV rows must have the same number of fields; the
# maximum of course.
#
@counts = map {
    max( map { ref($_) eq 'ARRAY' ? scalar(@$_) : 1 } @{ $meta{$_} } )
} @fields;
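#
# For example (illustrative): with two episodes where the first has three
# tags and the second has five, the 'subject' entry of @counts becomes
# max(3, 5) = 5, so five 'subject[n]' columns are generated below.
#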

#-------------------------------------------------------------------------------
# Report on the collected data if requested.
# See 'Example 1' in the Journal for this project for what the '%meta' hash
# looks like at this stage.
#-------------------------------------------------------------------------------
if ($verbose) {
    foreach my $i ( 0 .. $#range ) {
        my $ind = 0;
        foreach my $fld (@fields) {
            printf "%20s[%d]: %s\n", $fld, $counts[$ind],
                defined( $meta{$fld}->[$i] )
                ? (
                ref( $meta{$fld}->[$i] ) eq 'ARRAY'
                ? join( ",", @{ $meta{$fld}->[$i] } )
                : $meta{$fld}->[$i]
                )
                : '';
            $ind++;
        }

        #
        # Each show will (most likely) have an array of audio types in an
        # array of arrays.
        #
        if ( defined( $meta{audio}->[$i] ) ) {
            printf "%23s: %s\n", 'added audio',
                scalar( @{ $meta{audio}->[$i] } );
        }
        else {
            printf "%23s: 0\n", 'added audio';
        }

        #
        # Each show may have links. They will be stored as an array of hashes
        # where each hash has sub-hashes, one per "asset". Some will just be
        # 'undef' meaning that they have no assets.
        #
        if ( defined( $meta{links}->[$i] ) ) {
            printf "%23s: %s\n", 'links',
                scalar( keys( %{ $meta{links}->[$i] } ) );
        }
        else {
            printf "%23s: 0\n", 'links';
        }

        print '-' x 80, "\n";
    }
}

#-------------------------------------------------------------------------------
# Output asset counts if requested
# TODO: Used in 'past_upload', which is no longer used (since that project is
# complete).
#-------------------------------------------------------------------------------
if ($acountfile) {
    my $acount_lines = 0;
    open( my $acount, '>:encoding(UTF-8)', $acountfile )
        or die "Unable to open $acountfile for output: $!\n";

    foreach my $i ( 0 .. $#range ) {
        if ( defined( $meta{links}->[$i] ) ) {
            printf $acount "%s %d\n", $meta{identifier}->[$i],
                scalar( keys( %{ $meta{links}->[$i] } ) );
            $acount_lines++;
        }
    }

    close($acount);

    #
    # If we never wrote anything to the asset count file then delete it
    #
    if ( $acount_lines == 0 ) {
        print "Deleting empty '$acountfile'\n" if $verbose;
        unlink($acountfile);
    }

}

#-------------------------------------------------------------------------------
# Download any linked files unless the -noassets option was used.
# Report the details in verbose mode. Only download if the file is not already
# downloaded.
# NOTE: HTML downloads will have already happened since we need to recursively
# parse and modify them. So if an asset is HTML on the HPR site an earlier
# stage will have downloaded it so that it can be parsed for any further HTML
# or assets.
#-------------------------------------------------------------------------------
if ($assets) {
    foreach my $i ( 0 .. $#range ) {
        if ( defined( $meta{links}->[$i] ) ) {
            for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
                my $linkfile = $meta{links}->[$i]->{$key}->{cached};
                if ( ! -e $linkfile ) {
                    download_url( $key, $linkfile,
                        $meta{links}->[$i]->{$key}->{new},
                        $verbose, $silent );
                }
            }
        }
    }
}

#-------------------------------------------------------------------------------
# Generate the completed CSV
#-------------------------------------------------------------------------------
#
# First build and output the header fields. If there's a single header then just
# output it, but if it's a multi-item thing output the maximum with indexes
# (like "subject[0]").
#
for my $ind ( 0 .. $#fields ) {
    if ( $counts[$ind] == 1 ) {
        push( @head, $fields[$ind] );
    }
    else {
        for ( my $i = 0; $i < $counts[$ind]; $i++ ) {
            push( @head, $fields[$ind] . "[$i]" );
        }
    }
}

$csv->print( $out, \@head );
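#
# For example (illustrative): if 'subject' is the only multi-valued field
# and its maximum is five, @head ends up as
#     identifier, file, mediatype, collection, title, creator, language,
#     description, contributor, date, subject[0] .. subject[4], licenseurl
#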

#
# Build and output the data. The structure we have is a hash where each
# element is an array with an element per episode (0 .. $#range). The
# per-episode elements may be scalars (for single-value fields) or arrays (for
# multi-value). In the latter case we have to make sure we have filled all of
# the header positions (e.g. we have 10 "subject" headers, but this episode
# has only 1 - we have to fill in the remaining 9 slots with empty items).
#
foreach my $i ( 0 .. $#range ) {
    @data = ();
    my $ind = 0;
    foreach my $fld (@fields) {
        my $count = 0;
        if ( ref( $meta{$fld}->[$i] ) eq 'ARRAY' ) {
            foreach my $elem ( @{ $meta{$fld}->[$i] } ) {
                push( @data, $elem );
                $count++;
            }
        }
        else {
            push( @data, $meta{$fld}->[$i] );
            $count++;
        }

        #
        # Cater for any blank slots
        #
        if ( $count < $counts[$ind] ) {
            push( @data, undef ) for 1 .. ( $counts[$ind] - $count );
        }

        $ind++;
    }

    #print Dumper( \@data ), "\n";
    $csv->print( $out, \@data );

    #
    # If there are additional audio files or supplementary files for the
    # current show write a CSV row for each of them here.
    # NOTE: originally the CSV format did not require the second and
    # subsequent rows belonging to an IA identifier to have an identifier in
    # them (maybe it was even illegal in the early days). Now the ia tool
    # seems to have made it mandatory - surprisingly.
    #
    if ( defined( $meta{audio}->[$i] ) ) {
        for my $audio (@{$meta{audio}->[$i]}) {
            #
            # Each row is a file path which we'll add to the CSV. We use the
            # length of the @head array to get the number of CSV fields
            #
            @data = ();
#           push( @data, undef, $audio );
            push( @data, $meta{identifier}->[$i], $audio );
            push( @data, undef ) for 1 .. ( scalar(@head) - 2 );
            $csv->print( $out, \@data );
        }
    }

    #
    # The 'links' key holds an array of hashes containing details of the
    # 'assets' relating to a show (files other than audio files, but including
    # the original "source" audio). In make_metadata V0.3.x we used these
    # hashes to add rows to the CSV file which results in the upload of these
    # files. However, this mechanism does not cater for (a) the uploading of
    # directories to the IA, and (b) the ability to disable the "derive"
    # mechanism for a given file. Therefore we build a Bash script containing
    # a function that calls 'ia' commands which can be used instead, but only
    # if $meta_only is not set.
    # TODO: 2023-07-06 We want to upload the transcripts created by 'whisper'
    # and placed in the 'uploads' directory. We will add these to the 'script'
    # file.
    #
    if ( ! $meta_only && defined( $meta{links}->[$i] ) ) {
        for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
            #
            # Build a call to the 'Upload' function being explicit about the
            # remote name
            #
            ( my $rem = $meta{links}->[$i]->{$key}->{cached} )
                =~ s|$config{uploads}/||;
            printf $script $iauploadtemplate,
                $meta{identifier}->[$i],
                $meta{links}->[$i]->{$key}->{cached},
                $rem,
                $iauploadoptions;
            $script_lines++;
        }
    }
}

#
# Report the output and script file names
#
unless ($silent) {
    print "Output file: $outfile\n";
    print "Script file: $scriptfile\n" unless ( $script_lines == 0 );
}

close($out);
close($script);

#
# If we never wrote anything to the script file then delete it
#
if ( $script_lines == 0 ) {
    print "Deleting empty '$scriptfile'\n" if $verbose;
    unlink($scriptfile);
}

exit;

#===  FUNCTION  ================================================================
#         NAME: make_item
#      PURPOSE: Build an Archive.org item name for this item
#   PARAMETERS: $h              - hashref returned from the database
#               $testmode       - Boolean denoting test mode
#      RETURNS: The newly fashioned item string
#  DESCRIPTION: The method for generating a unique item string for
#               Archive.org is embodied in this function
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_item {
    my ( $h, $testmode ) = @_;

    return sprintf( "%shpr%04d", ( $testmode ? 'test_' : '' ), $h->{eps_id} );
}
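#
# For example: make_item( { eps_id => 42 }, 0 ) returns 'hpr0042', and the
# same call with $testmode true returns 'test_hpr0042'.
#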

#===  FUNCTION  ================================================================
#         NAME: make_filename
#      PURPOSE: Determine the filename path for the metadata
#   PARAMETERS: $path           - relative path to the files
#               $template       - template for forming the file name
#               $ftypes         - arrayref holding ordered file types
#               $h              - hashref returned from the database
#      RETURNS: The path to the file
#  DESCRIPTION: Forms the file path then checks that there really is a file of
#               that name. Since we don't have *.wav files for every episode
#               we might have one of the other formats. The array referenced
#               by $ftypes contains a list of these file types, starting with
#               the WAV default. At the moment we only look at the second
#               choice if there is no WAV, but we could be more fancy if
#               necessary.
#               TODO: The design of this code is not good. The earlier fetch
#               stage will have determined what is available. We are
#               potentially repeating these checks here, though they are
#               necessary if -nofetch was specified. Needs some further
#               thought.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_filename {
    my ( $path, $template, $ftypes, $h ) = @_;

    my $file;
    my $ft = 0;

    #
    # Look for the WAV file first. If found, return it
    #
    $file = sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] );
    if ( -e $file ) {
        return $file;
    }
    else {
        #
        # Look for the alternative type of file and return that if found
        #
        $ft++;
        $file
            = sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] );
        if ( -e $file ) {
            return $file;
        }
        else {
            warn "No file found for hpr" . $h->{eps_id} . "\n";
            return;
        }
    }

}

#===  FUNCTION  ================================================================
#         NAME: db_title
#      PURPOSE: Return a title for the Archive.org item
#   PARAMETERS: $h      - hashref returned from the database
#      RETURNS: The title
#  DESCRIPTION: Makes the title string we want for the item
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_title {
    my ($h) = @_;

    # print STDERR "D> ", $h->{title}, "\n";
    return sprintf( "hpr%04d :: %s", $h->{eps_id}, $h->{title} );
}
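#
# For example: with eps_id 42 and title 'My first show' this returns
# 'hpr0042 :: My first show'.
#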

#===  FUNCTION  ================================================================
#         NAME: db_notes
#      PURPOSE: Return the description for an episode
#   PARAMETERS: $sourceURLtpl   - printf template for making the HPR link to
#                                 the original show (from config file)
#               $h              - hashref returned from the database
#               $tree           - parsed notes
#               $asource        - arrayref containing source audio file URLs
#      RETURNS: The description built from the database for Archive.org
#  DESCRIPTION: This routine generates the item description for Archive.org as
#               HTML by concatenating the summary, series name (if there is
#               one), duration, link to episode on the HPR site and the notes
#               into a long string. The main work is in the formatting of the
#               notes element which is stored as HTML in the database. Since
#               we are making a single string from the multi-line data we have
#               to take special action to preserve the contents of <pre> tags.
#               See the comments in 'flatten_pre' for what is being done to
#               achieve this.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_notes {
    my ( $sourceURLtpl, $h, $tree, $asource ) = @_;

    my ( $lines, $episode, $desc, $sourceurl, $notes );

    #
    # Ensure leading zeroes on the episode number and prepend 'hpr'
    #
    $episode = sprintf('hpr%04d', $h->{eps_id});

#    $sourceurl = sprintf( "http://hackerpublicradio.org/eps.php?id=%04d",
#        $h->{eps_id} );
    $sourceurl = sprintf( $sourceURLtpl, $episode );

    #
    # Make the header with newlines so we can count them
    # TODO: Should we use the series number 0 which denotes "no series" rather
    # than the name "general"? Need to adjust the query to return this if so.
    # FIXME: The "no series" marker 'general' is being removed and the logic
    # will use NULL for an episode that is not in a series. The local copy of
    # the database has had this change made, but not the live one. We have to
    # cater for both. When 'general' goes we can drop references to it.
    #
    $desc = sprintf( "Summary: %s\n", $h->{summary} );
    unless ( !defined( $h->{s_name} ) or $h->{s_name} eq 'general' ) {
        $desc .= sprintf( "Series: %s\n", coalesce( $h->{s_name}, '' ) );
    }
    #$desc .= sprintf( "Duration: %s\n", $h->{duration} );
    $desc .= sprintf( "Source: <a href=\"%s\">%s</a>\n", $sourceurl, $sourceurl );
    #
    # Add a pointer to the source audio if there is any (cater for multiple,
    # but it's not likely we'll get more than one)
    #
    if (@{$asource}) {
        foreach my $aurl (@{$asource}) {
            $desc .= sprintf( "Original audio: <a href=\"%s\">%s</a>\n",
                $aurl, $aurl );
        }
    }

    #
    # Count the lines so we can add <br/> tags at the end to make the header
    # a standard size of 4 lines high to accommodate the media widget.
    # TODO: Rationalise this since there is no media widget any more.
    # Temporarily boosted the height to 6 lines now that the Original audio
    # has been added.
    #
    $lines = $desc =~ tr/\n//;
    $desc =~ s#\n#<br />#g;
    $desc .= ( "<br />" x ( 6 - $lines ) );

    #
    # Prepare the notes and add to the header. Actions are:
    # 1. Make all <pre> nodes compatible by adding <br/>
    # 2. Generate entities for all non-ASCII
    # 3. Do the silly thing of double-encoding entities to counteract the IA
    #    bug
    # 4. Remove all newlines ('ia' doesn't seem to know what to do with them)
    # 5. Trim trailing spaces
    #
    # ----
    # NOTE: Removed the double-encoding on 2017-10-07 since (unannounced) it
    # looks as if the IA have fixed this bug. There is some debate over
    # whether we should be encoding non-ASCII also.
    # ----
    # NOTE: 2017-11-25 restored the double-encoding since the bug bit again at
    # the last upload. This is not the whole story though. Added a step to
    # decode the notes before re-encoding the non-ASCII bits. This is to
    # counteract what seems to be being done by Pandoc if we process the notes
    # from plain text.
    # ----
    # NOTE: 2021-02-20 found a problem with the strategy of decoding
    # everything and then encoding it again (selectively). The issue was seen
    # in HPR show 3284 where the HTML notes contain a <pre> section with C++
    # code in it which uses characters like '<' and '>'. These were encoded as
    # HTML entities at shownote preparation time since they were flagged as
    # bad HTML by my validator script. The original algorithm then decoded
    # these characters but couldn't encode them with encode_entities because
    # that would have encoded all the HTML tags. The decode/encode stuff is
    # therefore HTML context dependent, and we don't have the means of
    # handling this with the current script. So, the solution seems to be not
    # to decode and encode at all. After running flatten_pre, simply run
    # re_encode_entities and trust that there's nothing in the notes that
    # needs special action here. Anyway, the preparation stage should have
    # handled whatever's necessary.
    # ----
    #
    #$notes = flatten_pre( $h->{notes} );
    $notes = flatten_pre($tree);
    ## $notes = decode_entities($notes);
    ## $notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
    ## $notes = re_encode_entities($notes);
    $notes =~ s/\n//g;

    $desc .= $notes;
    $desc =~ s/\s*$//;

    return $desc;
}
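#
# For illustration (values invented): for a show with no series and no source
# audio the header part of $desc starts out as
#
#     Summary: A show about X\nSource: <a href="...">...</a>\n
#
# and the substitutions in db_notes turn its two newlines into <br /> tags,
# appending four more to pad the header to the standard six lines before the
# flattened notes are added.
#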

#===  FUNCTION  ================================================================
#         NAME: db_tags
#      PURPOSE: Return the episode search tags as an array for populating
#               'subject' fields on Archive.org
#   PARAMETERS: $h      - hashref returned from the database
#      RETURNS: A reference to an array of tags or undef if there are none
#  DESCRIPTION: In the database the tags are held as a comma-delimited string.
#               We need to turn this into an array to populate 'subject'
#               fields in the CSV file. If there are no tags the caller needs
#               an undef to be returned to indicate that a blank field needs
#               to be constructed in the CSV. The format of the database
#               string may be messy since we rely on the show's host to submit
#               this information. We split the string taking this into
#               consideration.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_tags {
    my ($h) = @_;

    return $h->{tags} ? [ split( /\s*,\s*/, $h->{tags} ) ] : undef;
}
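#
# For example: a tags string of 'bash, scripting ,command line' yields
# [ 'bash', 'scripting', 'command line' ], and an empty string yields undef.
#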

#===  FUNCTION  ================================================================
#         NAME: flatten_pre
#      PURPOSE: Process notes "flattening" <pre> contents
#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
#                       partially processed notes
#      RETURNS: Processed notes
#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
#               contain some formatting tags like <em> and <code>, but spaces
#               and newlines are significant. The Internet Archive upload API
#               uses HTTP headers which are text strings without newlines, so
#               when these tags are uploaded through this route some
#               formatting is lost. What this routine does is parse the
#               contents of all <pre> sections in $notes, adding <br/> tags
#               to replace newlines. It has to perform a full parse
#               since the contents may include HTML tags and these need to be
#               passed through intact. It calls the subroutine 'flatten_item' to
#               deal with the recursive nature of HTML tags.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_pre {
    my ($tree) = @_;

    #
    # Find all the <pre> tags
    #
    my @pre_tags = $tree->look_down( _tag => 'pre', );

    #
    # Walk the various <pre> elements in the document
    #
    foreach my $tag (@pre_tags) {
        #
        # Save the tag and empty the original
        #
        my $saved = $tag->clone();
        $tag->delete_content();

        #
        # Walk the saved content and rebuild the tag into $atag using the
        # nested arrayref structure permitted by HTML::Element for
        # convenience (the alternative is a little nasty). See the
        # documentation for 'new_from_lol' in HTML::Element.
        #
        my $atag;
        foreach my $item ( @{ $saved->content_array_ref } ) {
            push( @$atag, flatten_item($item) );
        }

        #
        # Rebuild the tag from the arrayref we built. We treat the arrayref
        # structure we just built as an array because otherwise the top level
        # is interpreted as a spurious <null> tag.
        #
        $tag->push_content(@$atag);
    }

    #
    # Trim out the original notes from the enclosing tags we added earlier
    #
    my $body = $tree->look_down( _tag => 'body' );
    ( my $result = $body->as_HTML( undef, ' ', {} ) )
        =~ s{(^<body[^>]*>|</body>$)}{}gi;

    return $result;

}
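
#
# Illustration (not part of the program): assuming the parsed notes in $tree
# contain a hypothetical block such as
#
#   <pre><code>line one
#   line two</code></pre>
#
# then the HTML returned by flatten_pre($tree) holds the same content with
# the significant newline replaced by a tag:
#
#   <pre><code>line one<br/>line two</code></pre>
#
# so the formatting survives transport in a newline-free HTTP header.
#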

#===  FUNCTION  ================================================================
#         NAME: flatten_item
#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
#   PARAMETERS: $item   an HTML::Element item parsed from the original
#                       <pre> section
#      RETURNS: An arrayref if the last seen item was a tag, otherwise a list
#  DESCRIPTION: Since <pre> sections can contain inline elements which change
#               the rendering of the text we need to parse these as we add
#               <br/> tags. This routine does this by recursively descending
#               through the contents. A common tag sequence is <pre><code> for
#               scripts and the like. This routine deals with such sequences.
#               It expects to receive the contents in sequence and builds the
#               result as a nested arrayref structure.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_item {
    my ($item) = @_;

    return unless defined($item);

    my ( @result, %attr );

    #
    # Is it a sub-tag or non-tag content?
    #
    if ( ref($item) ) {
        #
        # It's a tag. Save the tag name and any attributes and recurse into
        # it. Return an arrayref
        #
        push( @result, $item->tag() );
        %attr = $item->all_external_attr();
        push( @result, \%attr ) if %attr;
        for my $child ( $item->content_list() ) {
            push( @result, flatten_item($child) );
        }
        return \@result;
    }
    else {
        #
        # It's non-tag content. Join the lines with <br/> tags.  Return an
        # array (since this is a simple list).
        #
        # Note that we split with a LIMIT of -1 which causes any trailing list
        # items to be returned; default behaviour is to drop them.
        #
        $item =~ s/\r//g;
        my @content = split( /\n/, $item, -1 );
        if (@content) {
            #
            # Remove a leading blank line - usually the result of
            # a "<pre>'NL'text" sequence
            #
            shift(@content) if ( $content[0] =~ /^\s*$/ );

            #
            # Join back the lines with <br/> tags between them.
            #
            foreach my $txt (@content) {
                push( @result, $txt, ['br'] );
            }

            #
            # Remove the <br/> at the end, it's spurious
            #
            pop(@result);
        }

        return (@result);
    }

}
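
#
# Illustration (not part of the program): for a hypothetical element parsed
# from "<pre><code>one\ntwo</code></pre>", the <code> child reaches
# flatten_item() as a tag, its text content is split on the newline and
# rejoined, and the returned structure is the nested arrayref
#
#   [ 'code', 'one', ['br'], 'two' ]
#
# which flatten_pre() then pushes back into the emptied <pre> element.
#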

#===  FUNCTION  ================================================================
#         NAME: re_encode_entities
#      PURPOSE: Find all encoded entities and encode them all over again
#   PARAMETERS: $notes  - string containing unprocessed notes
#      RETURNS: Processed notes
#  DESCRIPTION: Uses a brutal regular expression substitution approach, but
#               since this is a very unusual requirement brought about by what
#               is essentially a bug in the way the Internet Archive stores
#               and processes metadata, we have no choice.
#       THROWS: No exceptions
#     COMMENTS: It looks as if the software on archive.org saves HTML metadata
#               in an XML file by encoding it as entities. So HTML tags are
#               turned into &lt;TAG&gt; sequences. Then when HTML is required
#               the XML is decoded back to <TAG>. Unfortunately any existing
#               entities in the HTML like '&lt;' will also be decoded in this
#               phase which may result in invalid HTML sequences. In this
#               routine we are converting existing entities so that the
#               decoding phase turns them into valid entities. The archive.org
#               software should be doing this, but it isn't, and any messages
#               sent to the staff there are ignored. Of course, there may be
#               a point at which this bug is corrected and the double encoding
#               process here becomes redundant. Then the call to this routine
#               can be omitted.
#               Note also that the IA editor corrupts HTML containing entities
#               so should not be used. To make a change, edit the notes on the
#               HPR database and generate new metadata with this script. Use
#               the -meta_only option to avoid the need for the media (unless
#               it too is being updated of course) and re-submit the CSV file
#               to ias3upload.pl or ia.
#     SEE ALSO: N/A
#===============================================================================
sub re_encode_entities {
    my ($notes) = @_;

    #
    # Replace any '&xxx;' sequence by '&amp;xxx;'
    #
    $notes =~ s#\&([^;]+;)#&amp;$1#g;
    return $notes;
}
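
#
# Illustration (not part of the program): a hypothetical notes fragment
#
#   re_encode_entities('5 &lt; 6 &amp; 7')
#
# returns '5 &amp;lt; 6 &amp;amp; 7'. When archive.org later decodes the
# stored metadata once, this collapses back to '5 &lt; 6 &amp; 7' - the
# entities the notes originally contained - instead of bare '<' and '&'.
#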

#===  FUNCTION  ================================================================
#         NAME: find_links_in_notes
#      PURPOSE: Finds HPR links in show notes
#   PARAMETERS: $episode        episode number we're dealing with
#               $notes          a string containing the shownotes
#               $rtree          reference to an HTML::TreeBuilder object built
#                               from the notes
#               $rlinks         hashref containing the links found
#               $rconfig        hashref containing config data
#               $verbose        setting controlling what reports are generated
#               $silent         setting controlling what reports are generated
#      RETURNS: Count of the number of links
#  DESCRIPTION: Parses the notes passed as an argument then scans these notes
#               looking for links which relate to items on the HPR server.
#               Normally these are files and images but could be anchor
#               references as well. If the latter then the URL has to be
#               parsed to remove the anchor. Unique links are saved in a hash
#               as the path to the file that will be saved on the VPS and as
#               the link to the item in the new notes. This hash will be
#               passed back to the caller so it can get the file(s) and
#               prepare them for upload to the IA. Finally, the link in the
#               notes is modified to refer to the file that will be uploaded
#               to the IA. The function returns the number of links it has
#               recorded (not the number it has changed), as well as passing
#               back the parsed tree and the link hash it has constructed.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub find_links_in_notes {
    my ( $episode, $notes, $rtree, $rlinks, $rconfig, $verbose, $silent )
        = @_;

    my ( $linkre, $epstr, $uri, $slink );
    my ( $oldfile, $newfile, $newURL );

    _debug( $DEBUG >= 3, "Entered find_links_in_notes\n" );

    #
    # Initialise links
    #
    undef %$rlinks; # TODO: Consider whether we should do this?

    #
    # Create an object that can be shared downstream
    #
    $$rtree = HTML::TreeBuilder->new;
    $$rtree->ignore_unknown(0);
    $$rtree->no_expand_entities(1);
    $$rtree->p_strict(1);
    $$rtree->store_comments(1);
    $$rtree->warn(1);

    #
    # Parse the notes. Die if we fail because then we know this show needs
    # some urgent attention.
    #
    $$rtree->parse_content($notes)
        or die "HTML::TreeBuilder failed to parse notes: $!\n";

    #
    # NOTE: No longer filtering out links not relating to the episode.
    # Experience with 'upload_manager' has shown that there are some weird
    # links in the HPR database which will cause problems.
    #
    # Regular expression to match links to the HPR server. Examples:
    #   http://hackerpublicradio.org/eps/hpr2153.png
    #   http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
    # Also things like this (**Why Ken?**)
    #   ../eps/hpr2945/IMG_20191018_122746Z.jpg
    # Don't match things like when *not* processing 1986:
    #   http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
    #
    $epstr = sprintf( "hpr%04d", $episode );
#   my $re
#       = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www.)?
        (?:hacker|hobby)publicradio.org/eps/
        (.+)$
    }x;

    #
    # Scan the HTML tree for links we care about
    #
    for ( @{ $$rtree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Standardise the link (expands relative URLs, removes any fragment).
        # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
        # URIs are removed.
        #
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink, \n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match, without any 'fragment' if there
            # is one (we want this not to be URL-related)
            # NOTE: Will we ever have a fragment here?
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it.
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink   .= 'index.html';
                $oldfile .= 'index.html';
            }

            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL  = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink}           = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new}    = $newURL;

            }

            #
            # Simply replace the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well.
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
        }
    }

    #
    # If we found any links the notes will have been changed, but since the
    # tree is shared with the caller this will result in changes to the
    # metadata. However, we want to look at any HTML files in those links
    #
    if ( scalar( keys(%$rlinks) ) > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents. This updates $rlinks
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( ! -e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose,  $silent
                );
            }
        }
    }

    #
    # Return the link count after all recursion through supplementary files
    # and so on
    #
    return scalar( keys(%$rlinks) );

}
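
#
# Illustration (not part of the program): with a hypothetical configuration
# value
#
#   baseURL => 'http://hackerpublicradio.org/'
#
# a relative link '../eps/hpr2945/IMG_20191018_122746Z.jpg' found in the
# notes of show 2945 is standardised by URI->new_abs() to
# 'http://hackerpublicradio.org/eps/hpr2945/IMG_20191018_122746Z.jpg',
# cached under $rconfig->{uploads} as 'hpr2945/IMG_20191018_122746Z.jpg',
# and the link in the tree is rewritten to the result of
#
#   sprintf( $rconfig->{IAURLtemplate},
#       'hpr2945', 'hpr2945/IMG_20191018_122746Z.jpg' );
#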

#===  FUNCTION  ================================================================
#         NAME: find_links_in_file
#      PURPOSE: Finds HPR links in related files
#   PARAMETERS: $episode        episode number we're dealing with
#               $filename       file being examined
#               $rlinks         hashref containing the links found
#               $rconfig        hashref containing config data
#               $verbose        setting controlling what reports are generated
#               $silent         setting controlling what reports are generated
#      RETURNS: The number of new links found and downloaded
#  DESCRIPTION: The routine is related to 'find_links_in_notes' but doesn't work
#               quite the same way. It is given the name of an HTML file which
#               has already been downloaded. It parses this with
#               HTML::TreeBuilder because it needs to be scanned and possibly
#               edited. The HTML tree is scanned for <a> and <img> tags. If the
#               URL returned is an HPR URL then we will want to change it to an
#               IA one so the show can reference its components on archive.org
#               and be independent of HPR. If a link is found which hasn't been
#               seen before it is saved in the $rlinks hash keyed by the
#               original URL. The link URL is replaced by the IA version in the
#               HTML tree. Once the scan is complete, if any URL edits have
#               been made the file is rewritten from the modified tree. Then if
#               any new links have been found these are checked, and if any are
#               HTML they are downloaded ready to be parsed. Parsing is done by
#               recursively calling this routine. Other links, which are not
#               HTML, remain in the $rlinks hash and are downloaded later in
#               the script since they do not need to be edited.
#               NOTE: As of 2017-12-10 this code has been used in a live
#               situation but the recursive capabilities have not been
#               invoked. This is because we have shows with the usual short
#               notes which reference longer notes, and sometimes the latter
#               have links to other files. However, so far none of these cases
#               have referenced further HTML files needing scanning and
#               editing.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub find_links_in_file {
    my ( $episode, $filename, $rlinks, $rconfig, $verbose, $silent ) = @_;

    my ( $linkre, $epstr, $tree, $uri, $slink );
    my ( $oldfile, $newfile, $newURL );
    my ( $encoding, $linkcount, $linkedits );

    _debug( $DEBUG >= 3, "Entered find_links_in_file\n" );

    #
    # Create a tree object
    #
    $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->no_expand_entities(1);
    $tree->p_strict(1);
    $tree->store_comments(1);
    $tree->warn(1);

    #
    # Parse the file using IO::HTML to grab it. Die if we fail because then we
    # know this stuff needs some urgent attention.
    #
    $tree->parse_file( html_file($filename) )
        or die "HTML::TreeBuilder failed to process $filename: $!\n";

    #
    # NOTE: No longer filtering out links not relating to the episode.
    # Experience with 'upload_manager' has shown that there are some weird
    # links in the HPR database which will cause problems.
    #
    # Regular expression to match links to the HPR server. Examples:
    #   http://hackerpublicradio.org/eps/hpr2153.png
    #   http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
    # Also things like this (**Why Ken?**)
    #   ../eps/hpr2945/IMG_20191018_122746Z.jpg
    # Don't match things like when *not* processing 1986:
    #   http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
    #
    $epstr = sprintf( "hpr%04d", $episode );
#   my $re
#       = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www.)?
        (?:hacker|hobby)publicradio.org/eps/
        (.+)$
    }x;

    #
    # Counting new links found and stashed as well as edits made
    #
    $linkcount = 0;
    $linkedits = 0;

    #
    # Scan the HTML tree for links we care about
    #
    for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Standardise the link (expands relative URLs, removes any fragment)
        # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
        # URIs are removed.
        #
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink, \n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match, without any 'fragment' if there
            # is one (we want this not to be URL-related)
            # NOTE: Will we ever have a fragment here?
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it.
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink   .= 'index.html';
                $oldfile .= 'index.html';
            }

            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL  = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink}           = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new}    = $newURL;

                $linkcount++;

            }

            #
            # Simply replace the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well.
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
            $linkedits++;
        }
    }

    #
    # If we found HPR links then we'll have changed them in the tree, so we
    # need to update the file. We use 'as_HTML' with no entities, indentation
    # by one space and no optional end tags (see HTML::Element). If the file
    # gets examined after the edits there should be no eligible HPR links, so
    # no action.
    #
    if ( $linkedits > 0 ) {
        open( my $out, ">:encoding(UTF-8)", $filename )
            or die "Unable to open $filename for writing: $!\n";
        print $out $tree->as_HTML( undef, ' ', {} );
        close($out);
        print STDERR "Altered links in $filename\n" unless $silent;
    }

    #
    # If we found new links then $linkcount will be non-zero and we need to
    # download and parse them, otherwise nothing else to do.
    #
    if ( $linkcount > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( !-e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose,  $silent
                );
            }
        }
    }

    #
    # Return the link count
    #
    return $linkcount;

}

#===  FUNCTION  ================================================================
#         NAME: download_url
#      PURPOSE: Download a file from an URL
#   PARAMETERS: $from           the URL the file is to be collected from
#               $to             the path to which the file is to be written
#               $upload         the eventual IA URL (for information)
#               $verbose        setting controlling what reports are generated
#               $silent         setting controlling what reports are generated
#      RETURNS: The status value from the download
#  DESCRIPTION: If 'verbose' is true the details of the download are reported.
#               We are to download from the URL specified as $from, and the
#               destination is the path in $to which is in a temporary cache
#               area. This may result in the original file being renamed. The
#               value in $upload shows the URL the file will be available at
#               on the IA once the metadata has been used for uploads. A brief
#               message is written to STDERR by default then the HTTP download
#               is initiated with a warning generated if the download is not
#               successful.
#               Moved from LWP::Simple to LWP::UserAgent to get more control
#               over the download. The former failed when an HTTPS URL was
#               used and the server didn't offer this type of connection,
#               whereas the latter does not.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub download_url {
    my ( $from, $to, $upload, $verbose, $silent ) = @_;

    my ( $dirname, $status );

    if ($verbose) {
        print "Link to be downloaded: $from\n";
        print "  to ",              $to,     "\n";
        print "  and uploaded as ", $upload, "\n";
    }

    #
    # Extract the directory from the path. If the directory doesn't exist then
    # make it.
    #
    ( $dirname = $to ) =~ s|/?[^/]*$||mx;
    make_path($dirname) unless -d $dirname;

    #
    # Collect the file and save it
    #
    print STDERR "Downloading $from\n" unless $silent;
    $status = getstore( $from, $to, $verbose );
    $status == 200 or warn "Download failed: $status : $from ($to)\n";

    print '~' x 80, "\n" if $verbose;

    return $status;
}

#===  FUNCTION  ================================================================
#         NAME: getstore
#      PURPOSE: Get an URL and store the result in a file
#   PARAMETERS: $from           the URL to download
#               $to             where to put the result
#               $verbose        setting controlling what reports are generated
#      RETURNS: The status code from the 'get'
#  DESCRIPTION: When using LWP::Simple an attempt to fetch an URL with an
#               'https' method failed since the server doesn't offer this
#               service. Using LWP::UserAgent this is not a problem, so we are
#               effectively emulating LWP::Simple in a more complex way!
#               However, we now have a means of taking action when the
#               download fails in some predictable way.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub getstore {
    my ( $from, $to, $verbose ) = @_;

    #
    # Create the agent, and identify it (a bit)
    #
    my $ua = LWP::UserAgent->new;
    $ua->agent("make_metadata/$VERSION ");

    #
    # Get the URL and store it to the requested file
    #
    my $res = $ua->get( $from, ':content_file' => $to );

    #
    # Report what happened
    #
    print $res->status_line, "\n" if $verbose;

    #
    # Return status
    #
    return $res->code;
}
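
#
# Illustration (not part of the program): getstore() mimics
# LWP::Simple::getstore, so a hypothetical call such as
#
#   my $status = getstore(
#       'https://hackerpublicradio.org/eps/hpr1303/Music_Notes.html',
#       '/tmp/Music_Notes.html', 1 );
#   warn "Fetch failed\n" unless $status == 200;
#
# writes the response body to the named file and returns the numeric HTTP
# status code, printing the status line because $verbose is true.
#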

#===  FUNCTION  ================================================================
#         NAME: expand_template
#      PURPOSE: Turns a filename template into a filename or a default
#   PARAMETERS: $option         The input from the -output or -script option
#               $prefix         Filename prefix
#               $suffix         Filename suffix
#               $lbound         Lower bound of the episode range
#               $ubound         Upper bound of the episode range
#      RETURNS: The expanded template or the default
#  DESCRIPTION: The -output or -script options both take an optional template,
#               so we need to expand it with episode numbers.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub expand_template {
    my ( $option, $prefix, $suffix, $lbound, $ubound ) = @_;

    if ( defined($option) ) {
        if ( $option =~ /^$/ ) {
            #
            # Build a default template depending on the episode range
            #
            if ( $lbound == $ubound ) {
                $option = $prefix . '_%04d.' . $suffix;
                $option = sprintf( $option, $lbound );
            }
            else {
                $option = $prefix . '_%04d-%04d.' . $suffix;
                $option = sprintf( $option, $lbound, $ubound );
            }
        }
        elsif ( $option =~ /%(\d*)d/ ) {
            #
            # Caller specified a template. We need to check it
            #
            my $count = () = $option =~ /%(\d*)d/g;
            die "Invalid - too many '%d' sequences in '$option'\n"
                if ( $count > ( $lbound == $ubound ? 1 : 2 ) );
            die "Invalid - too few '%d' sequences in '$option'\n"
                if ( $count < ( $lbound == $ubound ? 1 : 2 ) );
            $option =~ s/%(\d*)d/%04d/g;
            $option = sprintf( $option, $lbound, $ubound );
        }
    }
    else {
        #
        # The default
        #
        $option = "$prefix.$suffix";
    }

    return $option;
}
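
#
# Illustration (not part of the program): hypothetical calls and their
# results, following the rules described above:
#
#   expand_template( '', 'metadata', 'csv', 1500, 1510 );
#       # -> 'metadata_1500-1510.csv' (default template for a range)
#   expand_template( 'meta_%d-%d.csv', 'metadata', 'csv', 1500, 1510 );
#       # -> 'meta_1500-1510.csv' (each '%d' is widened to '%04d' first)
#   expand_template( undef, 'metadata', 'csv', 1500, 1510 );
#       # -> 'metadata.csv' (option not supplied at all)
#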

#===  FUNCTION  ================================================================
#         NAME: coalesce
#      PURPOSE: To find the first defined argument and return it
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument or undef if there are none
#  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
#               returned when doing so might be a problem.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    foreach (@_) {
        return $_ if defined($_);
    }
    return undef;    ## no critic
}
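
#
# Illustration (not part of the program): coalesce() returns the first
# *defined* value, so an empty string wins over undef:
#
#   coalesce( undef, '', 'fallback' );    # returns '' (defined but false)
#   coalesce( $h->{summary}, '' );        # never yields undef
#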

#===  FUNCTION  ================================================================
#         NAME: _debug
#      PURPOSE: Prints debug reports
#   PARAMETERS: $active         Boolean: 1 for print, 0 for no print
#               $message        Message to print
#      RETURNS: Nothing
#  DESCRIPTION: Outputs a message if $active is true. It removes any trailing
#               newline and then adds one in the 'print' so the caller doesn't
#               have to bother. Prepends the message with 'D> ' to show it's
#               a debug message.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub _debug {
    my ( $active, $message ) = @_;

    chomp($message);
    print "D> $message\n" if $active;
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Process the options we want to offer. See the documentation
#               for details
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",               "documentation|man",
        "debug=i",            "verbose!",
        "silent!",            "test!",
        "meta_only|noaudio!", "from=i",
        "to=i",               "count=i",
        "list=s",             "output:s",
        "fetch!",             "ignore_missing|im!",
        "assets!",            "script:s",
        "a_count=s",          "dbconfig=s",
        "config=s",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage(
            -msg     => "$PROG version $VERSION\n",
            -exitval => 1,
            -verbose => 0
        );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

make_metadata - Generate metadata from the HPR database for Archive.org

=head1 VERSION

This documentation refers to make_metadata version 0.4.14


=head1 USAGE

    make_metadata [-help] [-documentation]

    make_metadata -from=FROM [-to=TO] [-count=COUNT] [-output[=FILE]]
        [-script[=FILE]] [-a_count=FILE] [-[no]meta_only] [-[no]fetch]
        [-[no]assets] [-[no]silent] [-[no]verbose] [-[no]test]
        [-[no]ignore_missing] [-config=FILE] [-dbconfig=FILE] [-debug=N]

    make_metadata -list=LIST [-output[=FILE]] [-script[=FILE]]
        [-[no]meta_only] [-[no]fetch] [-[no]assets] [-[no]silent]
        [-[no]verbose] [-[no]test] [-[no]ignore_missing] [-config=FILE]
        [-dbconfig=FILE] [-debug=N]

    Examples:

    make_metadata -from=1234 -nofetch

    make_metadata -from=1234 -to=1235

    make_metadata -from=1234 -count=10

    make_metadata -from=1 -to=3 -output=metadata_1-3.csv

    make_metadata -from=1500 -to=1510 -out=metadata_1500-1510.csv -verbose

    make_metadata -from=1500 -to=1510 -out=metadata_%d-%d.csv -verbose

    make_metadata -from=500 -to=510 -out=metadata_%04d-%04d.csv -verbose

    make_metadata -from=1500 -to=1510 -out -verbose

    make_metadata -from=1500 -to=1510 -out

    make_metadata -from=1675 -to=1680 -out=metadata_%d-%d.csv -meta_only

    make_metadata -from=1450 -test

    make_metadata -list='1234,2134,2314' -out -meta_only

    make_metadata -list="931,932,933,935,938,939,940" -out -meta -ignore

    make_metadata -dbconf=.hpr_livedb.cfg -from=1234 -to=1235

    make_metadata -from=3004 -out -meta_only -noassets


=head1 OPTIONS

=over 8

=item B<-help>

Reports brief information about how to use the script and exits. To see the
full documentation use the option B<-documentation> or B<-man>. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.

=item B<-documentation> or B<-man>

Reports full information about how to use the script and exits. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.

=item B<-debug=N>

Run in debug mode at the level specified by I<N>. Possible values are:

=over 4

=item B<0>

No debugging (the default).

=item B<1>

TBA

=item B<2>

TBA

=item B<3>

TBA

=item B<4 and above>

The metadata hash is dumped.

Each call of the function I<find_links_in_notes> is reported. On finding an
<a> or <img> tag the I<uri> value is shown, as is any fragment and the related
link. The original file is reported here.

Each call of the function I<find_links_in_file> is reported. On finding an
<a> or <img> tag the I<uri> value is shown, as is any fragment and the related
link. The original file is reported here, and if a link is to be ignored this
is reported.

=back

=item B<-from=NUMBER>

This option defines the starting episode number of a group. It is mandatory to
provide either the B<-from=NUMBER> option or the B<-list=LIST> option (see
below).

=item B<-to=NUMBER>

This option specifies the final episode number of a group. If not given the
script generates metadata for the single episode indicated by B<-from>.

The value given here must be greater than or equal to that given in the
B<-from> option. The option must not be present with the B<-count> option.

The difference between the episode numbers given by the B<-from> and B<-to>
options must not be greater than 20.

=item B<-count=NUMBER>

This option specifies the number of episodes to process (starting from the
episode number specified by the B<-from> option). The option must not be
present with the B<-to> option.

The number of episodes specified must not be greater than 20.

=item B<-list=LIST>

This option is an alternative to B<-from=NUMBER> and its associated modifying
options. The B<LIST> is a comma-separated list of not necessarily consecutive
episode numbers, and must consist of at least one and no more than 20 numbers.

This option is useful for the case when non-sequential episode numbers are to
be uploaded, and is particularly useful when repairing elements of particular
episodes (such as adding summary fields and tags) where they have already
been uploaded.

For example, the following shows have no summary and/or tags, but the shows
are already in the IA. The missing items have been provided, so we wish to
update the HTML part of the upload:

    $ ./make_metadata -list='2022,2027,2028,2029,2030,2033' -out -meta
    Output file: metadata_2022-2033.csv

=item B<-output[=FILE]>

This option specifies the file to receive the generated CSV data. If omitted
the output is written to B<metadata.csv> in the current directory.

The file name may contain one or two instances of the characters '%d', with
a leading width specification if desired (such as '%04d'). These will be
substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if
B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the
appropriate endpoint (adding the count to the starting number). If neither of
the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only
be one instance of '%d' or the script will abort.

If no value is provided to B<-output> then a suitable template will be
generated. It will be 'metadata_%04d.csv' if one episode is being processed, and
'metadata_%04d-%04d.csv' if a range has been specified.

Example:

 ./make_metadata -from=1430 -out=metadata_%04d.csv

the output file name will be B<metadata_1430.csv>. The same effect can be
achieved with:

 ./make_metadata -from=1430 -out=

or

 ./make_metadata -from=1430 -out

=item B<-script[=FILE]>

This option specifies the file to receive commands required to upload certain
files relating to a show. If omitted the commands are written to B<script.sh>
in the current directory.

The file name may contain one or two instances of the characters '%d', with
a leading width specification if desired (such as '%04d'). These will be
substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if
B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the
appropriate endpoint (adding the count to the starting number). If neither of
the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only
be one instance of '%d' or the script will abort.

If no value is provided to B<-script> then a suitable template will be
generated. It will be 'script_%04d.sh' if one episode is being processed, and
'script_%04d-%04d.sh' if a range has been specified.

Example:

 ./make_metadata -from=1430 -script=script_%04d.sh

the output file name will be B<script_1430.sh>. The same effect can be
achieved with:

 ./make_metadata -from=1430 -script=

or

 ./make_metadata -from=1430 -script

=item B<-a_count=FILE>

Defines a file into which the script writes details of assets downloaded
during the analysis of notes (and other HTML files associated with a show).
The listing consists of the show identifier (e.g. 'hpr3901') followed by the
number of links followed to collect the files for this show.

This feature was added to allow other scripts to perform tasks with these
assets, but is now deprecated.

The feature will probably be removed in a later release of this script.
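
A line of this file might look something like this (hypothetical show and
count; the exact layout is determined by the script):

    hpr3901 4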
| 
 | |
| =item B<-[no]fetch>
 | |
| 
 | |
| This option controls whether the script attempts to fetch the MP3 audio file
 | |
| from the HPR website should there be no WAV file in the upload area. The
 | |
| default setting is B<-fetch>.
 | |
| 
 | |
| Normally the script is run as part of the workflow to upload the metadata and
 | |
| audio to archive.org. The audio is expected to be a WAV file and to be in the
 | |
| location referenced in the configuration file under the 'uploads' label.
 | |
| However, not all of the WAV files exist for older shows.
 | |
| 
 | |
| When the WAV file is missing and B<-fetch> is selected or defaulted, the
 | |
| script will attempt to download the MP3 version of the audio and will store it
 | |
| in the 'uploads' area for the upload script (B<ias3upload.pl> or B<ia>) to
 | |
| send to archive.org. If the MP3 file is not found then the script will abort.
 | |
| 
 | |
| If B<-fetch> is specified (or defaulted) as well as B<-nometa_only> (see
 | |
| below) then the audio file fetching process will not be carried out. This is
 | |
| because it makes no sense to fetch this file if it's not going to be
 | |
| referenced in the metadata.
 | |
| 
 | |
| =item B<-[no]assets>
 | |
| 
 | |
| This option controls the downloading of any assets that may be associated with
 | |
| a show. Assets are the files held on the HPR server which are referenced by
 | |
| the show. Examples might be photographs, scripts, and supplementary notes.
 | |
| Normally all such assets are collected and stored in the upload area and are
 | |
| then sent to the archive via the script. The notes sent to the archive are
 | |
| adjusted to refer to these notes on archive.org, making the HPR episode
 | |
| completely self-contained.
 | |
| 
 | |
| =item B<-[no]meta_only> (alias B<-[no]noaudio>)
 | |
| 
 | |
| This option controls whether the output file will contain a reference to the
 | |
| audio file(s) or only the metadata. The default is B<-nometa_only> meaning that
 | |
| the file reference(s) and the metadata are present.
 | |
| 
 | |
| Omitting the file(s) allows the metadata to be regenerated, perhaps due to
 | |
| edits and corrections in the database, and the changes to be propagated to
 | |
| archive.org. If the file reference(s) exist(s) in the metadata file then the
 | |
| file(s) must be available at the time the uploader is run.
 | |
| 
 | |
| Note that making changes this way is highly preferable to editing the entry on
 | |
| archive.org using the web-based editor. This is because there is a problem
 | |
| with the way HTML entities are treated and this can cause the HTML to be
 | |
| corrupted.
 | |
| 
 | |
| =item B<-[no]silent>
 | |
| 
 | |
| The option enables (B<-silent>) and disables (B<-nosilent>) I<silent mode>.
 | |
| When enabled the script reports nothing on STDOUT. If the script cannot find
 | |
| the audio files and downloads the MP3 version from the HPR site for upload to
 | |
| archive.org then the downloads are reported on STDERR. This cannot be
 | |
| disabled, though the STDERR output could be redirected to a file or to
 | |
| /dev/null.
 | |
| 
 | |
| If B<-silent> is specified with B<-verbose> then the latter "wins".
 | |
| 
 | |
| The script runs with silent mode disabled by default. When B<-nosilent> is
 | |
| used with B<-noverbose> the script reports the output file name and nothing
 | |
| else.
 | |
| 
 | |
| =item B<-[no]verbose>
 | |
| 
 | |
| This option enables (B<-verbose>) and disables (B<-noverbose>)
 | |
| I<verbose mode>. When enabled the script reports the metadata it has collected
 | |
| from the database before writing it to the output file. The data is reported
 | |
| in a more readable mode than examining the CSV file, although another script
 | |
| B<show_metadata> is also available to help with this.
 | |
| 
 | |
| If B<-verbose> is specified with B<-silent> then the former "wins".
 | |
| 
 | |
| The script runs with verbose mode disabled by default.
 | |
| 
 | |
| =item B<-[no]ignore_missing>
 | |
| 
 | |
| The script checks each episode to ensure it has a summary and tags. If either
 | |
| of these fields is missing then a warning message is printed for that episode
 | |
| (unless B<-silent> has been chosen), and if any episodes are lacking this
 | |
| information the script aborts without producing metadata. If the option
 | |
| B<-ignore_missing> is selected then the warnings are produced (dependent on
 | |
| B<-silent>) but the script runs to completion.
 | |
| 
 | |
| The default setting is B<-noignore_missing>; the script checks and aborts if
 | |
| any summaries or tags are missing.
 | |
| 
 | |
| =item B<-[no]test>
 | |
| 
 | |
| DO NOT USE!
 | |
| 
 | |
| This option enables (B<-test>) and disables (B<-notest>)
 | |
| I<test mode>. When enabled the script generates metadata containing various
 | |
| test values.
 | |
| 
 | |
| In test mode the following changes are made:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item .
 | |
| 
 | |
| The item names, which normally contain 'hprnnnn', built from the episode
 | |
| number, have 'test_' prepended to them.
 | |
| 
 | |
| =item .
 | |
| 
 | |
| The collection, which is normally a list containing 'hackerpublicradio' and
 | |
| 'podcasts', is changed to 'test_collection'. Items in this collection are
 | |
| normally deleted by Archive.org after 30 days.
 | |
| 
 | |
| =item .
 | |
| 
 | |
| The contributor, which is normally 'HackerPublicRadio' is changed to
 | |
| 'perlist'.
 | |
| 
 | |
| =back
 | |
| 
 | |
| B<NOTE> The test mode only works for the author!
 | |
| 
 | |
| =item B<-config=FILE>
 | |
| 
 | |
| This option allows an alternative script configuration file to be used. This
 | |
| file defines various settings relating to the running of the script - things
 | |
| like the place to look for the files to be uploaded. It is rare to need to use
 | |
| any other file than the default since these are specific to the environmewnt
 | |
| in which the script runs. However, this has been added at the same time as an
 | |
| alternative database configuration option was added.
 | |
| 
 | |
| See the CONFIGURATION AND ENVIRONMENT section below for the file format.
 | |
| 
 | |
| If the option is omitted the default file is used: B<.make_metadata.cfg>
 | |
| 
 | |
| =item B<-dbconfig=FILE>
 | |
| 
 | |
| This option allows an alternative database configuration file to be used. This
 | |
| file defines the location of the database, its port, its name and the username
 | |
| and password to be used to access it. This feature was added to allow the
 | |
| script to access alternative databases or the live database over an SSH
 | |
| tunnel.
 | |
| 
 | |
| See the CONFIGURATION AND ENVIRONMENT section below for the file format.
 | |
| 
 | |
| If the option is omitted the default file is used: B<.hpr_db.cfg>
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| This script generates metadata suitable for uploading Hacker Public Radio
 | |
| episodes to the Internet Archive (archive.org).
 | |
| 
 | |
| The metadata is in comma-separated variable (CSV) format suitable for
 | |
| processing with an upload script. The original upload script was called
 | |
| B<ias3upload.pl>, and could be obtained from
 | |
| I<https://github.com/kngenie/ias3upload>. This script is no longer supported
 | |
| and B<make_metadata> no longer generates output suitable for it (though it is
 | |
| simple to make it compatible if necessary). The replacement script is called
 | |
| B<internetarchive> which is a Python tool which can also be run from the
 | |
| command line. It can be found at I<https://github.com/jjjake/internetarchive>.
 | |
| 
 | |
The B<make_metadata> script generates CSV from the HPR database. It looks up
details for each episode selected by the options, and performs various
conversions and concatenations. The goal is to prepare items for the Internet
Archive with as much detail as the format can support.

The resulting CSV file contains a header line listing the field names required
by archive.org followed by as many CSV lines of episode data as requested (up
to a limit of 20).

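As a sketch of the general shape only (these field names are illustrative,
not a definitive list of what archive.org requires):

 identifier,file,mediatype,collection,title,...
 hpr1234,hpr1234.wav,audio,hackerpublicradio,...
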
Since the upload method uses the HTTP protocol with fields stored in headers,
there are restrictions on the way HTML can be formatted in the B<Details>
field. The script converts newlines, which are not allowed, into I<< <br/> >>
tags where necessary.

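A minimal sketch of that conversion in Perl (the actual substitution in the
script may differ in detail, and the variable name is illustrative):

 # Replace literal newlines with HTML line breaks
 $details =~ s{\r?\n}{<br/>}g;
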
HPR shows often have associated files, such as pictures, examples, long-form
notes and so forth. The script finds these and downloads them to the cache
area where the audio is kept and writes the necessary lines to the CSV file to
ensure they are uploaded with the show. It modifies any HTML which links to
these files to link to the archive.org copies in order to make the complete
show self-contained.

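A simplified sketch of that link rewriting, using the B<HTML::TreeBuilder>
module listed under DEPENDENCIES (the variable names and the URL pattern are
illustrative, not the script's actual code):

 use HTML::TreeBuilder;

 my $tree = HTML::TreeBuilder->new_from_content($notes);
 for my $anchor ( $tree->look_down( _tag => 'a' ) ) {
     my $href = $anchor->attr('href') // next;
     # Re-point links to HPR-hosted files at the archive.org copies
     if ( $href =~ m{^\Q$baseURL\E/(\S+)$} ) {
         $anchor->attr( 'href', sprintf( $IAURLtemplate, $item, $1 ) );
     }
 }
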
=head1 DIAGNOSTICS

=over 8

=item B<Configuration file ... not found>

One or more of the configuration files has not been found.

=item B<Path ... not found>

The path specified in the B<uploads> definition in the configuration file
B<.make_metadata.cfg> does not exist. Check the configuration file.

=item B<Configuration data missing>

While checking the configuration file(s) the script has detected that settings
are missing. Check the details specified below and provide the missing
elements.

=item B<Mis-match between @fields and %dispatch!>

An internal error in the script has been detected where the elements of the
@fields array do not match the keys of the %dispatch hash. This is probably the
result of a failed attempt to edit either of these components.

Correct the error and run the script again.

=item B<Invalid list; no elements>

There are no list elements in the B<-list=LIST> option.

=item B<Invalid list; too many elements>

There are more than the allowed 20 elements in the list specified by the
B<-list=LIST> option.

=item B<Failed to parse -list=...>

A list was specified that did not contain a CSV list of numbers.

=item B<Invalid starting episode number (...)>

The value used in the B<-from> option must be greater than 0.

=item B<Do not combine -to and -count>

Using both the B<-to> and B<-count> options is not permitted (and makes no
sense).

=item B<Invalid range; ... is greater than ...>

The B<-from> episode number must be less than or equal to the B<-to> number.

=item B<Invalid range; range is too big (E<gt>20)>

The difference between the starting and ending episode number is greater than
20.

=item B<Invalid - too many '%d' sequences in '...'>

There were more than two '%d' sequences in the name of the output file when
a range of episodes was being processed, or more than one when a single
episode was specified.

=item B<Invalid - too few '%d' sequences in '...'>

There were fewer than two '%d' sequences in the name of the output file
when a range of episodes was being processed.

| 
 | |
| =item B<Unable to open ... for output: ...>
 | |
| 
 | |
| The script was unable to open the requested output file.
 | |
| 
 | |
| =item B<Unable to find or download ...>
 | |
| 
 | |
| The script has not found a I<.WAV> file in the cache area so has attempted to
 | |
| download the I<MP3> copy of the audio from the HPR website. This process has
 | |
| failed.
 | |
| 
 | |
| =item B<Failed to find requested episode>
 | |
| 
 | |
| An episode number could not be found in the database. This error is not fatal.
 | |
| 
 | |
| =item B<Nothing to do>
 | |
| 
 | |
| After processing the range of episodes specified the script could not find
 | |
| anything to do. This is most often caused by all of the episodes in the range
 | |
| being invalid.
 | |
| 
 | |
| =item B<Aborted due to missing summaries and/or tags>
 | |
| 
 | |
| One or more of the shows being processed does not have a summary or tags. The
 | |
| script has been told not to ignore this so has aborted before generating
 | |
| metadata.
 | |
| 
 | |
| =item B<HTML::TreeBuilder failed to parse notes: ...>
 | |
| 
 | |
| The script failed to parse the HTML in the notes of one of the episodes. This
 | |
| indicates a serious problem with these notes and is fatal since these notes
 | |
| need to be corrected before the episode is uploaded to the Internet Archive.
 | |
| 
 | |
| =item B<HTML::TreeBuilder failed to process ...: ...>
 | |
| 
 | |
| While parsing the HTML in a related file the parse has failed. The file being
 | |
| parsed is reported as well as the error that was encountered. This is likely
 | |
| due to bad HTML.
 | |
| 
 | |
| =item B<Unable to open ... for writing: ...>
 | |
| 
 | |
| The script is attempting to open an HTML file which it has downloaded to
 | |
| write back edited HTML, yet the open has failed. The filename is in the error
 | |
| message as is the cause of the error.
 | |
| 
 | |
| =back
 | |
| 
 | |
=head1 CONFIGURATION AND ENVIRONMENT

This script reads two configuration files in B<Config::General> format
(similar to Apache configuration files) for the path to the files to be
uploaded and for credentials to access the HPR database. Two files are used
because the database configuration file is used by several other scripts.

=head2 SCRIPT CONFIGURATION

The general configuration file is B<.make_metadata.cfg> (although this can be
overridden through the B<-config=FILE> option) and contains the following
lines:

 uploads = "<path to files>"
 filetemplate = "hpr%04d.%s"
 baseURL = "http://hackerpublicradio.org"
 sourceURLtemplate = "https://hackerpublicradio.org/eps/%s/index.html"
 IAURLtemplate = "http://archive.org/download/%s/%s"

The I<uploads> line defines where the WAV files are to be found (currently
I</var/IA/uploads> on the VPS). The same area is used to store downloaded MP3
files and any supplementary files associated with the episode.

The I<filetemplate> line defines the format of an audio file name such as
I<hpr1234.wav>. This should not be changed.

The I<baseURL> line defines the common base for download URLs. It is used when
parsing and standardising URLs relating to files on the HPR server.

The I<sourceURLtemplate> line defines the format of the URL required to access
the show on the HPR site. This should not be changed except in the unlikely
event that these URLs change.

The I<IAURLtemplate> line defines the format of URLs on archive.org, which are
used when generating new links in HTML notes or supplementary files.

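A minimal sketch of how these settings might be read and used, via the
B<Config::General> module listed under DEPENDENCIES (the variable names are
illustrative):

 use Config::General;

 my $conf   = Config::General->new('.make_metadata.cfg');
 my %config = $conf->getall;

 # Build an audio file name such as 'hpr1234.wav' from the template
 my $file = sprintf( $config{filetemplate}, 1234, 'wav' );
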
=head2 DATABASE CONFIGURATION

The database configuration file is B<.hpr_db.cfg> (although this can be
overridden through the B<-dbconfig=FILE> option).

The layout of the file should be as follows:

 <database>
     host = 127.0.0.1
     port = PORT
     name = DATABASE
     user = USERNAME
     password = PASSWORD
 </database>

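A minimal sketch of how these settings might be used to connect, assuming a
MySQL-compatible database and the B<DBI> module listed under DEPENDENCIES
(the driver name is an assumption; the script's actual connection code may
differ):

 use DBI;

 my $db  = $config{database};    # the <database> block parsed as above
 my $dbh = DBI->connect(
     "dbi:mysql:database=$db->{name};host=$db->{host};port=$db->{port}",
     $db->{user}, $db->{password},
     { RaiseError => 1 }
 );
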
=head1 DEPENDENCIES

    Carp
    Config::General
    DBI
    Data::Dumper
    File::Find::Rule
    File::Path
    Getopt::Long
    HTML::Entities
    HTML::TreeBuilder
    HTTP::Status
    IO::HTML
    LWP::UserAgent
    List::MoreUtils
    List::Util
    Pod::Usage
    Text::CSV_XS

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module. Please report problems to Dave
Morriss (Dave.Morriss@gmail.com). Patches are welcome.

=head1 AUTHOR

Dave Morriss  (Dave.Morriss@gmail.com)

=head1 LICENCE AND COPYRIGHT

Copyright (c) 2014-2019 Dave Morriss (Dave.Morriss@gmail.com).
All rights reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut

#}}}

# [zo to open fold, zc to close]

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker