#!/usr/bin/env perl
#===============================================================================
#
#         FILE: make_metadata
#
#        USAGE: ./make_metadata -from=FROM [-to=TO] [-count=COUNT]
#               [-output=FILE] [-[no]fetch] [-[no]meta_only] [-[no]verbose]
#               [-[no]silent] [-[no]test] [-help] [-{documentation|man}]
#
#  DESCRIPTION: Make metadata for the uploading of a range of HPR episodes to
#               the Internet Archive by collecting fields from the database.
#
#      OPTIONS: See the POD documentation for full details (./make_metadata
#               -man) or use 'pod2pdf' to generate a PDF version in the
#               directory holding this script.
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: 2023-07-06: A version in development (0.4.14) has been put
#               aside and this version (0.4.12) made into the main line
#               version, because 0.4.14 was developing in a direction that
#               doesn't fit with the changes made to the HPR system in
#               June/July 2023. Will now move forward with version numbers.
#               2024-01-23: Added the 'open' pragma for UTF-8
#               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.4.14
#      CREATED: 2014-06-13 12:51:04
#     REVISION: 2024-01-23 16:28:59
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use open ':encoding(UTF-8)';
#use utf8;

use Carp;
use Getopt::Long;
use Pod::Usage;

use List::Util qw{max};
use List::MoreUtils qw{uniq apply};
use Config::General;
use Text::CSV_XS;

use IO::HTML;
use HTML::TreeBuilder 5 -weak;
use HTML::Entities;

use File::Find::Rule;
use File::Path qw{make_path};
#use LWP::Simple;
use LWP::UserAgent;
use HTTP::Status qw(status_message);
use URI;

use DBI;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.4.14';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;

#-------------------------------------------------------------------------------
# Constants and defaults
#-------------------------------------------------------------------------------
my $basedir      = "$ENV{HOME}/HPR/IA";
my $configfile   = "$basedir/.$PROG.cfg";
my $dbconfigfile = "$basedir/.hpr_db.cfg";

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants used in the metadata
#
my $mediatype   = 'audio';
my $collection  = 'hackerpublicradio';
my $language    = 'eng';
my $contributor = 'HackerPublicRadio';

#
# Variables, arrays and hashes
#
my ( $dbh, $sth1, $h1 );
my ( $DEBUG, $verbose, $silent, $test, $from, $to, $count );
my ( $list, $meta_only, $outfile, $scriptfile, $fetch, $assets );
my ( $acountfile, $ignore );
my ( @range, $tree, %links, %meta, @source, @transcripts, @counts, @head, @data );
my ( $ep_name, $ft, $file, $URL, $status, $incomplete );
my ( $filepath, $max_epno, $filetemplate, $sourceURLtemplate );
my ( $iauploadtemplate, $iauploadoptions );

#
# File types we'll look for when working out (and collecting) the main audio
# file, and their order
#
my @filetypes = (qw{ wav mp3 ogg });

#
# Additional filetypes we'll look for if we're working round the lack of tags
# on the derived audio files. See the discussion at
# https://gitlab.anhonesthost.com/HPR/HPR_Public_Code/issues/34
#
my @additional_audio = (qw{ flac mp3 ogg opus spx });

#
# Templates for finding the transcripts.
#
# We expect these to be in $uploads/$episode/ where we expect to find:
#   $episode.json
#   $episode.srt
#   $episode.tsv
#   $episode.txt
#   $episode.vtt
# The regular expression below should find the transcripts given that the
# search is directed to the sub-directory in $uploads.
#
my $transcript_re = qr{hpr\d{4}\.(json|srt|tsv|txt|vtt)};

#
# Names of CSV fields and their order
#
my @fields = (
    qw{
        identifier file mediatype collection title creator language
        description contributor date subject licenseurl
    }
);

#
# Maximum episodes per run
#
my $max_episodes = 20;

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A dispatch table to define how to build metadata. The keys are actually
# header names to be used in the metadata file and must relate to the @fields
# array above. Two types of values are expected here: 1) anonymous subroutine
# references which return the value for the field, or 2) scalars (or strings)
# to be placed as-is. Some of the anonymous subroutines are written as
# closures (see
# http://en.wikipedia.org/wiki/Closure_%28computer_programming%29), which make
# it easier to build the calling code since we can pass in a variety of
# arguments internally while offering a simple no-argument call externally.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my %dispatch = (
    identifier  => sub { return make_item( $h1, $test ) },
    file        => sub {
        return (
            $meta_only
            ? ''
            : make_filename( $filepath, $filetemplate, \@filetypes, $h1 )
        );
    },
    mediatype   => $mediatype,
    collection  => $collection,
    #title       => sub { return decode_entities( db_title($h1) ) },
    title       => sub { return db_title($h1) },
    creator     => sub { return $h1->{host} },
    language    => $language,
    description => sub { return db_notes( $sourceURLtemplate, $h1, $tree, \@source ) },
    contributor => $contributor,
    date        => sub { return $h1->{date} },
    subject     => sub { return db_tags($h1) },
    licenseurl  => sub { return $h1->{license_url} }
);

#
# Validate the data structures above to ensure there have been no mis-edits
#
foreach (@fields) {
    die "Mis-match between \@fields and \%dispatch!\n"
        unless ( defined( $dispatch{$_} ) );
}
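
#
# For illustration: each dispatch entry is either a plain scalar or a closure
# taking no arguments (the closures capture $h1 and friends from the
# enclosing scope). The main loop below uses the entries like this
# (hypothetical value shown in the comment):
#
#   my $value = ref( $dispatch{title} ) eq 'CODE'
#       ? $dispatch{title}->()    # e.g. 'hpr1234 :: Some show title'
#       : $dispatch{title};       # constant fields are used as-is
#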

#
# The main query on the database. We want episode details, the host's name,
# the series name and the Creative Commons licence the episode is under.
# Note that the 'LEFT JOIN' for 'miniseries' is for the case where the
# 'series' column of 'eps' is 'NULL'. This is not the case for the live
# database - yet!
#
my $sql = q{
    SELECT
        e.id AS eps_id,
        e.date,
        e.title,
        sec_to_time(e.duration) AS duration,
        e.summary,
        e.notes,
        e.explicit,
        e.license,
        l.url as license_url,
        e.tags,
        e.valid,
        h.host,
        s.name AS s_name,
        s.description AS s_desc
    FROM eps e
    JOIN hosts h ON e.hostid = h.hostid
    LEFT JOIN miniseries s ON e.series = s.id
    JOIN licenses l ON e.license = l.short_name
    WHERE e.id = ?
};

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

my %options;
Options( \%options );

#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );

#
# Collect options
#
$DEBUG     = ( defined( $options{debug} )   ? $options{debug}   : $DEFDEBUG );
$verbose   = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
$silent    = ( defined( $options{silent} )  ? $options{silent}  : 0 );
$test      = ( defined( $options{test} )    ? $options{test}    : 0 );
$from      = $options{from};
$to        = $options{to};
$count     = $options{count};
$list      = $options{list};
$meta_only = ( defined( $options{meta_only} ) ? $options{meta_only} : 0 );
$fetch     = ( defined( $options{fetch} )     ? $options{fetch}     : 1 );
$assets    = ( defined( $options{assets} )    ? $options{assets}    : 1 );
$ignore
    = ( defined( $options{ignore_missing} ) ? $options{ignore_missing} : 0 );
$outfile    = $options{output};
$scriptfile = $options{script};
$acountfile = $options{a_count};

#
# The two config files can be overridden or default to the original
# declarations
#
my $cfgfile
    = ( defined( $options{config} ) ? $options{config} : $configfile );
my $dbcfgfile
    = ( defined( $options{dbconfig} ) ? $options{dbconfig} : $dbconfigfile );

#-------------------------------------------------------------------------------
# Load and validate program configuration data
#-------------------------------------------------------------------------------
die "Configuration file $cfgfile not found\n" unless ( -e $cfgfile );

my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %config = $conf->getall();

#
# Check the config settings
#
$filepath = $config{uploads};

die "Path $filepath not found\n" unless ( -e $filepath );

$max_epno          = $config{max_epno};
$filetemplate      = $config{filetemplate};
$sourceURLtemplate = $config{sourceURLtemplate};
$iauploadtemplate  = $config{iauploadtemplate} . "\n";
$iauploadoptions   = $config{iauploadoptions};

die "Configuration data missing\n"
    unless $max_epno
    && $filetemplate
    && $sourceURLtemplate
    && $iauploadtemplate
    && $iauploadoptions;
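
#
# Illustrative shape of the configuration file (hypothetical values; the real
# settings live in $basedir/.make_metadata.cfg and are interpolated by
# Config::General). The sprintf/printf templates must take the same number of
# arguments as the calls in this script supply:
#
#   uploads           = /data/IA/uploads
#   max_epno          = 9999
#   filetemplate      = hpr%04d.%s
#   sourceURLtemplate = https://hackerpublicradio.org/eps/%s/index.html
#   IAURLtemplate     = https://archive.org/download/%s/%s
#   iauploadtemplate  = Upload '%s' '%s' '%s' '%s'
#   iauploadoptions   = --no-derive
#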

#-------------------------------------------------------------------------------
# Load database configuration data
#-------------------------------------------------------------------------------
die "Configuration file $dbcfgfile not found\n" unless ( -e $dbcfgfile );

my $dbconf = Config::General->new(
    -ConfigFile      => $dbcfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %dbcfg = $dbconf->getall();

#-------------------------------------------------------------------------------
# Validate options
#-------------------------------------------------------------------------------
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( ( !defined($from) and !defined($list) )
    or ( defined($from) and defined($list) ) );

#
# Deal with the values specified by one of:
#   -from=N1 -to=N2
#   -from=N3 -count=N4
#   -list="N1,N2,N3"
#
# End by populating @range with a list of episode numbers
#
if ( defined($list) ) {
    #
    # We have a list which we'll parse, trim, validate, sort, make unique and
    # filter
    #
    my $lcsv = Text::CSV_XS->new( { binary => 1, } );
    if ( $lcsv->parse($list) ) {
        @range = apply { $_ =~ s/(^\s+|\s+$)//g } $lcsv->fields();
        @range = grep {/^\d+$/} @range;
        @range = uniq( sort { $a <=> $b } @range );
        @range = grep { $_ > 0 && $_ <= $max_epno } @range;

        die "Invalid list; no elements\n" if scalar(@range) == 0;
        die "Invalid list; too many elements\n"
            if scalar(@range) > $max_episodes;
    }
    else {
        die "Failed to parse -list='$list'\n" . $lcsv->error_diag() . "\n";
    }
}
else {
    #
    # We have -from=N, -to=M or -count=X
    #
    die "Invalid starting episode number ($from)\n" unless $from > 0;
    die "Do not combine -to and -count\n"
        if ( defined($to) && defined($count) );

    if ( defined($count) ) {
        $to = $from + $count - 1;
    }
    elsif ( !defined($to) ) {
        $to = $from;
    }

    die "Invalid range; $from is greater than $to\n" unless $from <= $to;
    die "Invalid range; range is too big (>$max_episodes): "
        . ( $to - $from + 1 )
        . " episodes\n"
        if ( $to - $from + 1 ) > $max_episodes;

    @range = ( $from .. $to );
}

#
# Check we got a sensible range
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    unless scalar(@range) > 0;

#
# You can't be silent and verbose at the same time
#
$silent = 0 if $verbose;

#
# Fetching audio files is not relevant in 'metadata only' mode
# TODO: -nofetch with -meta_only incorrectly results in the download of files;
# see the Journal in regard to show 3004 in April 2020.
# TODO: 2023-07-06 Time to drop '-[no]fetch' completely?
#
$fetch = 0 if $meta_only;

#
# If there's a defined output filename then there are three options: a null
# string, a plain filename, or a substitution string with '%d' sequences. The
# null string means the user used '-output' without a value, so we want to
# generate a substitution string. A string with '%d' requires a check to
# ensure there's the right number, one for $ubound and one for $lbound. The
# plain filename needs no more work.
#
# If there's no defined output filename we'll provide a default.
#
$outfile = expand_template( $outfile, 'metadata', 'csv',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );

#
# Check the -script option. Like -output the choices are: a null string,
# a plain filename or a substitution string with '%d' sequences.
#
$scriptfile = expand_template( $scriptfile, 'script', 'sh',
    ( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );
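
#
# For example (illustrative names; 'expand_template' is defined later in the
# script): a bare '-output' might expand to something like
# 'metadata_1234-1238.csv' for '-from=1234 -count=5', whereas
# '-output=meta_%04d-%04d.csv' would become 'meta_1234-1238.csv', and
# '-output=shows.csv' would be used unchanged.
#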

#-------------------------------------------------------------------------------
# In verbose mode report the options
#-------------------------------------------------------------------------------
if ($verbose) {
    my $fmt = "%18s: %s\n";

    printf $fmt, 'From',  $from  if defined( $options{from} );
    printf $fmt, 'To',    $to    if defined( $options{to} );
    printf $fmt, 'Count', $count if defined( $options{count} );
    printf $fmt, 'List',  $list  if defined( $options{list} );
    printf $fmt, 'Output', $outfile;
    printf $fmt, 'Output script', $scriptfile;
    printf $fmt, 'Output asset count', ( $acountfile ? $acountfile : '--' );
    printf $fmt, 'Meta only', ( $meta_only ? 'Yes' : 'No' );
    printf $fmt, 'Fetch',     ( $fetch     ? 'Yes' : 'No' );
    printf $fmt, 'Silent',    ( $silent    ? 'Yes' : 'No' );
    printf $fmt, 'Verbose',   ( $verbose   ? 'Yes' : 'No' );
    printf $fmt, 'Debug', $DEBUG;
    printf $fmt, 'Script config',   $cfgfile;
    printf $fmt, 'Database config', $dbcfgfile;

    print '-' x 80, "\n";
}

#-------------------------------------------------------------------------------
# Set up output and script file
#-------------------------------------------------------------------------------
open( my $out, '>:encoding(UTF-8)', $outfile )
    or die "Unable to open $outfile for output: $!\n";

#
# Open the script file and add a header. Set execute permission on the
# file for the owner, and nobody else. We keep a count of lines (other than
# the header stuff) written to this file.
#
my $script_lines = 0;
open( my $script, '>:encoding(UTF-8)', $scriptfile )
    or die "Unable to open $scriptfile for output: $!\n";
print $script "#!/usr/bin/env bash\n\n";
print $script <<'EndHead';
Upload () {
    local id=${1}
    local file=${2}
    local remote=${3:-}
    local options=${4:-}

    if [[ -e $file ]]; then
        if [[ -z $remote ]]; then
            ia upload ${id} ${file} ${options}
        else
            ia upload ${id} ${file} --remote-name=${remote} ${options}
        fi
    else
        echo "File missing: $file"
    fi
}

EndHead

chmod( 0744, $scriptfile );
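
#
# Later in the run, lines appended to this script are calls to the 'Upload'
# function above, generated with the printf template $iauploadtemplate. An
# illustrative call (hypothetical paths and options):
#
#   Upload 'hpr1234' '/data/IA/uploads/hpr1234/hpr1234.txt' 'hpr1234/hpr1234.txt' '--no-derive'
#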

#-------------------------------------------------------------------------------
# Make changes for test mode
#-------------------------------------------------------------------------------
# NOTE: Commented out 2023-07-06 since this is dangerous
#if ($test) {
#    $dispatch{collection}  = 'test_collection';
#    $dispatch{contributor} = 'perlist';
#}

#
# Prepare to turn the data into CSV data. Since we're allowing embedded
# newlines (binary mode) add an explicit end of line.
#
# TODO: The new tool 'ia' does not accept newlines in the CSV, so no point in
# defining them here. They have been removed elsewhere in the code.
#
my $csv = Text::CSV_XS->new( { binary => 1, eol => "\r\n" } );

#-------------------------------------------------------------------------------
# Copy audio files from the website if requested and if needed. We look for
# the WAV file in the 'uploads' area but if it's not found we get the MP3
# (which is all we have for older shows).
#
# TODO: Given we're redirecting requests for audio files on the HPR site to
# archive.org there's potential for a nasty paradox here if the rewrite rules
# aren't perfect. We do the MP3 thing only for old shows, so we need to make
# sure that it works when we need it.
# TODO: Are there any logic flaws here? This was designed before we started
# creating all the audio formats we have now (partly to avoid the 'derive'
# process on archive.org from making copies without tags).
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: Commented out 2023-07-06 because (a) this situation should not occur
# again now that all audio versions are on the IA and we get everything for
# new shows, (b) the URLs used for downloads are obsolete.
#-------------------------------------------------------------------------------
#if ( $fetch && !$meta_only ) {
#
#    foreach my $episode (@range) {
#        $ft = 0;
#
#        #
#        # Does the WAV version already exist?
#        #
#        $file = sprintf( $filetemplate, $episode, $filetypes[$ft] );
#        if ( ! -e "$filepath/$file" ) {
#            #
#            # Doesn't exist, try the alternative version
#            #
#            $ft++;
#            $file = sprintf( $filetemplate, $episode, $filetypes[$ft] );
#            if ( !-e "$filepath/$file" ) {
#                #
#                # We need to download the file
#                #
#                $URL = sprintf( $URLtemplate, $file );
#                print STDERR "Downloading $URL\n" unless $silent;
#                $status = getstore( $URL, "$filepath/$file" );
#                $status == 200 or warn "$status : $URL ($file)\n";
#            }
#        }
#
#        die "Unable to find or download $file\n"
#            unless ( -e "$filepath/$file" );
#    }
#
#}

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $dbcfg{database}->{host} // '127.0.0.1';
my $dbport = $dbcfg{database}->{port} // 3306;
my $dbname = $dbcfg{database}->{name};
my $dbuser = $dbcfg{database}->{user};
my $dbpwd  = $dbcfg{database}->{password};
$dbh = DBI->connect( "dbi:mysql:database=$dbname;host=$dbhost;port=$dbport",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or croak $DBI::errstr;

#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8}    = 1;
$dbh->{mysql_enable_utf8mb4} = 1;

#-------------------------------------------------------------------------------
# Collect the necessary information for constructing the metadata for each of
# the selected episodes.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Each key of the %meta hash will contain an arrayref with an element per
# episode in the @range array. Some of these elements will be objects
# (arrayrefs or hashrefs) containing multi-part items such as 'subject'
# strings from the 'tags' field in the database. The %meta keys correspond to
# the strings of the @fields array which correspond to IA metadata fields and
# are also used by the %dispatch hash to contain actions to perform to make
# the contents. A few of the %meta keys correspond to other items relating to
# the episode such as supplementary audio and supplementary files like
# pictures.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: See the dump of the %meta hash in the Journal that goes with this
# project (Example 1). It should give a better representation of this
# structure.
#-------------------------------------------------------------------------------
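#
# Illustrative sketch (hypothetical data) of the shape being built below:
#
#   %meta = (
#       identifier => [ 'hpr1234', 'hpr1235' ],       # scalar per episode
#       subject    => [ [ 'tag1', 'tag2' ], undef ],  # arrayref or undef
#       audio      => [ [ '/path/hpr1234.flac' ], [] ],
#       links      => [ { 'http://...' => { cached => '...', new => '...' } },
#                       undef ],
#       ...
#   );
#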
$sth1 = $dbh->prepare($sql);

$incomplete = 0;
foreach my $episode (@range) {
    #
    # Get the episode from the database
    #
    $sth1->execute($episode);
    if ( $dbh->err ) {
        croak $dbh->errstr;
    }

    $h1 = $sth1->fetchrow_hashref;
    unless ($h1) {
        #
        # Somehow this episode doesn't exist in the database. No idea why,
        # but skip this one and keep going anyway.
        #
        carp "Failed to find requested episode number $episode\n";
        next;
    }

    print STDERR "D> ", $h1->{title}, "\n" if $DEBUG;

    #
    # Make the episode name with leading zeroes
    #
    $ep_name = sprintf( 'hpr%04d', $episode );

    #
    # Check whether this episode has a summary and tags. Count up the number
    # that are missing either of these.
    # TODO: Time to remove this check?
    #
    if ( length( $h1->{summary} ) == 0 || length( $h1->{tags} ) == 0 ) {
        print STDERR "Episode $episode is missing summary and/or tags\n"
            unless ($silent);
        $incomplete++;
    }

    #
    # Handle supplementary audio
    #
    unless ( defined( $meta{audio} ) ) {
        $meta{audio} = [];
    }

    my @audio;

    #
    # Look for other audio files for upload.
    # TODO: Consider catering for hpr\d{4}(\.\d)?\.(flac|mp3|ogg|opus|spx)
    # e.g. hpr2781.1.ogg which could happen if the audio has had to be
    # re-released.
    #
    foreach my $ext (@additional_audio) {
        my $audio
            = sprintf( "%s/hpr%04d.%s", $config{uploads}, $episode, $ext );
        if ( -e $audio ) {
            push( @audio, $audio );
        }
    }

    #
    # Look for the source file if provided. It will be called
    # 'hpr9999_source.{wav,flac,mp3,ogg,opus}' (though it's doubtful that
    # people are sending in opus). For safety we'll accept any extension.
    #
    @source = File::Find::Rule->file()
        ->name("hpr${episode}_source.*")
        ->in($filepath);

    #
    # Generate the upload command(s) for the source file(s). We upload to the
    # main directory for the item and request that no 'derive' process be run
    # on the file(s). If we're just generating metadata it's not really
    # necessary to do this. Rather than having a different format here we now
    # use a generic format from the configuration file but pass in a null
    # remote filename. We also add default options, also from the
    # configuration file, in all cases so we don't derive anything and don't
    # generate an archive of old stuff.
    #
    # TODO: What if the source file(s) had been omitted in the first pass,
    # and we need to upload them now? Could just do it by hand of course.
    #
    unless ($meta_only) {
        for my $sou (@source) {
            printf $script $iauploadtemplate,
                'hpr' . $episode,
                $sou,
                '',
                $iauploadoptions;
            $script_lines++;
        }
    }

    #
    # Turn all source audio files into URLs pointing to the IA item
    #
    for ( my $i = 0; $i <= $#source; $i++ ) {
        ( my $bn = $source[$i] ) =~ s|.*/||mx;
        $source[$i] = sprintf( $config{IAURLtemplate}, "hpr$episode", $bn );
    }

    #push( @audio, @source ) if (@source);

    #
    # Save whatever was found, whether an arrayref of audio or undef
    #
    push( @{ $meta{audio} }, \@audio );

    #
    # Most shows have transcripts since Q2 2023. They aren't referred to in
    # the show notes or database (not at present anyway). They get added to
    # the static page on the HPR site with a footer template on every page.
    # The files themselves arrive in a sub-directory of $uploads (usually
    # `/data/IA/uploads/hpr1234/`) as part of the show components. For now
    # we'll search for them and upload them using the Bash script we're
    # building, which is run after the initial 'ia upload'.
    #
    unless ($meta_only) {
        @transcripts = File::Find::Rule->file()
            ->name($transcript_re)
            ->in("$filepath/$ep_name");

        #
        # Write 'Upload' function calls to the script for all of the
        # transcripts
        #
        for my $ts (@transcripts) {
            ( my $bn = $ts ) =~ s|.*/||mx;
            printf $script $iauploadtemplate,
                $ep_name,           # identifier
                $ts,                # local file path
                "$ep_name/$bn",     # remote file path
                $iauploadoptions;
            $script_lines++;
        }
    }

    #
    # Initialise the hash element for holding links to files in the notes
    # per show
    #
    unless ( defined( $meta{links} ) ) {
        $meta{links} = [];
    }

    #
    # Scan the database-stored notes looking for links to files on the HPR
    # server. For any HTML files found in that pass, download them and
    # perform another scan. If further HTML files are found download and scan
    # them, and so on recursively. Any non-HTML files are left for
    # downloading later.
    #
    #<<< (Stop perltidy reformatting oddly)
    if (find_links_in_notes($episode, $h1->{notes}, \$tree, \%links, \%config,
            $verbose, $silent) > 0) {
    #>>>
        #
        # Save the links we found. Do this by copying the %links hash into
        # the %meta hash
        #
        push( @{ $meta{links} }, {%links} );
    }
    else {
        push( @{ $meta{links} }, undef );
    }

    #
    # Build the metadata hash using the dispatch table. For each field we
    # will have an array of data. This may be a scalar for single-valued
    # items or an array for multi-valued ones.
    #
    foreach my $fld (@fields) {
        unless ( defined( $meta{$fld} ) ) {
            $meta{$fld} = [];
        }

        #
        # If it's a code reference then call the code and save the result,
        # otherwise save the data
        #
        if ( ref( $dispatch{$fld} ) eq 'CODE' ) {
            push( @{ $meta{$fld} }, $dispatch{$fld}->() );
        }
        else {
            push( @{ $meta{$fld} }, $dispatch{$fld} );
        }
    }
}

_debug( $DEBUG >= 3, Dumper( \%meta ) );

die "Nothing to do\n" unless %meta;

#
# If we're not ignoring missing summaries and tags, and we found some, then
# abort with a message.
# TODO: 2023-07-06 Not needed any more?
#
if ( $incomplete > 0 ) {
    unless ($ignore) {
        die "Aborted due to missing summaries and/or tags\n";
    }
    else {
        say "Missing summaries and/or tags - ignored" unless ($silent);
    }
}

#
# ~~ Explanation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Considering '%meta' as a "sideways spreadsheet" where the keys are row
# headings and the elements within the arrayrefs held as the hash values, the
# "cells" (array elements) may be arrayrefs with multiple elements (i.e. the
# spreadsheet is 3-dimensional!). We need to know the maximum number of array
# elements per cell because we have to expand the resulting CSV "spreadsheet"
# we'll generate for the 'ia upload' command by providing headers like
# "subject[0]" and "subject[1]". Hard to explain without a diagram!
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Compute the maximum number of sub-fields when a field can be multi-valued.
# This is because all CSV rows must have the same number of fields; the
# maximum of course.
#
@counts = map {
    max( map { ref($_) eq 'ARRAY' ? scalar(@$_) : 1 } @{ $meta{$_} } )
} @fields;
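
#
# Worked example (hypothetical data): if the run covers two episodes, one
# with 3 tags and one with 5, then for the 'subject' key the inner map yields
# (3, 5) and max() returns 5, so every data row must provide 5 'subject'
# cells (padded with empty ones where necessary).
#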

#-------------------------------------------------------------------------------
# Report on the collected data if requested.
# See 'Example 1' in the Journal for this project for what the '%meta' hash
# looks like at this stage.
#-------------------------------------------------------------------------------
if ($verbose) {
    foreach my $i ( 0 .. $#range ) {
        my $ind = 0;
        foreach my $fld (@fields) {
            printf "%20s[%d]: %s\n", $fld, $counts[$ind],
                defined( $meta{$fld}->[$i] )
                ? (
                    ref( $meta{$fld}->[$i] ) eq 'ARRAY'
                    ? join( ",", @{ $meta{$fld}->[$i] } )
                    : $meta{$fld}->[$i]
                )
                : '';
            $ind++;
        }

        #
        # Each show will (most likely) have an array of audio types in an
        # array of arrays.
        #
        if ( defined( $meta{audio}->[$i] ) ) {
            printf "%23s: %s\n", 'added audio',
                scalar( @{ $meta{audio}->[$i] } );
        }
        else {
            printf "%23s: 0\n", 'added audio';
        }

        #
        # Each show may have links. They will be stored as an array of hashes
        # where each hash has sub-hashes, one per "asset". Some will just be
        # 'undef' meaning that they have no assets.
        #
        if ( defined( $meta{links}->[$i] ) ) {
            printf "%23s: %s\n", 'links',
                scalar( keys( %{ $meta{links}->[$i] } ) );
        }
        else {
            printf "%23s: 0\n", 'links';
        }

        print '-' x 80, "\n";
    }
}

#-------------------------------------------------------------------------------
# Output asset counts if requested
# TODO: Used in 'past_upload', which is no longer used (since that project is
# complete).
#-------------------------------------------------------------------------------
if ($acountfile) {
    my $acount_lines = 0;
    open( my $acount, '>:encoding(UTF-8)', $acountfile )
        or die "Unable to open $acountfile for output: $!\n";

    foreach my $i ( 0 .. $#range ) {
        if ( defined( $meta{links}->[$i] ) ) {
            printf $acount "%s %d\n", $meta{identifier}->[$i],
                scalar( keys( %{ $meta{links}->[$i] } ) );
            $acount_lines++;
        }
    }

    close($acount);

    #
    # If we never wrote anything to the asset count file then delete it
    #
    if ( $acount_lines == 0 ) {
        print "Deleting empty '$acountfile'\n" if $verbose;
        unlink($acountfile);
    }

}

#-------------------------------------------------------------------------------
# Download any linked files unless the -noassets option was used.
# Report the details in verbose mode. Only download if the file is not
# already downloaded.
# NOTE: HTML downloads will have already happened since we need to
# recursively parse and modify them. So if an asset is HTML on the HPR site
# an earlier stage will have downloaded it so that it can be parsed for any
# further HTML or assets.
#-------------------------------------------------------------------------------
if ($assets) {
    foreach my $i ( 0 .. $#range ) {
        if ( defined( $meta{links}->[$i] ) ) {
            for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
                my $linkfile = $meta{links}->[$i]->{$key}->{cached};
                if ( ! -e $linkfile ) {
                    download_url( $key, $linkfile,
                        $meta{links}->[$i]->{$key}->{new},
                        $verbose, $silent );
                }
            }
        }
    }
}

#-------------------------------------------------------------------------------
# Generate the completed CSV
#-------------------------------------------------------------------------------
#
# First build and output the header fields. If there's a single header then
# just output it, but if it's a multi-item thing output the maximum with
# indexes (like "subject[0]").
#
for my $ind ( 0 .. $#fields ) {
    if ( $counts[$ind] == 1 ) {
        push( @head, $fields[$ind] );
    }
    else {
        for ( my $i = 0; $i < $counts[$ind]; $i++ ) {
            push( @head, $fields[$ind] . "[$i]" );
        }
    }
}

$csv->print( $out, \@head );
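
#
# Illustrative header row (hypothetical; here the widest episode has three
# tags, so 'subject' is expanded three times):
#
#   identifier,file,mediatype,...,subject[0],subject[1],subject[2],licenseurl
#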

#
# Build and output the data. The structure we have is a hash where each
# element is an array with an element per episode (0 .. $#range). The
# per-episode elements may be scalars (for single-value fields) or arrays
# (for multi-value). In the latter case we have to make sure we have filled
# all of the header positions (e.g. we have 10 "subject" headers, but this
# episode has only 1 - we have to fill in the remaining 9 slots with empty
# items).
#
foreach my $i ( 0 .. $#range ) {
    @data = ();
    my $ind = 0;
    foreach my $fld (@fields) {
        my $count = 0;
        if ( ref( $meta{$fld}->[$i] ) eq 'ARRAY' ) {
            foreach my $elem ( @{ $meta{$fld}->[$i] } ) {
                push( @data, $elem );
                $count++;
            }
        }
        else {
            push( @data, $meta{$fld}->[$i] );
            $count++;
        }

        #
        # Cater for any blank slots
        #
        if ( $count < $counts[$ind] ) {
            push( @data, undef ) for 1 .. ( $counts[$ind] - $count );
        }

        $ind++;
    }

    #print Dumper( \@data ), "\n";
    $csv->print( $out, \@data );

    #
    # If there are additional audio files or supplementary files for the
    # current show write a CSV row for each of them here.
    # NOTE: originally the CSV format did not require the second and
    # subsequent rows belonging to an IA identifier to have an identifier in
    # them (maybe it was even illegal in the early days). Now the ia tool
    # seems to have made it mandatory - surprisingly.
    #
    if ( defined( $meta{audio}->[$i] ) ) {
        for my $audio ( @{ $meta{audio}->[$i] } ) {
            #
            # Each row is a file path which we'll add to the CSV. We use the
            # length of the @head array to get the number of CSV fields
            #
            @data = ();
            # push( @data, undef, $audio );
            push( @data, $meta{identifier}->[$i], $audio );
            push( @data, undef ) for 1 .. ( scalar(@head) - 2 );
            $csv->print( $out, \@data );
        }
    }

    #
    # The 'links' key holds an array of hashes containing details of the
    # 'assets' relating to a show (files other than audio files, but
    # including the original "source" audio). In make_metadata V0.3.x we used
    # these hashes to add rows to the CSV file which results in the upload of
    # these files. However, this mechanism does not cater for (a) the
    # uploading of directories to the IA, and (b) the ability to disable the
    # "derive" mechanism for a given file. Therefore we build a Bash script
    # containing a function that calls 'ia' commands which can be used
    # instead, but only if $meta_only is not set.
    # TODO: 2023-07-06 We want to upload the transcripts created by 'whisper'
    # and placed in the 'uploads' directory. We will add these to the
    # 'script' file.
    #
    if ( !$meta_only && defined( $meta{links}->[$i] ) ) {
        for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
            #
            # Build a call to the 'Upload' function being explicit about the
            # remote name
            #
            ( my $rem = $meta{links}->[$i]->{$key}->{cached} )
                =~ s|$config{uploads}/||;
            printf $script $iauploadtemplate,
                $meta{identifier}->[$i],
                $meta{links}->[$i]->{$key}->{cached},
                $rem,
                $iauploadoptions;
            $script_lines++;
        }
    }
}

#
# Report the output and script file names
#
unless ($silent) {
    print "Output file: $outfile\n";
    print "Script file: $scriptfile\n" unless ( $script_lines == 0 );
}

close($out);
close($script);

#
# If we never wrote anything to the script file then delete it
#
if ( $script_lines == 0 ) {
    print "Deleting empty '$scriptfile'\n" if $verbose;
    unlink($scriptfile);
}

exit;

#=== FUNCTION ================================================================
#         NAME: make_item
#      PURPOSE: Build an Archive.org item name for this item
#   PARAMETERS: $h          - hashref returned from the database
#               $testmode   - Boolean denoting test mode
#      RETURNS: The newly fashioned item string
#  DESCRIPTION: The method for generating a unique item string for
#               Archive.org is embodied in this function
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_item {
    my ( $h, $testmode ) = @_;

    return sprintf( "%shpr%04d", ( $testmode ? 'test_' : '' ), $h->{eps_id} );
}

#=== FUNCTION ================================================================
#         NAME: make_filename
#      PURPOSE: Determine the filename path for the metadata
#   PARAMETERS: $path       - relative path to the files
#               $template   - template for forming the file name
#               $ftypes     - arrayref holding ordered file types
#               $h          - hashref returned from the database
#      RETURNS: The path to the file
#  DESCRIPTION: Forms the file path then checks that there really is a file
#               of that name. Since we don't have *.wav files for every
#               episode we might have one of the other formats. The array
#               referenced by $ftypes contains a list of these file types,
#               starting with the WAV default. At the moment we only look at
#               the second choice if there is no WAV, but we could be more
#               fancy if necessary.
#               TODO The design of this code is not good. The earlier fetch
#               stage will have determined what is available. We are
#               potentially repeating these checks here, though they are
#               necessary if -nofetch was specified. Needs some further
#               thought.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_filename {
    my ( $path, $template, $ftypes, $h ) = @_;

    my $file;
    my $ft = 0;

    #
    # Look for the WAV file first. If found, return it
    #
    $file = sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] );
    if ( -e $file ) {
        return $file;
    }
    else {
        #
        # Look for the alternative type of file and return that if found
        #
        $ft++;
        $file
            = sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] );
        if ( -e $file ) {
            return $file;
        }
        else {
            warn "No file found for hpr" . $h->{eps_id} . "\n";
            return;
        }
    }

}

#=== FUNCTION ================================================================
#         NAME: db_title
#      PURPOSE: Return a title for the Archive.org item
#   PARAMETERS: $h          - hashref returned from the database
#      RETURNS: The title
#  DESCRIPTION: Makes the title string we want for the item
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_title {
    my ($h) = @_;

    # print STDERR "D> ", $h->{title}, "\n";
    return sprintf( "hpr%04d :: %s", $h->{eps_id}, $h->{title} );
}

#=== FUNCTION ================================================================
#         NAME: db_notes
#      PURPOSE: Return the description for an episode
#   PARAMETERS: $sourceURLtpl - printf template for making the HPR link to
#                               the original show (from config file)
#               $h            - hashref returned from the database
#               $tree         - parsed notes
#               $asource      - arrayref containing source audio file URLs
#      RETURNS: The description built from the database for Archive.org
#  DESCRIPTION: This routine generates the item description for Archive.org
#               as HTML by concatenating the summary, series name (if there
#               is one), duration, link to the episode on the HPR site and
#               the notes into a long string. The main work is in the
#               formatting of the notes element which is stored as HTML in
#               the database. Since we are making a single string from the
#               multi-line data we have to take special action to preserve
#               the contents of <pre> tags. See the comments in 'flatten_pre'
#               for what is being done to achieve this.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_notes {
    my ( $sourceURLtpl, $h, $tree, $asource ) = @_;

    my ( $lines, $episode, $desc, $sourceurl, $notes );

    #
    # Ensure leading zeroes on the episode number and prepend 'hpr'
    #
    $episode = sprintf( 'hpr%04d', $h->{eps_id} );

    # $sourceurl = sprintf( "http://hackerpublicradio.org/eps.php?id=%04d",
    #     $h->{eps_id} );
    $sourceurl = sprintf( $sourceURLtpl, $episode );

    #
    # Make the header with newlines so we can count them
    # TODO: Should we use the series number 0 which denotes "no series"
    # rather than the name "general"? Need to adjust the query to return this
    # if so.
    # FIXME: The "not series" 'general' is being removed and the logic will
    # use NULL for an episode that is not in a series. The local copy of the
    # database has had this change made, but not the live one. We have to
    # cater for both. When it goes we can drop references to 'general'.
    #
    $desc = sprintf( "Summary: %s\n", $h->{summary} );
    unless ( !defined( $h->{s_name} ) or $h->{s_name} eq 'general' ) {
        $desc .= sprintf( "Series: %s\n", coalesce( $h->{s_name}, '' ) );
    }
    #$desc .= sprintf( "Duration: %s\n", $h->{duration} );
    $desc .= sprintf( "Source: <a href=\"%s\">%s</a>\n", $sourceurl, $sourceurl );

    #
    # Add a pointer to the source audio if there is any (cater for multiple,
    # but it's not likely we'll get more than one)
    #
    if ( @{$asource} ) {
        foreach my $aurl ( @{$asource} ) {
            $desc .= sprintf( "Original audio: <a href=\"%s\">%s</a>\n",
                $aurl, $aurl );
        }
    }

    #
    # Count the lines so we can add <br/> tags at the end to make the header
    # a standard size of 4 lines high to accommodate the media widget.
    # TODO: Rationalise this since there is no media widget any more.
    # Temporarily boosted the height to 6 lines now the Original audio has
    # been added.
    #
    $lines = $desc =~ tr/\n//;
    $desc =~ s#\n#<br />#g;
    $desc .= ( "<br />" x ( 6 - $lines ) );

    #
    # Prepare the notes and add to the header. Actions are:
    # 1. Make all <pre> nodes compatible by adding <br/>
    # 2. Generate entities for all non-ASCII
    # 3. Do the silly thing of double-encoding entities to counteract the IA
    #    bug
    # 4. Remove all newlines ('ia' doesn't seem to know what to do with them)
    # 5. Trim trailing spaces
    #
    # ----
    # NOTE: Removed the double-encoding on 2017-10-07 since (unannounced) it
    # looks as if the IA have fixed this bug. There is some debate over
    # whether we should be encoding non-ASCII also.
    # ----
    # NOTE: 2017-11-25 restored the double-encoding since the bug bit again
    # at the last upload. This is not the whole story though. Added a step to
    # decode the notes before re-encoding the non-ASCII bits. This is to
    # counteract what seems to be being done by Pandoc if we process the
    # notes from plain text.
    # ----
    # NOTE: 2021-02-20 found a problem with the strategy of decoding
    # everything and then encoding it again (selectively). The issue was seen
    # in HPR show 3284 where the HTML notes contain a <pre> section with C++
    # code in it which uses characters like '<' and '>'. These were encoded
    # as HTML entities at shownote preparation time since they were flagged
    # as bad HTML by my validator script. The original algorithm then decoded
    # these characters but couldn't encode them with encode_entities because
    # that would have encoded all the HTML tags. The decode/encode stuff is
    # therefore HTML context dependent, and we don't have the means of
    # handling this with the current script. So, the solution seems to be not
    # to decode and encode at all. After running flatten_pre, simply run
    # re_encode_entities and trust that there's nothing in the notes that
    # needs special action here. Anyway, the preparation stage should have
    # handled whatever's necessary.
    # ----
    #
    #$notes = flatten_pre( $h->{notes} );
    $notes = flatten_pre($tree);
    ## $notes = decode_entities($notes);
    ## $notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
    ## $notes = re_encode_entities($notes);
    $notes =~ s/\n//g;

    $desc .= $notes;
    $desc =~ s/\s*$//;

    return $desc;
}

#=== FUNCTION ================================================================
#         NAME: db_tags
#      PURPOSE: Return the episode search tags as an array for populating
#               'subject' fields on Archive.org
#   PARAMETERS: $h          - hashref returned from the database
#      RETURNS: A reference to an array of tags or undef if there are none
#  DESCRIPTION: In the database the tags are held as a comma-delimited
#               string. We need to turn this into an array to populate
#               'subject' fields in the CSV file. If there are no tags the
#               caller needs an undef to be returned to indicate that a blank
#               field needs to be constructed in the CSV. The format of the
#               database string may be messy since we rely on the show's host
#               to submit this information. We split the string taking this
#               into consideration.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub db_tags {
    my ($h) = @_;

    return $h->{tags} ? [ split( /\s*,\s*/, $h->{tags} ) ] : undef;
}
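
#
# For example (illustrative values): a 'tags' column holding the string
# 'Raspberry Pi, SBC ,audio' is returned as
# [ 'Raspberry Pi', 'SBC', 'audio' ]; whitespace around the commas is
# absorbed by the split pattern, and an empty or NULL column yields undef.
#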

#=== FUNCTION ================================================================
#         NAME: flatten_pre
#      PURPOSE: Process notes "flattening" <pre> contents
#   PARAMETERS: $tree       HTML::TreeBuilder object containing parsed and
#                           partially processed notes
#      RETURNS: Processed notes
#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
#               contain some formatting tags like <em> and <code>, but spaces
#               and newlines are significant. The Internet Archive upload API
#               uses HTTP headers which are text strings without newlines, so
#               when these tags are uploaded through this route some
#               formatting is lost. What this routine does is parse the
#               contents of all <pre> sections in $notes, adding <br/> tags
#               to replace newlines. It has to perform a full parse since the
#               contents may include HTML tags and these need to be passed
#               through intact. It calls the subroutine 'flatten_item' to
#               deal with the recursive nature of HTML tags.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_pre {
    my ($tree) = @_;

    #
    # Find all the <pre> tags
    #
    my @pre_tags = $tree->look_down( _tag => 'pre', );

    #
    # Walk the various <pre> elements in the document
    #
    foreach my $tag (@pre_tags) {
        #
        # Save the tag and empty the original
        #
        my $saved = $tag->clone();
        $tag->delete_content();

        #
        # Walk the saved content and rebuild the tag into $atag using the
        # nested arrayref structure permitted by HTML::Element for
        # convenience (the alternative is a little nasty). See the
        # documentation for 'new_from_lol' in HTML::Element.
        #
        my $atag;
        foreach my $item ( @{ $saved->content_array_ref } ) {
            push( @$atag, flatten_item($item) );
        }

        #
        # Rebuild the tag from the arrayref we built. We treat the arrayref
        # structure we just built as an array because otherwise the top level
        # is interpreted as a spurious <null> tag.
        #
        $tag->push_content(@$atag);
    }

    #
    # Trim out the original notes from the enclosing tags we added earlier
    #
    my $body = $tree->look_down( _tag => 'body' );
    ( my $result = $body->as_HTML( undef, ' ', {} ) )
        =~ s{(^<body[^>]*>|</body>$)}{}gi;

    return $result;

}
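
#
# Illustrative before/after (hypothetical notes fragment):
#
#   <pre><code>first line
#   second line</code></pre>
#
# becomes
#
#   <pre><code>first line<br />second line</code></pre>
#
# so the later removal of newlines cannot destroy the preformatted layout.
#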

#=== FUNCTION ================================================================
#         NAME: flatten_item
#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
#   PARAMETERS: $item       an HTML::Element item parsed from the original
#                           <pre> section
#      RETURNS: An arrayref if the last seen item was a tag, otherwise a list
#  DESCRIPTION: Since <pre> sections can contain inline elements which change
#               the rendering of the text we need to parse these as we add
#               <br/> tags. This routine does this by recursively descending
#               through the contents. A common tag sequence is <pre><code>
#               for scripts and the like. This routine deals with such
#               sequences. It expects to receive the contents in sequence and
#               builds the result as a nested arrayref structure.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_item {
    my ($item) = @_;

    return unless defined($item);

    my ( @result, %attr );

    #
    # Is it a sub-tag or non-tag content?
    #
    if ( ref($item) ) {
        #
        # It's a tag. Save the tag name and any attributes and recurse into
        # it. Return an arrayref
        #
        push( @result, $item->tag() );
        %attr = $item->all_external_attr();
        push( @result, \%attr ) if %attr;
        for my $child ( $item->content_list() ) {
            push( @result, flatten_item($child) );
        }
        return \@result;
    }
    else {
        #
        # It's non-tag content. Join the lines with <br/> tags. Return an
        # array (since this is a simple list).
        #
        # Note that we split with a LIMIT of -1 which causes any trailing
        # list items to be returned; the default behaviour is to drop them.
        #
        $item =~ s/\r//g;
        my @content = split( /\n/, $item, -1 );
        if (@content) {
            #
            # Remove a leading blank line - usually the result of
            # a "<pre>'NL'text" sequence
            #
            shift(@content) if ( $content[0] =~ /^\s*$/ );

            #
            # Join back the lines with <br/> tags between them.
            #
            foreach my $txt (@content) {
                push( @result, $txt, ['br'] );
            }

            #
            # Remove the <br/> at the end, it's spurious
            #
            pop(@result);
        }

        return (@result);
    }

}

#=== FUNCTION ================================================================
#         NAME: re_encode_entities
#      PURPOSE: Find all encoded entities and encode them all over again
#   PARAMETERS: $notes      - string containing unprocessed notes
#      RETURNS: Processed notes
#  DESCRIPTION: Uses a brutal regular expression substitution approach, but
#               since this is a very unusual requirement brought about by
#               what is essentially a bug in the way the Internet Archive
#               stores and processes metadata, we have no choice.
#       THROWS: No exceptions
#     COMMENTS: It looks as if the software on archive.org saves HTML
#               metadata in an XML file by encoding it as entities. So HTML
#               tags are turned into '&lt;TAG&gt;' sequences. Then when HTML
#               is required the XML is decoded back to <TAG>. Unfortunately
#               any existing entities in the HTML like '&lt;' will also be
#               decoded in this phase, which may result in invalid HTML
#               sequences. In this routine we are converting existing
#               entities so that the decoding phase turns them into valid
#               entities. The archive.org software should be doing this, but
#               it isn't, and any messages sent to the staff there are
#               ignored. Of course, there may be a point at which this bug is
#               corrected and the double encoding process here becomes
#               redundant. Then the call to this routine can be omitted.
#               Note also that the IA editor corrupts HTML containing
#               entities so should not be used. To make a change, edit the
#               notes on the HPR database and generate new metadata with this
#               script. Use the -meta_only option to avoid the need for the
#               media (unless it too is being updated of course) and
#               re-submit the CSV file to ias3upload.pl or ia.
#     SEE ALSO: N/A
#===============================================================================
sub re_encode_entities {
    my ($notes) = @_;

    #
    # Replace any '&xxx;' sequence with '&amp;xxx;'
    #
    $notes =~ s#\&([^;]+;)#&amp;$1#g;
    return $notes;
}
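
#
# For example (illustrative): 'x &lt; y &amp; z' becomes
# 'x &amp;lt; y &amp;amp; z', so the IA's later decode pass yields the
# original 'x &lt; y &amp; z' instead of the invalid 'x < y & z'.
#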
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: find_links_in_notes
|
|
# PURPOSE: Finds HPR links in show notes
|
|
# PARAMETERS: $episode episode number we're dealing with
|
|
# $notes a string containing the shownotes
|
|
# $rtree reference to an HTML::TreeBuilder object built
|
|
# from the notes
|
|
# $rlinks hashref containing the links found
|
|
# $rconfig hashref containing config data
|
|
# $verbose setting controlling what reports are generated
|
|
# $silent setting controlling what reports are generated
|
|
# RETURNS: Count of the number of links
|
|
# DESCRIPTION: Parses the notes passed as an argument then scans these notes
|
|
# looking for links which relate to items on the HPR server.
|
|
# Normally these are files and images but could be anchor
|
|
# references as well. If the latter then the URL has to be
|
|
# parsed to remove the anchor. Unique links are saved in a hash
|
|
# as the path to the file that will be saved on the VPS and as
|
|
# the link to the item in the new notes. This hash will be
|
|
# passed back to the caller so it can get the file(s) and
|
|
# prepare them for upload to the IA. Finally, the link in the
|
|
# notes is modified to refer to the file that will be uploaded
|
|
# to the IA. The function returns the number of links it has
|
|
# recorded (not the number it has changed), as well as passing
|
|
# back the parsed tree and the link hash it has constructed.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub find_links_in_notes {
|
|
my ( $episode, $notes, $rtree, $rlinks, $rconfig, $verbose, $silent )
|
|
= @_;
|
|
|
|
my ( $linkre, $epstr, $uri, $slink );
|
|
my ( $oldfile, $newfile, $newURL );
|
|
|
|
_debug( $DEBUG >= 3, "Entered find_links_in_notes\n" );
|
|
|
|
#
|
|
# Initialise links
|
|
#
|
|
undef %$rlinks; # TODO: Consider whether we should do this?
|
|
|
|
#
|
|
# Create an object that can be shared downstream
|
|
#
|
|
$$rtree = HTML::TreeBuilder->new;
|
|
$$rtree->ignore_unknown(0);
|
|
$$rtree->no_expand_entities(1);
|
|
$$rtree->p_strict(1);
|
|
$$rtree->store_comments(1);
|
|
$$rtree->warn(1);
|
|
|
|
#
|
|
# Parse the notes. Die if we fail because then we know this show needs
|
|
# some urgent attention.
|
|
#
|
|
$$rtree->parse_content($notes)
|
|
or die "HTML::TreeBuilder failed to parse notes: $!\n";
|
|
|
|
#
|
|
# NOTE: No longer filtering out links not relating to the episode.
|
|
# Experience with 'upload_manager' has shown that there are some weird
|
|
# links in the HPR database which will cause problems.
|
|
#
|
|
# Regular expression to match links to the HPR server. Examples:
|
|
# http://hackerpublicradio.org/eps/hpr2153.png
|
|
# http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
|
|
# Also things like this (**Why Ken?**)
|
|
# ../eps/hpr2945/IMG_20191018_122746Z.jpg
|
|
# Don't match things like when *not* processing 1986:
|
|
# http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
|
|
#
|
|
$epstr = sprintf( "hpr%04d", $episode );
|
|
# my $re
|
|
# = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
|
|
$linkre = qr{
|
|
^https?://
|
|
(?:www.)?
|
|
(?:hacker|hobby)publicradio.org/eps/
|
|
(.+)$
|
|
}x;
|
|
|
|
#
|
|
# Scan the HTML tree for links we care about
|
|
#
|
|
for ( @{ $$rtree->extract_links( 'a', 'img' ) } ) {
|
|
my ( $link, $element, $attr, $tag ) = @$_;
|
|
|
|
#
|
|
# Standardise the link (expands relative URLs, removes any fragment).
|
|
# Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
|
|
# URIs are removed.
|
|
#
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink, \n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match, without any 'fragment' if there
            # is one (we want this not to be URL-related)
            # NOTE: Will we ever have a fragment here?
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it.
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink .= 'index.html';
                $oldfile .= 'index.html';
            }


            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink} = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new} = $newURL;

            }

            #
            # Simply change the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well.
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
        }
    }

    #
    # If we found any links the notes will have been changed, but since the
    # tree is shared with the caller this will result in changes to the
    # metadata. However, we want to look at any HTML files in those links
    #
    if ( scalar( keys(%$rlinks) ) > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents. This updates $rlinks
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( ! -e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose, $silent
                );
            }
        }
    }

    #
    # Return the link count after all recursion through supplementary files
    # and so on
    #
    return scalar( keys(%$rlinks) );

}

#=== FUNCTION ================================================================
# NAME: find_links_in_file
# PURPOSE: Finds HPR links in related files
# PARAMETERS: $episode episode number we're dealing with
# $filename file being examined
# $rlinks hashref containing the links found
# $rconfig hashref containing config data
# $verbose setting controlling what reports are generated
# $silent setting controlling what reports are generated
# RETURNS: The number of new links found and downloaded
# DESCRIPTION: The routine is related to 'find_links_in_notes' but doesn't work
# quite the same way. It is given the name of an HTML file which
# has already been downloaded. It parses this with
# HTML::TreeBuilder because it needs to be scanned and possibly
# edited. The HTML tree is scanned for <a> and <img> tags. If the
# URL returned is an HPR URL then we will want to change it to an
# IA one so the show can reference its components on archive.org
# and be independent of HPR. If a link is found which hasn't been
# seen before it is saved in the $rlinks hash keyed by the
# original URL. The link URL is replaced by the IA version in the
# HTML tree. Once the scan is complete, if any URL edits have
# been made the file is rewritten from the modified tree. Then if
# any new links have been found these are checked, and if any are
# HTML they are downloaded ready to be parsed. Parsing is done by
# recursively calling this routine. Other links, which are not
# HTML, remain in the $rlinks hash and are downloaded later in
# the script since they do not need to be edited.
# NOTE: As of 2017-12-10 this code has been used in a live
# situation but the recursive capabilities have not been
# invoked. This is because we have shows with the usual short
# notes which reference longer notes, and sometimes the latter
# have links to other files. However, so far none of these cases
# have referenced further HTML files needing scanning and
# editing.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
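#
# Illustrative flow (hypothetical file names): a show's notes may link to
# 'full_shownotes.html'; find_links_in_notes downloads that file and calls
# this routine, which might find 'diagram.png' (stashed in %$rlinks for
# later download) and 'appendix.html' (downloaded and recursed into here).
# The recursion ends when a file yields no new links.
#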
sub find_links_in_file {
    my ( $episode, $filename, $rlinks, $rconfig, $verbose, $silent ) = @_;

    my ( $linkre, $epstr, $tree, $uri, $slink );
    my ( $oldfile, $newfile, $newURL );
    my ( $encoding, $linkcount, $linkedits );

    _debug( $DEBUG >= 3, "Entered find_links_in_file\n" );

    #
    # Create a tree object
    #
    $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->no_expand_entities(1);
    $tree->p_strict(1);
    $tree->store_comments(1);
    $tree->warn(1);

    #
    # Parse the file using IO::HTML to grab it. Die if we fail because then we
    # know this stuff needs some urgent attention.
    #
    $tree->parse_file( html_file($filename) )
        or die "HTML::TreeBuilder failed to process $filename: $!\n";

    #
    # NOTE: No longer filtering out links not relating to the episode.
    # Experience with 'upload_manager' has shown that there are some weird
    # links in the HPR database which will cause problems.
    #
    # Regular expression to match links to the HPR server. Examples:
    #   http://hackerpublicradio.org/eps/hpr2153.png
    #   http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
    # Also things like this (**Why Ken?**)
    #   ../eps/hpr2945/IMG_20191018_122746Z.jpg
    # Don't match things like when *not* processing 1986:
    #   http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
    #
    $epstr = sprintf( "hpr%04d", $episode );
    # my $re
    #     = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www\.)?
        (?:hacker|hobby)publicradio\.org/eps/
        (.+)$
    }x;

    #
    # Counting new links found and stashed as well as edits made
    #
    $linkcount = 0;
    $linkedits = 0;

    #
    # Scan the HTML tree for links we care about
    #
    for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Standardise the link (expands relative URLs, removes any fragment).
        # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
        # URIs are removed.
        #
        local $URI::ABS_REMOTE_LEADING_DOTS = 1;
        $uri = URI->new_abs( $link, $rconfig->{baseURL} );
        $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );

        _debug( $DEBUG >= 3, "\$uri = $uri\n" );
        _debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
            if $uri->fragment;
        _debug( $DEBUG >= 3, "\$slink = $slink, \n" );

        #
        # Is it an HPR link?
        #
        if ( $slink =~ $linkre ) {
            #
            # Save the last bracketed match, without any 'fragment' if there
            # is one (we want this not to be URL-related)
            # NOTE: Will we ever have a fragment here?
            #
            ( $oldfile = "$+" ) =~ s/#.*$//;
            _debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );

            #
            # Does this file path begin with an 'hpr' prefix? If so is it the
            # show id? If not we don't want to process it.
            #
            if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
                if ( $1 ne $epstr ) {
                    _debug( $DEBUG >= 3, "Ignored $slink\n" );
                    next;
                }
            }

            #
            # The path and URL might end with a slash which means the URL is
            # relying on the Web server to fill in the filename as
            # 'index.html'. We have to make this explicit.
            #
            if ( $slink =~ /\/$/ ) {
                $slink .= 'index.html';
                $oldfile .= 'index.html';
            }


            #
            # Save the original link if it's unique. We have a hashref in
            # $rlinks (pointing to a global hash).
            #
            # We add a key made from the parsed link in which we store an
            # anonymous hashref containing the name of the file we'll store in
            # the cache area for upload, and the new IA-based URL we'll use in
            # the notes. We rename files that don't start with 'hprNNNN_' with
            # that prefix to make it clear it belongs to the show (it mainly
            # helps organise and manage the cache if truth be told)
            #
            unless ( exists( $rlinks->{$slink} ) ) {
                #
                # Originally we turned "hpr9999/file.dat" into
                # "hpr9999_file.dat". We don't want to do this any more so the
                # code is much simpler
                #
                $newfile = $rconfig->{uploads} . "/$oldfile";
                $newURL = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );

                #
                # Save the link details as a sub-hash indexed by the
                # standardised URL. Elements are:
                # {
                #     cached => 'file path for the cache area',
                #     new    => 'URL to be used on the IA',
                # }
                #
                $rlinks->{$slink} = {};
                $rlinks->{$slink}->{cached} = $newfile;
                $rlinks->{$slink}->{new} = $newURL;

                $linkcount++;

            }

            #
            # Simply change the content of the link (in the parsed HTML) with
            # the new URL built above. We know the attribute ('src' or 'href')
            # from what 'extract_links' returned. Deal with any fragment we
            # found as well.
            #
            if ( $uri->fragment ) {
                $element->attr( $attr,
                    $rlinks->{$slink}->{new} . '#' . $uri->fragment );
            }
            else {
                $element->attr( $attr, $rlinks->{$slink}->{new} );
            }
            $linkedits++;
        }
    }

    #
    # If we found HPR links then we'll have changed them in the tree, so we
    # need to update the file. We use 'as_HTML' with no entities, indentation
    # by one space and no optional end tags (see HTML::Element). If the file
    # gets examined after the edits there should be no eligible HPR links, so
    # no action.
    #
    if ( $linkedits > 0 ) {
        open( my $out, ">:encoding(UTF-8)", $filename )
            or die "Unable to open $filename for writing: $!\n";
        print $out $tree->as_HTML( undef, ' ', {} );
        close($out);
        print STDERR "Altered links in $filename\n" unless $silent;
    }

    #
    # If we found new links then $linkcount will be non-zero and we need to
    # download and parse them, otherwise nothing else to do.
    #
    if ( $linkcount > 0 ) {
        #
        # If any links are HTML we need to download them and recursively parse,
        # record and possibly change their contents
        #
        foreach my $key ( keys(%$rlinks) ) {
            if ( $key =~ /\.html$/ ) {
                my $linkfile = $rlinks->{$key}->{cached};

                #
                # Get the file unless we've already collected it
                #
                if ( !-e $linkfile ) {
                    download_url( $key, $linkfile, $rlinks->{$key}->{new},
                        $verbose, $silent );
                }

                #
                # Do the recursive parsing since we need to know about further
                # links
                #
                find_links_in_file(
                    $episode, $linkfile, $rlinks,
                    $rconfig, $verbose, $silent
                );
            }
        }
    }

    #
    # Return the link count
    #
    return $linkcount;

}

#=== FUNCTION ================================================================
# NAME: download_url
# PURPOSE: Download a file from an URL
# PARAMETERS: $from the URL the file is to be collected from
# $to the path to which the file is to be written
# $upload the eventual IA URL (for information)
# $verbose setting controlling what reports are generated
# $silent setting controlling what reports are generated
# RETURNS: The status value from the download
# DESCRIPTION: If 'verbose' is true the details of the download are reported.
# We are to download from the URL specified as $from, and the
# destination is the path in $to which is in a temporary cache
# area. This may result in the original file being renamed. The
# value in $upload shows the URL the file will be available at
# on the IA once the metadata has been used for uploads. A brief
# message is written to STDERR by default then the HTTP download
# is initiated, with a warning generated if the download is not
# successful.
# Moved from LWP::Simple to LWP::UserAgent to get more control
# over the download. The former failed when an HTTPS URL was
# used and the server didn't offer this type of connection,
# whereas the latter handles this without error.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub download_url {
    my ( $from, $to, $upload, $verbose, $silent ) = @_;

    my ( $dirname, $status );

    if ($verbose) {
        print "Link to be downloaded: $from\n";
        print " to ", $to, "\n";
        print " and uploaded as ", $upload, "\n";
    }

    #
    # Extract the directory from the path. If the directory doesn't exist then
    # make it.
    #
    ( $dirname = $to ) =~ s|/?[^/]*$||mx;
    make_path($dirname) unless -d $dirname;

    #
    # Collect the file and save it
    #
    print STDERR "Downloading $from\n" unless $silent;
    $status = getstore( $from, $to, $verbose );
    $status == 200 or warn "Download failed: $status : $from ($to)\n";

    print '~' x 80, "\n" if $verbose;

    return $status;
}

#=== FUNCTION ================================================================
# NAME: getstore
# PURPOSE: Get an URL and store the result in a file
# PARAMETERS: $from the URL to download
# $to where to put the result
# $verbose setting controlling what reports are generated
# RETURNS: The status code from the 'get'
# DESCRIPTION: When using LWP::Simple an attempt to fetch an URL with an
# 'https' method failed since the server doesn't offer this
# service. Using LWP::UserAgent this is not a problem, so we are
# effectively emulating LWP::Simple in a more complex way!
# However, we now have a means of taking action when the
# download fails in some predictable way.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
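#
# Example usage (mirroring the call in download_url above; the URL and path
# here are illustrative values only):
#
#   my $status = getstore( 'https://hackerpublicradio.org/eps/hpr1234.mp3',
#       '/var/IA/uploads/hpr1234.mp3', 0 );
#   warn "Download failed: $status\n" unless $status == 200;
#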
sub getstore {
    my ( $from, $to, $verbose ) = @_;

    #
    # Create the agent, and identify it (a bit)
    #
    my $ua = LWP::UserAgent->new;
    $ua->agent("make_metadata/$VERSION ");

    #
    # Get the URL and store it to the requested file
    #
    my $res = $ua->get( $from, ':content_file' => $to );

    #
    # Report what happened
    #
    print $res->status_line, "\n" if $verbose;

    #
    # Return status
    #
    return $res->code;
}

#=== FUNCTION ================================================================
# NAME: expand_template
# PURPOSE: Turns a filename template into a filename or a default
# PARAMETERS: $option The input from the -output or -script option
# $prefix Filename prefix
# $suffix Filename suffix
# $lbound Lower bound of the episode range
# $ubound Upper bound of the episode range
# RETURNS: The expanded template or the default
# DESCRIPTION: The -output or -script options both take an optional template,
# so we need to expand it with episode numbers.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
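#
# Examples (illustrative values, following the logic below):
#
#   expand_template( 'metadata_%d-%d.csv', 'metadata', 'csv', 1500, 1510 )
#       returns 'metadata_1500-1510.csv'
#   expand_template( '', 'metadata', 'csv', 1430, 1430 )
#       returns 'metadata_1430.csv'
#   expand_template( undef, 'metadata', 'csv', 1430, 1430 )
#       returns 'metadata.csv'
#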
sub expand_template {
    my ( $option, $prefix, $suffix, $lbound, $ubound ) = @_;

    if ( defined($option) ) {
        if ( $option =~ /^$/ ) {
            #
            # Build a default template depending on the episode range
            #
            if ( $lbound == $ubound ) {
                $option = $prefix . '_%04d.' . $suffix;
                $option = sprintf( $option, $lbound );
            }
            else {
                $option = $prefix . '_%04d-%04d.' . $suffix;
                $option = sprintf( $option, $lbound, $ubound );
            }
        }
        elsif ( $option =~ /%(\d*)d/ ) {
            #
            # Caller specified a template. We need to check it
            #
            my $count = () = $option =~ /%(\d*)d/g;
            die "Invalid - too many '%d' sequences in '$option'\n"
                if ( $count > ( $lbound == $ubound ? 1 : 2 ) );
            die "Invalid - too few '%d' sequences in '$option'\n"
                if ( $count < ( $lbound == $ubound ? 1 : 2 ) );
            $option =~ s/%(\d*)d/%04d/g;
            $option = sprintf( $option, $lbound, $ubound );
        }
    }
    else {
        #
        # The default
        #
        $option = "$prefix.$suffix";
    }

    return $option;
}

#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the first defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument or undef if there are none
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
# returned when doing so might be a problem.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
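#
# Example (illustrative): coalesce( $opt, $default, '' ) returns $opt if it
# is defined, otherwise $default if that is defined, otherwise ''.
#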
sub coalesce {
    foreach (@_) {
        return $_ if defined($_);
    }
    return undef;    ## no critic
}

#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $message Message to print
# RETURNS: Nothing
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
# newline and then adds one in the 'print' so the caller doesn't
# have to bother. Prepends the message with 'D> ' to show it's
# a debug message.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _debug {
    my ( $active, $message ) = @_;

    chomp($message);
    print "D> $message\n" if $active;
}

#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION: Process the options we want to offer. See the documentation
# for details
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",                "documentation|man",
        "debug=i",             "verbose!",
        "silent!",             "test!",
        "meta_only|noaudio!",  "from=i",
        "to=i",                "count=i",
        "list=s",              "output:s",
        "fetch!",              "ignore_missing|im!",
        "assets!",             "script:s",
        "a_count=s",           "dbconfig=s",
        "config=s",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage(
            -msg     => "$PROG version $VERSION\n",
            -exitval => 1,
            -verbose => 0
        );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

make_metadata - Generate metadata from the HPR database for Archive.org

=head1 VERSION

This documentation refers to make_metadata version 0.4.14


=head1 USAGE

    make_metadata [-help] [-documentation]

    make_metadata -from=FROM [-to=TO] [-count=COUNT] [-output[=FILE]]
        [-script[=FILE]] [-a_count=FILE] [-[no]meta_only] [-[no]fetch]
        [-[no]assets] [-[no]silent] [-[no]verbose] [-[no]test]
        [-[no]ignore_missing] [-config=FILE] [-dbconfig=FILE] [-debug=N]

    make_metadata -list=LIST [-output[=FILE]] [-script[=FILE]]
        [-[no]meta_only] [-[no]fetch] [-[no]assets] [-[no]silent]
        [-[no]verbose] [-[no]test] [-[no]ignore_missing] [-config=FILE]
        [-dbconfig=FILE] [-debug=N]

Examples:

    make_metadata -from=1234 -nofetch

    make_metadata -from=1234 -to=1235

    make_metadata -from=1234 -count=10

    make_metadata -from=1 -to=3 -output=metadata_1-3.csv

    make_metadata -from=1500 -to=1510 -out=metadata_1500-1510.csv -verbose

    make_metadata -from=1500 -to=1510 -out=metadata_%d-%d.csv -verbose

    make_metadata -from=500 -to=510 -out=metadata_%04d-%04d.csv -verbose

    make_metadata -from=1500 -to=1510 -out -verbose

    make_metadata -from=1500 -to=1510 -out

    make_metadata -from=1675 -to=1680 -out=metadata_%d-%d.csv -meta_only

    make_metadata -from=1450 -test

    make_metadata -list='1234,2134,2314' -out -meta_only

    make_metadata -list="931,932,933,935,938,939,940" -out -meta -ignore

    make_metadata -dbconf=.hpr_livedb.cfg -from=1234 -to=1235

    make_metadata -from=3004 -out -meta_only -noassets


=head1 OPTIONS

=over 8

=item B<-help>

Reports brief information about how to use the script and exits. To see the
full documentation use the option B<-documentation> or B<-man>. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.

=item B<-documentation> or B<-man>

Reports full information about how to use the script and exits. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.

=item B<-debug=N>

Run in debug mode at the level specified by I<N>. Possible values are:

=over 4

=item B<0>

No debugging (the default).

=item B<1>

TBA

=item B<2>

TBA

=item B<3>

TBA

=item B<4 and above>

The metadata hash is dumped.

Each call of the function I<find_links_in_notes> is reported. On finding an
<a> or <img> tag the I<uri> value is shown, as is any fragment and the related
link. The original file is reported here.

Each call of the function I<find_links_in_file> is reported. On finding an
<a> or <img> tag the I<uri> value is shown, as is any fragment and the related
link. The original file is reported here, and if a link is to be ignored this
is reported.

=back

=item B<-from=NUMBER>

This option defines the starting episode number of a group. It is mandatory to
provide either the B<-from=NUMBER> option or the B<-list=LIST> option (see
below).

=item B<-to=NUMBER>

This option specifies the final episode number of a group. If not given the
script generates metadata for the single episode indicated by B<-from>.

The value given here must be greater than or equal to that given in the
B<-from> option. The option must not be present with the B<-count> option.

The difference between the episode numbers given by the B<-from> and B<-to>
options must not be greater than 20.

=item B<-count=NUMBER>

This option specifies the number of episodes to process (starting from the
episode number specified by the B<-from> option). The option must not be
present with the B<-to> option.

The number of episodes specified must not be greater than 20.

=item B<-list=LIST>

This option is an alternative to B<-from=NUMBER> and its associated modifying
options. The B<LIST> is a comma-separated list of not necessarily consecutive
episode numbers, and must consist of at least one and no more than 20 numbers.

This option is useful for the case when non-sequential episode numbers are to
be uploaded, and is particularly useful when repairing elements of particular
episodes (such as adding summary fields and tags) where they have already
been uploaded.

For example, the following shows have no summary and/or tags, but the shows
are already in the IA. The missing items have been provided, so we wish to
update the HTML part of the upload:

    $ ./make_metadata -list='2022,2027,2028,2029,2030,2033' -out -meta
    Output file: metadata_2022-2033.csv

=item B<-output[=FILE]>

This option specifies the file to receive the generated CSV data. If omitted
the output is written to B<metadata.csv> in the current directory.

The file name may contain one or two instances of the characters '%d', with
a leading width specification if desired (such as '%04d'). These will be
substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or, if
B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the
appropriate endpoint (adding the count to the starting number). If neither of
the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only
be one instance of '%d' or the script will abort.

If no value is provided to B<-output> then a suitable template will be
generated. It will be 'metadata_%04d.csv' if one episode is being processed, and
'metadata_%04d-%04d.csv' if a range has been specified.

Example:

    ./make_metadata -from=1430 -out=metadata_%04d.csv

the output file name will be B<metadata_1430.csv>. The same effect can be
achieved with:

    ./make_metadata -from=1430 -out=

or

    ./make_metadata -from=1430 -out

=item B<-script[=FILE]>

This option specifies the file to receive commands required to upload certain
files relating to a show. If omitted the commands are written to B<script.sh>
in the current directory.

The file name may contain one or two instances of the characters '%d', with
a leading width specification if desired (such as '%04d'). These will be
substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or, if
B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the
appropriate endpoint (adding the count to the starting number). If neither of
the B<-to=NUMBER> and B<-count=NUMBER> options are used then there should only
be one instance of '%d' or the script will abort.

If no value is provided to B<-script> then a suitable template will be
generated. It will be 'script_%04d.sh' if one episode is being processed, and
'script_%04d-%04d.sh' if a range has been specified.

Example:

    ./make_metadata -from=1430 -script=script_%04d.sh

the output file name will be B<script_1430.sh>. The same effect can be
achieved with:

    ./make_metadata -from=1430 -script=

or

    ./make_metadata -from=1430 -script

=item B<-a_count=FILE>

Defines a file into which the script writes details of assets downloaded
during the analysis of notes (and other HTML files associated with a show).
The listing consists of the show identifier (e.g. 'hpr3901') followed by the
number of links followed to collect the files for this show.
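
For example, the file might contain lines like these (the values here are
illustrative):

    hpr3901 4
    hpr3902 0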

This feature was added to allow other scripts to perform tasks with these
assets, but is now deprecated.

The feature will probably be removed in a later release of this script.

=item B<-[no]fetch>

This option controls whether the script attempts to fetch the MP3 audio file
from the HPR website should there be no WAV file in the upload area. The
default setting is B<-fetch>.

Normally the script is run as part of the workflow to upload the metadata and
audio to archive.org. The audio is expected to be a WAV file and to be in the
location referenced in the configuration file under the 'uploads' label.
However, not all of the WAV files exist for older shows.

When the WAV file is missing and B<-fetch> is selected or defaulted, the
script will attempt to download the MP3 version of the audio and will store it
in the 'uploads' area for the upload script (B<ias3upload.pl> or B<ia>) to
send to archive.org. If the MP3 file is not found then the script will abort.

If B<-fetch> is specified (or defaulted) as well as B<-nometa_only> (see
below) then the audio file fetching process will not be carried out. This is
because it makes no sense to fetch this file if it's not going to be
referenced in the metadata.

=item B<-[no]assets>

This option controls the downloading of any assets that may be associated with
a show. Assets are the files held on the HPR server which are referenced by
the show. Examples might be photographs, scripts, and supplementary notes.
Normally all such assets are collected and stored in the upload area and are
then sent to the archive via the script. The notes sent to the archive are
adjusted to refer to these copies on archive.org, making the HPR episode
completely self-contained.

=item B<-[no]meta_only> (alias B<-[no]noaudio>)

This option controls whether the output file will contain a reference to the
audio file(s) or only the metadata. The default is B<-nometa_only> meaning that
the file reference(s) and the metadata are present.

Omitting the file(s) allows the metadata to be regenerated, perhaps due to
edits and corrections in the database, and the changes to be propagated to
archive.org. If the file reference(s) exist(s) in the metadata file then the
file(s) must be available at the time the uploader is run.

Note that making changes this way is highly preferable to editing the entry on
archive.org using the web-based editor. This is because there is a problem
with the way HTML entities are treated and this can cause the HTML to be
corrupted.

=item B<-[no]silent>

The option enables (B<-silent>) and disables (B<-nosilent>) I<silent mode>.
When enabled the script reports nothing on STDOUT. If the script cannot find
the audio files and downloads the MP3 version from the HPR site for upload to
archive.org then the downloads are reported on STDERR. This cannot be
disabled, though the STDERR output could be redirected to a file or to
/dev/null.

If B<-silent> is specified with B<-verbose> then the latter "wins".

The script runs with silent mode disabled by default. When B<-nosilent> is
used with B<-noverbose> the script reports the output file name and nothing
else.

=item B<-[no]verbose>

This option enables (B<-verbose>) and disables (B<-noverbose>)
I<verbose mode>. When enabled the script reports the metadata it has collected
from the database before writing it to the output file. The data is reported
in a more readable mode than examining the CSV file, although another script
B<show_metadata> is also available to help with this.

If B<-verbose> is specified with B<-silent> then the former "wins".

The script runs with verbose mode disabled by default.

=item B<-[no]ignore_missing>

The script checks each episode to ensure it has a summary and tags. If either
of these fields is missing then a warning message is printed for that episode
(unless B<-silent> has been chosen), and if any episodes are lacking this
information the script aborts without producing metadata. If the option
B<-ignore_missing> is selected then the warnings are produced (dependent on
B<-silent>) but the script runs to completion.

The default setting is B<-noignore_missing>; the script checks and aborts if
any summaries or tags are missing.

=item B<-[no]test>

DO NOT USE!

This option enables (B<-test>) and disables (B<-notest>)
I<test mode>. When enabled the script generates metadata containing various
test values.

In test mode the following changes are made:

=over 4

=item .

The item names, which normally contain 'hprnnnn', built from the episode
number, have 'test_' prepended to them.

=item .

The collection, which is normally a list containing 'hackerpublicradio' and
'podcasts', is changed to 'test_collection'. Items in this collection are
normally deleted by Archive.org after 30 days.

=item .

The contributor, which is normally 'HackerPublicRadio' is changed to
'perlist'.

=back

B<NOTE> The test mode only works for the author!

=item B<-config=FILE>

This option allows an alternative script configuration file to be used. This
file defines various settings relating to the running of the script - things
like the place to look for the files to be uploaded. It is rare to need to use
any other file than the default since these are specific to the environment
in which the script runs. However, this has been added at the same time as an
alternative database configuration option was added.

See the CONFIGURATION AND ENVIRONMENT section below for the file format.

If the option is omitted the default file is used: B<.make_metadata.cfg>

=item B<-dbconfig=FILE>

This option allows an alternative database configuration file to be used. This
file defines the location of the database, its port, its name and the username
and password to be used to access it. This feature was added to allow the
script to access alternative databases or the live database over an SSH
tunnel.
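
For example, a tunnel to the live database might be set up with something
like the following (the user name, host and local port here are hypothetical):

    ssh -L 3306:localhost:3306 hpr@hackerpublicradio.org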

See the CONFIGURATION AND ENVIRONMENT section below for the file format.

If the option is omitted the default file is used: B<.hpr_db.cfg>

=back

=head1 DESCRIPTION

This script generates metadata suitable for uploading Hacker Public Radio
episodes to the Internet Archive (archive.org).

The metadata is in comma-separated variable (CSV) format suitable for
processing with an upload script. The original upload script was called
B<ias3upload.pl>, and could be obtained from
I<https://github.com/kngenie/ias3upload>. This script is no longer supported
and B<make_metadata> no longer generates output suitable for it (though it is
simple to make it compatible if necessary). The replacement script is called
B<internetarchive>, which is a Python tool that can also be run from the
command line. It can be found at I<https://github.com/jjjake/internetarchive>.
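
As an illustration (assuming the B<ia> command-line tool from the
I<internetarchive> package is installed and configured with archive.org
credentials), a metadata file generated by this script might be uploaded
with:

    ia upload --spreadsheet=metadata_1500-1510.csv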

The B<make_metadata> script generates CSV from the HPR database. It looks up
details for each episode selected by the options, and performs various
conversions and concatenations. The goal is to prepare items for the Internet
Archive with as much detail as the format can support.

The resulting CSV file contains a header line listing the field names required
by archive.org followed by as many CSV lines of episode data as requested (up
to a limit of 20).

Since the upload method uses the HTTP protocol with fields stored in headers,
there are restrictions on the way HTML can be formatted in the B<Details>
field. The script converts newlines, which are not allowed, into I<< <br/> >>
tags where necessary.

HPR shows often have associated files, such as pictures, examples, long-form
notes and so forth. The script finds these and downloads them to the cache
area where the audio is kept and writes the necessary lines to the CSV file to
ensure they are uploaded with the show. It modifies any HTML which links to
these files to link to the archive.org copies in order to make the complete
show self-contained.
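
For example (illustrative URLs), a file linked in the notes as

    http://hackerpublicradio.org/eps/hpr2153.png

is collected for upload, and the link is rewritten to point at the
archive.org copy:

    http://archive.org/download/hpr2153/hpr2153.png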

=head1 DIAGNOSTICS

=over 8

=item B<Configuration file ... not found>

One or more of the configuration files has not been found.

=item B<Path ... not found>

The path specified in the B<uploads> definition in the configuration file
B<.make_metadata.cfg> does not exist. Check the configuration file.

=item B<Configuration data missing>

While checking the configuration file(s) the script has detected that settings
are missing. Check the details specified below and provide the missing
elements.

=item B<Mis-match between @fields and %dispatch!>

An internal error in the script has been detected where the elements of the
@fields array do not match the keys of the %dispatch hash. This is probably the
result of a failed attempt to edit either of these components.

Correct the error and run the script again.

=item B<Invalid list; no elements>

There are no list elements in the B<-list=LIST> option.

=item B<Invalid list; too many elements>

There are more than the allowed 20 elements in the list specified by the
B<-list=LIST> option.

=item B<Failed to parse -list=...>

A list was specified that did not contain a CSV list of numbers.

=item B<Invalid starting episode number (...)>

The value used in the B<-from> option must be greater than 0.

=item B<Do not combine -to and -count>

Using both the B<-to> and B<-count> options is not permitted (and makes no
sense).

=item B<Invalid range; ... is greater than ...>

The B<-from> episode number must be less than or equal to the B<-to> number.

=item B<Invalid range; range is too big (E<gt>20)>

The difference between the starting and ending episode number is greater than
20.

=item B<Invalid - too many '%d' sequences in '...'>

There were more than two '%d' sequences in the name of the output file if
a range of episodes is being processed, or more than one if a single episode
has been specified.

=item B<Invalid - too few '%d' sequences in '...'>

There were fewer than two '%d' sequences in the name of the output file
when a range of episodes was being processed.

=item B<Unable to open ... for output: ...>

The script was unable to open the requested output file.

=item B<Unable to find or download ...>

The script has not found a I<.WAV> file in the cache area so has attempted to
download the I<MP3> copy of the audio from the HPR website. This process has
failed.

=item B<Failed to find requested episode>

An episode number could not be found in the database. This error is not fatal.

=item B<Nothing to do>

After processing the range of episodes specified the script could not find
anything to do. This is most often caused by all of the episodes in the range
being invalid.

=item B<Aborted due to missing summaries and/or tags>

One or more of the shows being processed does not have a summary or tags. The
script has been told not to ignore this so has aborted before generating
metadata.

=item B<HTML::TreeBuilder failed to parse notes: ...>

The script failed to parse the HTML in the notes of one of the episodes. This
indicates a serious problem with these notes and is fatal since these notes
need to be corrected before the episode is uploaded to the Internet Archive.

=item B<HTML::TreeBuilder failed to process ...: ...>

While parsing the HTML in a related file the parse has failed. The file being
parsed is reported as well as the error that was encountered. This is likely
due to bad HTML.

=item B<Unable to open ... for writing: ...>

The script is attempting to open an HTML file which it has downloaded to
write back edited HTML, yet the open has failed. The filename is in the error
message as is the cause of the error.

=back

=head1 CONFIGURATION AND ENVIRONMENT

This script reads two configuration files in B<Config::General> format
(similar to Apache configuration files) for the path to the files to be
uploaded and for credentials to access the HPR database. Two files are used
because the database configuration file is used by several other scripts.

=head2 SCRIPT CONFIGURATION

The general configuration file is B<.make_metadata.cfg> (although this can be
overridden through the B<-config=FILE> option) and contains the following
lines:

    uploads = "<path to files>"
    filetemplate = "hpr%04d.%s"
    baseURL = "http://hackerpublicradio.org"
    sourceURLtemplate = "https://hackerpublicradio.org/eps/%s/index.html"
    IAURLtemplate = "http://archive.org/download/%s/%s"

The I<uploads> line defines where the WAV files are to be found (currently
I</var/IA/uploads> on the VPS). The same area is used to store downloaded MP3
files and any supplementary files associated with the episode.

The I<filetemplate> line defines the format of an audio file such as
I<hpr1234.wav>. This should not be changed.
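
The template is expanded with C<sprintf>; for example (illustrative values):

    sprintf( "hpr%04d.%s", 1234, "wav" )    # gives "hpr1234.wav"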

The I<baseURL> line defines the common base for download URLs. It is used when
parsing and standardising URLs relating to files on the HPR server.

The I<sourceURLtemplate> line defines the format of the URL required to access
the show on the HPR site. This should not be changed except in the unlikely
event that these URLs change.

The I<IAURLtemplate> line defines the format of URLs on archive.org which is
used when generating new links in HTML notes or supplementary files.

=head2 DATABASE CONFIGURATION

The database configuration file is B<.hpr_db.cfg> (although this can be
overridden through the B<-dbconfig=FILE> option).

The layout of the file should be as follows:

    <database>
        host = 127.0.0.1
        port = PORT
        name = DATABASE
        user = USERNAME
        password = PASSWORD
    </database>

=head1 DEPENDENCIES

    Carp
    Config::General
    DBI
    Data::Dumper
    File::Find::Rule
    File::Path
    Getopt::Long
    HTML::Entities
    HTML::TreeBuilder
    HTTP::Status
    IO::HTML
    LWP::UserAgent
    List::MoreUtils
    List::Util
    Pod::Usage
    Text::CSV_XS

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.

=head1 AUTHOR

Dave Morriss (Dave.Morriss@gmail.com)

=head1 LICENCE AND COPYRIGHT

Copyright (c) 2014-2019 Dave Morriss (Dave.Morriss@gmail.com).
All rights reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut

#}}}

# [zo to open fold, zc to close]

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker