hpr-tools/InternetArchive/make_metadata

#!/usr/bin/env perl
#===============================================================================
#
# FILE: make_metadata
#
# USAGE: ./make_metadata [-from=FROM] [-to=TO] [-count=COUNT] [-list=LIST]
# [-output=FILE] [-script=FILE] [-[no]fetch] [-[no]meta_only]
# [-[no]verbose] [-[no]silent] [-[no]test] [-help]
# [-{documentation|man}]
#
# DESCRIPTION: Make metadata for the uploading of a range of HPR episodes to
# the Internet Archive by collecting fields from the database.
#
# OPTIONS: See the POD documentation for full details (./make_metadata
# -man) or use 'pod2pdf' to generate a PDF version in the
# directory holding this script.
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: 2023-07-06: A version in development (0.4.14) has been put aside
# and this version (0.4.12) made into the main line version
# because 0.4.14 was developing in a direction that doesn't fit
# with the changes made to the HPR system in June/July 2023.
# Version numbers will now move forward from here (so the
# number 0.4.14 will be duplicated).
# 2024-01-23: Added the 'open' pragma for UTF-8
# 2024-07-08: Fixed a bug where the top-level directory was
# being added to assets paths. See the definition of $linkre for
# more details.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.4.14
# CREATED: 2014-06-13 12:51:04
# REVISION: 2024-07-08 15:21:02
#
#===============================================================================
use 5.010;
use strict;
use warnings;
use open ':std', ':encoding(UTF-8)';
use Carp;
use Getopt::Long;
use Pod::Usage;
use List::Util qw{max};
use List::MoreUtils qw{uniq apply};
use Config::General;
use Text::CSV_XS;
use IO::HTML;
use HTML::TreeBuilder 5 -weak;
use HTML::Entities;
use File::Find::Rule;
use File::Path qw{make_path};
#use LWP::Simple;
use LWP::UserAgent;
use HTTP::Status qw(status_message);
use DBI;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.4.14';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
#-------------------------------------------------------------------------------
# Constants and defaults
#-------------------------------------------------------------------------------
my $basedir = "$ENV{HOME}/HPR/IA";
my $configfile = "$basedir/.$PROG.cfg";
my $dbconfigfile = "$basedir/.hpr_db.cfg";
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants used in the metadata
#
my $mediatype = 'audio';
my $collection = 'hackerpublicradio';
my $language = 'eng';
my $contributor = 'HackerPublicRadio';
#
# Variables, arrays and hashes
#
my ( $dbh, $sth1, $h1 );
my ( $DEBUG, $verbose, $silent, $test, $from, $to, $count );
my ( $list, $meta_only, $outfile, $scriptfile, $fetch, $assets);
my ( $acountfile, $ignore );
my ( @range, $tree, %links, %meta, @source, @transcripts, @counts, @head, @data );
my ( $ep_name, $ft, $file, $URL, $status, $incomplete );
my ( $filepath, $max_epno, $filetemplate, $sourceURLtemplate );
my ( $iauploadtemplate, $iauploadoptions );
#
# File types we'll look for, in order, when working out (and collecting) the
# main audio file
#
my @filetypes = (qw{ wav mp3 ogg });
#
# Additional filetypes we'll look for if we're working round the lack of tags
# on the derived audio files. See the discussion at
# https://gitlab.anhonesthost.com/HPR/HPR_Public_Code/issues/34
#
my @additional_audio = (qw{ flac mp3 ogg opus spx });
#
# Templates for finding the transcripts.
#
# We expect these to be in $uploads/$episode/ where we expect to find:
# $episode.json
# $episode.srt
# $episode.tsv
# $episode.txt
# $episode.vtt
# The regular expression below should find the transcripts given that the
# search is directed to the sub-directory in $uploads.
#
my $transcript_re = qr{hpr\d{4}\.(json|srt|tsv|txt|vtt)};
#
# Names of CSV fields and their order
#
my @fields = (
qw{
identifier file mediatype collection title creator language
description contributor date subject licenseurl
}
);
#
# Maximum episodes per run
#
my $max_episodes = 20;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A dispatch table to define how to build metadata. The keys are actually
# header names to be used in the metadata file and must relate to the @fields
# array above. Two types of values are expected here: 1) anonymous subroutine
# references which return the value for the field or 2) scalars (or strings)
# to be placed as-is. Some of the anonymous subroutines are written as
# closures (see
# http://en.wikipedia.org/wiki/Closure_%28computer_programming%29), which make
# it easier to build the calling code since we can pass in a variety of
# arguments internally while offering a simple no-argument call externally.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my %dispatch = (
identifier => sub { return make_item( $h1, $test ) },
file => sub {
return (
$meta_only
? ''
: make_filename( $filepath, $filetemplate, \@filetypes, $h1 )
);
},
mediatype => $mediatype,
collection => $collection,
#title => sub { return decode_entities( db_title($h1) ) },
title => sub { return db_title($h1) },
creator => sub { return $h1->{host} },
language => $language,
description => sub { return db_notes( $sourceURLtemplate, $h1, $tree, \@source ) },
contributor => $contributor,
date => sub { return $h1->{date} },
subject => sub { return db_tags($h1) },
licenseurl => sub { return $h1->{license_url} }
);
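#
# For example, $dispatch{title}->() calls db_title($h1) and returns a string
# such as 'hpr0100 :: Example title' (the title shown is illustrative), while
# $dispatch{mediatype} is just the constant string 'audio'.
#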
#
# Validate the data structures above to ensure there have been no mis-edits
#
foreach (@fields) {
die "Mis-match between \@fields and \%dispatch!\n"
unless ( defined( $dispatch{$_} ) );
}
#
# The main query on the database. We want episode details, the host's name,
# the series name and the Creative Commons licence the episode is under.
# Note that the 'LEFT JOIN' for 'miniseries' is for the case where the
# 'series' column of 'eps' is 'NULL'. This is not the case for the live
# database - yet!
#
my $sql = q{
SELECT
e.id AS eps_id,
e.date,
e.title,
sec_to_time(e.duration) AS duration,
e.summary,
e.notes,
e.explicit,
e.license,
l.url as license_url,
e.tags,
e.valid,
h.host,
s.name AS s_name,
s.description AS s_desc
FROM eps e
JOIN hosts h ON e.hostid = h.hostid
LEFT JOIN miniseries s ON e.series = s.id
JOIN licenses l ON e.license = l.short_name
WHERE e.id = ?
};
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;
my %options;
Options( \%options );
#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
if ( $options{'help'} );
#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
if ( $options{'documentation'} );
#
# Collect options
#
$DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
$verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
$silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
$test = ( defined( $options{test} ) ? $options{test} : 0 );
$from = $options{from};
$to = $options{to};
$count = $options{count};
$list = $options{list};
$meta_only = ( defined( $options{meta_only} ) ? $options{meta_only} : 0 );
$fetch = ( defined( $options{fetch} ) ? $options{fetch} : 1 );
$assets = ( defined( $options{assets} ) ? $options{assets} : 1 );
$ignore
= ( defined( $options{ignore_missing} ) ? $options{ignore_missing} : 0 );
$outfile = $options{output};
$scriptfile = $options{script};
$acountfile = $options{a_count};
#
# The two config files can be overridden or default to the original
# declarations
#
my $cfgfile
= ( defined( $options{config} ) ? $options{config} : $configfile );
my $dbcfgfile
= ( defined( $options{dbconfig} ) ? $options{dbconfig} : $dbconfigfile );
#-------------------------------------------------------------------------------
# Load and validate program configuration data
#-------------------------------------------------------------------------------
die "Configuration file $cfgfile not found\n" unless ( -e $cfgfile );
my $conf = Config::General->new(
-ConfigFile => $cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
);
my %config = $conf->getall();
#
# Check the config settings
#
$filepath = $config{uploads};
die "Path $filepath not found\n" unless ( -e $filepath );
$max_epno = $config{max_epno};
$filetemplate = $config{filetemplate};
$sourceURLtemplate = $config{sourceURLtemplate};
$iauploadtemplate = $config{iauploadtemplate};
$iauploadoptions = $config{iauploadoptions};
die "Configuration data missing\n"
unless $max_epno
&& $filetemplate
&& $sourceURLtemplate
&& $iauploadtemplate
&& $iauploadoptions;
#
# Append the newline only after validating; otherwise a missing
# 'iauploadtemplate' setting would still test true as "\n"
#
$iauploadtemplate .= "\n";
#-------------------------------------------------------------------------------
# Load database configuration data
#-------------------------------------------------------------------------------
die "Configuration file $dbcfgfile not found\n" unless ( -e $dbcfgfile );
my $dbconf = Config::General->new(
-ConfigFile => $dbcfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
);
my %dbcfg = $dbconf->getall();
#-------------------------------------------------------------------------------
# Validate options
#-------------------------------------------------------------------------------
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
if ( ( !defined($from) and !defined($list) )
or ( defined($from) and defined($list) ) );
#
# Deal with the values specified by one of:
# -from=N1 -to=N2
# -from=N3 -count=N4
# -list="N1,N2,N3"
#
# End by populating @range with a list of episode numbers
#
if ( defined($list) ) {
#
# We have a list which we'll parse, validate, sort, make unique and filter
#
my $lcsv = Text::CSV_XS->new( { binary => 1, } );
if ( $lcsv->parse($list) ) {
@range = uniq( sort { $a <=> $b } $lcsv->fields() );
@range = grep {/\d+/} @range;
@range = grep { $_ > 0 && $_ <= $max_epno } @range;
@range = apply { $_ =~ s/(^\s*|\s*$)//g } @range;
die "Invalid list; no elements\n" if scalar(@range) == 0;
die "Invalid list; too many elements\n"
if scalar(@range) > $max_episodes;
}
else {
die "Failed to parse -list='$list'\n" . $lcsv->error_diag() . "\n";
}
}
else {
#
# We have -from=N, -to=M or -count=X
#
die "Invalid starting episode number ($from)\n" unless $from > 0;
die "Do not combine -to and -count\n"
if ( defined($to) && defined($count) );
if ( defined($count) ) {
$to = $from + $count - 1;
}
elsif ( !defined($to) ) {
$to = $from;
}
die "Invalid range; $from is greater than $to\n" unless $from <= $to;
die "Invalid range; range is too big (>$max_episodes): $to - $from = "
. ( $to - $from ) . "\n" if ( $to - $from ) > $max_episodes;
@range = ( $from .. $to );
}
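#
# For example, '-list="3,1,2,2"' yields @range = (1,2,3), while
# '-from=100 -count=3' yields @range = (100,101,102).
#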
#
# Check we got a sensible range
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
unless scalar(@range) > 0;
#
# You can't be silent and verbose at the same time
#
$silent = 0 if $verbose;
#
# Fetching audio files is not relevant in 'metadata only' mode
# TODO: -nofetch with -meta_only incorrectly results in the download of files;
# see the Journal in regard to show 3004 in April 2020.
# TODO: 2023-07-06 Time to drop '-[no]fetch' completely?
#
$fetch = 0 if $meta_only;
#
# If there's a defined output filename then there are three options: a null
# string, a plain filename and a substitution string with '%d' sequences. The
# null string means the user used '-output' without a value, so we want to
# generate a substitution string. A string with '%d' requires a check to
# ensure there's the right number, one for $ubound and one for $lbound. The
# plain filename needs no more work.
#
# If no defined output filename we'll provide a default.
#
$outfile = expand_template( $outfile, 'metadata', 'csv',
( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );
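#
# Illustrative examples for '-from=100 -to=102' (the generated names come
# from 'expand_template'; the exact default shown here is an assumption):
# -output -> a generated name such as 'metadata_100-102.csv'
# -output=meta.csv -> 'meta.csv' used as-is
# -output=m_%d-%d.csv -> 'm_100-102.csv'
#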
#
# Check the -script option. Like -output the choices are: a null string,
# a plain filename or a substitution string with '%d' sequences.
#
$scriptfile = expand_template( $scriptfile, 'script', 'sh',
( defined($list) ? ( $range[0], $range[$#range] ) : ( $from, $to ) ) );
#-------------------------------------------------------------------------------
# In verbose mode report the options
#-------------------------------------------------------------------------------
if ($verbose) {
my $fmt = "%18s: %s\n";
printf $fmt, 'From', $from if defined( $options{from} );
printf $fmt, 'To', $to if defined( $options{to} );
printf $fmt, 'Count', $count if defined( $options{count} );
printf $fmt, 'List', $list if defined( $options{list} );
printf $fmt, 'Output', $outfile;
printf $fmt, 'Output script', $scriptfile;
printf $fmt, 'Output asset count', ( $acountfile ? $acountfile : '--' );
printf $fmt, 'Meta only', ( $meta_only ? 'Yes' : 'No' );
printf $fmt, 'Fetch', ( $fetch ? 'Yes' : 'No' );
printf $fmt, 'Silent', ( $silent ? 'Yes' : 'No' );
printf $fmt, 'Verbose', ( $verbose ? 'Yes' : 'No' );
printf $fmt, 'Debug', $DEBUG;
printf $fmt, 'Script config', $cfgfile;
printf $fmt, 'Database config', $dbcfgfile;
print '-' x 80, "\n";
}
#-------------------------------------------------------------------------------
# Set up output and script file
#-------------------------------------------------------------------------------
open( my $out, '>:encoding(UTF-8)', $outfile )
or die "Unable to open $outfile for output: $!\n";
#
# Open the script file and add a header. Set execute permission on the
# file for the owner, and nobody else. We keep a count of lines (other than
# the header stuff) written to this file.
#
my $script_lines = 0;
open( my $script, '>:encoding(UTF-8)', $scriptfile )
or die "Unable to open $scriptfile for output: $!\n";
print $script "#!/usr/bin/env bash\n\n";
print $script <<'EndHead';
Upload () {
local id=${1}
local file=${2}
local remote=${3:-}
local options=${4:-}
if [[ -e $file ]]; then
if [[ -z $remote ]]; then
ia upload ${id} ${file} ${options}
else
ia upload ${id} ${file} --remote-name=${remote} ${options}
fi
else
echo "File missing: $file"
fi
}
EndHead
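#
# Calls generated later via 'iauploadtemplate' will look something like this
# (illustrative; the real format string and options come from the
# configuration file):
#
# Upload "hpr1234" "/data/IA/uploads/hpr1234/hpr1234.srt" \
# "hpr1234/hpr1234.srt" "--no-derive"
#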
chmod(0744,$scriptfile);
#-------------------------------------------------------------------------------
# Make changes for test mode
#-------------------------------------------------------------------------------
# NOTE: Commented out 2023-07-06 since this is dangerous
#if ($test) {
# $dispatch{collection} = 'test_collection';
# $dispatch{contributor} = 'perlist';
#}
#
# Prepare to turn the data into CSV data. Since we're allowing embedded
# newlines (binary mode) add an explicit end of line.
#
# TODO: The new tool 'ia' does not accept newlines in the CSV, so no point in
# defining them here. They have been removed elsewhere in the code.
#
my $csv = Text::CSV_XS->new( { binary => 1, eol => "\r\n" } );
#-------------------------------------------------------------------------------
# Copy audio files from the website if requested and if needed. We look for
# the WAV file in the 'uploads' area but if it's not found we get the MP3
# (which is all we have for older shows).
#
# TODO: Given we're redirecting requests for audio files on the HPR site to
# archive.org there's potential for a nasty paradox here if the rewrite rules
# aren't perfect. We do the MP3 thing only for old shows, so we need to make
# sure that it works when we need it.
# TODO: Are there any logic flaws here? This was designed before we started
# creating all the audio formats we have now (partly to avoid the 'derive'
# process on archive.org from making copies without tags).
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: Commented out 2023-07-06 because (a) this situation should not occur
# again now that all audio versions are on the IA and we get everything for
# new shows, (b) the URLs used for downloads are obsolete.
#-------------------------------------------------------------------------------
#if ( $fetch && !$meta_only ) {
#
# foreach my $episode (@range) {
# $ft = 0;
#
# #
# # Does the WAV version already exist?
# #
# $file = sprintf( $filetemplate, $episode, $filetypes[$ft] );
# if ( ! -e "$filepath/$file" ) {
# #
# # Doesn't exist, try the alternative version
# #
# $ft++;
# $file = sprintf( $filetemplate, $episode, $filetypes[$ft] );
# if ( !-e "$filepath/$file" ) {
# #
# # We need to download the file
# #
# $URL = sprintf( $URLtemplate, $file );
# print STDERR "Downloading $URL\n" unless $silent;
# $status = getstore( $URL, "$filepath/$file" );
# $status == 200 or warn "$status : $URL ($file)\n";
# }
# }
#
# die "Unable to find or download $file\n"
# unless ( -e "$filepath/$file" );
# }
#
#}
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $dbcfg{database}->{host} // '127.0.0.1';
my $dbport = $dbcfg{database}->{port} // 3306;
my $dbname = $dbcfg{database}->{name};
my $dbuser = $dbcfg{database}->{user};
my $dbpwd = $dbcfg{database}->{password};
$dbh = DBI->connect( "dbi:mysql:database=$dbname;host=$dbhost;port=$dbport",
$dbuser, $dbpwd, { AutoCommit => 1 } )
or croak $DBI::errstr;
#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;
$dbh->{mysql_enable_utf8mb4} = 1;
#-------------------------------------------------------------------------------
# Collect the necessary information for constructing the metadata for each of
# the selected episodes.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Each key of the %meta hash will contain an arrayref with an element per
# episode in the @range array. Some of these elements will be objects
# (arrayrefs or hashrefs) containing multi-part items such as 'subject'
# strings from the 'tags' field in the database. The %meta keys correspond to
# the strings of the @fields array which correspond to IA metadata fields and
# are also used by the %dispatch hash to contain actions to perform to make
# the contents. A few of the %meta keys correspond to other items relating to
# the episode such as supplementary audio and supplementary files like
# pictures.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: See the dump of the %meta hash in the Journal that goes with this
# project (Example 1). It should give a better representation of this
# structure.
#-------------------------------------------------------------------------------
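#
# A rough sketch of the shape of %meta after this loop (values are
# illustrative, not real data):
#
# %meta = (
# identifier => [ 'hpr0100', 'hpr0101' ],
# file => [ '/data/IA/uploads/hpr0100.wav', ... ],
# subject => [ [ 'tag1', 'tag2' ], undef ],
# audio => [ [ '/data/IA/uploads/hpr0100.flac', ... ], [] ],
# links => [ { 'https://...' => { cached => '...', new => '...' } },
# undef ],
# ...
# );
#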
$sth1 = $dbh->prepare($sql);
$incomplete = 0;
foreach my $episode (@range) {
#
# Get the episode from the database
#
$sth1->execute($episode);
if ( $dbh->err ) {
croak $dbh->errstr;
}
$h1 = $sth1->fetchrow_hashref;
unless ($h1) {
#
# Somehow this episode doesn't exist in the database. No idea why, but
# skip this one and keep going anyway.
#
carp "Failed to find requested episode number $episode\n";
next;
}
print STDERR "D> ", $h1->{title}, "\n";
#
# Make the episode name with leading zeroes
#
$ep_name = sprintf( 'hpr%04d', $episode );
#
# Check whether this episode has a summary and tags. Count up the number
# that are missing either of these.
# TODO: Time to remove this check?
#
if ( length( $h1->{summary} // '' ) == 0 || length( $h1->{tags} // '' ) == 0 ) {
print STDERR "Episode $episode is missing summary and/or tags\n"
unless ($silent);
$incomplete++;
}
#
# Handle supplementary audio
#
unless ( defined( $meta{audio} ) ) {
$meta{audio} = [];
}
my @audio;
#
# Look for other audio files for upload.
# TODO: Consider catering for hpr\d{4}(\.\d)?\.(flac|mp3|ogg|opus|spx)
# e.g. hpr2781.1.ogg which could happen if the audio has had to be
# re-released.
#
foreach my $ext ( @additional_audio ) {
my $audio
= sprintf( "%s/hpr%04d.%s", $config{uploads}, $episode, $ext );
if ( -e $audio ) {
push( @audio, $audio );
}
}
#
# Look for the source file if provided. It will be called
# 'hpr9999_source.{wav,flac,mp3,ogg,opus}' (though it's doubtful if people
# are sending in opus). For safety we'll accept any extension.
#
@source = File::Find::Rule->file()
->name("hpr${episode}_source.*")
->in($filepath);
#
# Generate the upload command(s) for the source file(s). We upload to the
# main directory for the item and request that no 'derive' process be run
# on the file(s). If we're just generating metadata it's not really
# necessary to do this. Rather than having a different format here we now
# use a generic format from the configuration file but pass in a null
# remote filename. We also add default options, also from the
# configuration file in all cases so we don't derive anything and don't
# generate an archive of old stuff.
#
# TODO: What if the source file(s) had been omitted in the first pass, and
# we need to upload them now? Could just do it by hand of course.
#
unless ($meta_only) {
for my $sou (@source) {
printf $script $iauploadtemplate,
'hpr' . $episode,
$sou,
'',
$iauploadoptions;
$script_lines++;
}
}
#
# Turn all source audio files into URLs pointing to the IA item
#
for (my $i = 0; $i <= $#source; $i++) {
( my $bn = $source[$i] ) =~ s|.*/||mx;
$source[$i] = sprintf( $config{IAURLtemplate}, "hpr$episode", $bn );
}
#push( @audio, @source ) if (@source);
#
# Save whatever was found as an arrayref of audio paths (possibly empty)
#
push( @{ $meta{audio} }, \@audio );
#
# Most shows have transcripts since Q2 2023. They aren't referred to in
# the show notes or database (not at present anyway). They get added to
# the static page on the HPR site with a footer template on every page.
# The files themselves arrive in a sub-directory of $uploads (usually
# `/data/IA/uploads/hpr1234/`) as part of the show components. For now
# we'll search for them and upload them using the Bash script we're
# building which is run after the initial 'ia upload'.
#
unless ($meta_only) {
@transcripts = File::Find::Rule->file()
->name($transcript_re)
->in("$filepath/$ep_name");
#
# Write 'Upload' function calls to the script for all of the transcripts
#
for my $ts (@transcripts) {
( my $bn = $ts ) =~ s|.*/||mx;
printf $script $iauploadtemplate, $ep_name, # identifier
$ts, # local file path
"$ep_name/$bn", # Remote file path
$iauploadoptions;
$script_lines++;
}
}
#
# Initialise the hash element for holding links to files in the notes per
# show
#
unless ( defined( $meta{links} ) ) {
$meta{links} = [];
}
#
# Scan the database-stored notes looking for links to files on the HPR
# server. For any HTML files found in that pass, download them and perform
# another scan. If further HTML files are found download and scan them and
# so on recursively. Any non-HTML files are left for downloading later.
#
#<<< (Stop perltidy reformatting oddly)
if (find_links_in_notes($episode, $h1->{notes}, \$tree, \%links, \%config,
$verbose, $silent) > 0) {
#>>>
#
# Save the links we found. Do this by copying the %links hash into the
# %meta hash
#
push( @{ $meta{links} }, {%links} );
}
else {
push( @{ $meta{links} }, undef );
}
#
# Build the metadata hash using the dispatch table. For each field we will
# have an array of data. This may be a scalar for single-valued items or
# an array for multi-valued ones.
#
foreach my $fld (@fields) {
unless ( defined( $meta{$fld} ) ) {
$meta{$fld} = [];
}
#
# If it's a code reference then call the code and save the result
# otherwise save the data
#
if ( ref( $dispatch{$fld} ) eq 'CODE' ) {
push( @{ $meta{$fld} }, $dispatch{$fld}->() );
}
else {
push( @{ $meta{$fld} }, $dispatch{$fld} );
}
}
}
_debug( $DEBUG >= 3, Dumper(\%meta) );
die "Nothing to do\n" unless %meta;
#
# If any episodes were missing summaries and/or tags, abort with a message
# unless -ignore_missing was given.
# TODO: 2023-07-06 Not needed any more?
#
if ( $incomplete > 0 ) {
unless ($ignore) {
die "Aborted due to missing summaries and/or tags\n";
}
else {
say "Missing summaries and/or tags - ignored" unless ($silent);
}
}
#
# ~~ Explanation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Considering '%meta' as a "sideways spreadsheet" where the keys are row
# headings and the elements within the arrayrefs held as the hash values, the
# "cells" (array elements) may be arrayrefs with multiple elements (i.e. the
# spreadsheet is 3-dimensional!). We need to know the maximum number of array
# elements per cell because we have to expand the resulting CSV "spreadsheet"
# we'll generate for the 'ia upload' command by providing headers like
# "subject[1]" and "subject[2]". Hard to explain without a diagram!
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Compute the maximum number of sub-fields when a field can be multi-valued.
# This is because all CSV rows must have the same number of fields; the
# maximum of course.
#
@counts = map {
max( map { ref($_) eq 'ARRAY' ? scalar(@$_) : 1 } @{ $meta{$_} } )
} @fields;
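#
# For example, if the 'subject' row holds [ ['a','b','c'], ['d'] ] then its
# count is 3, so the CSV gets the headers subject[0], subject[1] and
# subject[2]; single-valued fields such as 'identifier' get a count of 1 and
# an unindexed header.
#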
#-------------------------------------------------------------------------------
# Report on the collected data if requested.
# See 'Example 1' in the Journal for this project for what the '%meta' hash
# looks like at this stage.
#-------------------------------------------------------------------------------
if ($verbose) {
foreach my $i ( 0 .. $#range ) {
my $ind = 0;
foreach my $fld (@fields) {
printf "%20s[%d]: %s\n", $fld, $counts[$ind],
defined( $meta{$fld}->[$i] )
? (
ref( $meta{$fld}->[$i] ) eq 'ARRAY'
? join( ",", @{ $meta{$fld}->[$i] } )
: $meta{$fld}->[$i]
)
: '';
$ind++;
}
#
# Each show will (most likely) have an array of audio types in an
# array of arrays.
#
if ( defined( $meta{audio}->[$i] ) ) {
printf "%23s: %s\n", 'added audio',
scalar( @{ $meta{audio}->[$i] } );
}
else {
printf "%23s: 0\n", 'added audio';
}
#
# Each show may have links. They will be stored as an array of hashes
# where each hash has sub-hashes, one per "asset". Some will just be
# 'undef' meaning that they have no assets.
#
if ( defined( $meta{links}->[$i] ) ) {
printf "%23s: %s\n", 'links',
scalar( keys( %{ $meta{links}->[$i] } ) );
}
else {
printf "%23s: 0\n", 'links';
}
print '-' x 80, "\n";
}
}
#-------------------------------------------------------------------------------
# Output asset counts if requested
# TODO: Used in 'past_upload', which is no longer used (since that project is
# complete).
#-------------------------------------------------------------------------------
if ($acountfile) {
my $acount_lines = 0;
open( my $acount, '>:encoding(UTF-8)', $acountfile )
or die "Unable to open $acountfile for output: $!\n";
foreach my $i ( 0 .. $#range ) {
if ( defined( $meta{links}->[$i] ) ) {
printf $acount "%s %d\n", $meta{identifier}->[$i],
scalar( keys( %{ $meta{links}->[$i] } ) );
$acount_lines++;
}
}
close($acount);
#
# If we never wrote anything to the asset count file then delete it
#
if ( $acount_lines == 0 ) {
print "Deleting empty '$acountfile'\n" if $verbose;
unlink($acountfile);
}
}
#-------------------------------------------------------------------------------
# Download any linked files unless the -noassets option was used.
# Report the details in verbose mode. Only download if the file is not already
# downloaded.
# NOTE: HTML downloads will have already happened since we need to recursively
# parse and modify them. So if an asset is HTML on the HPR site an earlier
# stage will have downloaded it so that it can be parsed for any further HTML
# or assets.
#-------------------------------------------------------------------------------
if ($assets) {
foreach my $i ( 0 .. $#range ) {
if ( defined( $meta{links}->[$i] ) ) {
for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
my $linkfile = $meta{links}->[$i]->{$key}->{cached};
if ( ! -e $linkfile ) {
download_url( $key, $linkfile,
$meta{links}->[$i]->{$key}->{new},
$verbose, $silent );
}
}
}
}
}
#-------------------------------------------------------------------------------
# Generate the completed CSV
#-------------------------------------------------------------------------------
#
# First build and output the header fields. If there's a single header then just
# output it, but if it's a multi-item thing output the maximum with indexes
# (like "subject[0]").
#
for my $ind ( 0 .. $#fields ) {
if ( $counts[$ind] == 1 ) {
push( @head, $fields[$ind] );
}
else {
for ( my $i = 0; $i < $counts[$ind]; $i++ ) {
push( @head, $fields[$ind] . "[$i]" );
}
}
}
$csv->print( $out, \@head );
#
# Build and output the data. The structure we have is a hash where each
# element is an array with an element per episode (0 .. $range). The
# per-episode elements may be scalars (for single-value fields) or arrays (for
# multi-value). In the latter case we have to make sure we have filled all of
# the header positions (e.g. we have 10 "subject" headers, but this episode
# has only 1 - we have to fill in the remaining 9 slots with empty items).
#
foreach my $i ( 0 .. $#range ) {
@data = ();
my $ind = 0;
foreach my $fld (@fields) {
my $count = 0;
if ( ref( $meta{$fld}->[$i] ) eq 'ARRAY' ) {
foreach my $elem ( @{ $meta{$fld}->[$i] } ) {
push( @data, $elem );
$count++;
}
}
else {
push( @data, $meta{$fld}->[$i] );
$count++;
}
#
# Cater for any blank slots
#
if ( $count < $counts[$ind] ) {
push( @data, undef ) for 1 .. ( $counts[$ind] - $count );
}
$ind++;
}
#print Dumper( \@data ), "\n";
$csv->print( $out, \@data );
#
# If there are additional audio files or supplementary files for the
# current show write a CSV row for each of them here.
# NOTE: originally the CSV format did not require the second and
# subsequent rows belonging to an IA identifier to have an identifier in them
# (maybe it was even illegal in the early days). Now the ia tool seems to
# have made it mandatory - surprisingly.
#
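#
# An illustrative fragment of the resulting CSV for a show with one extra
# audio file (most fields elided; values are hypothetical):
#
# identifier,file,title,...
# hpr0100,/data/IA/uploads/hpr0100.wav,hpr0100 :: Example title,...
# hpr0100,/data/IA/uploads/hpr0100.flac,,...
#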
if ( defined( $meta{audio}->[$i] ) ) {
for my $audio (@{$meta{audio}->[$i]}) {
#
# Each row is a file path which we'll add to the CSV. We use the
# length of the @head array to get the number of CSV fields
#
@data = ();
# push( @data, undef, $audio );
push( @data, $meta{identifier}->[$i], $audio );
push( @data, undef ) for 1 .. ( scalar(@head) - 2 );
$csv->print( $out, \@data );
}
}
#
# The 'links' key holds an array of hashes containing details of the
# 'assets' relating to a show (files other than audio files, but including
# the original "source" audio). In make_metadata V0.3.x we used these
# hashes to add rows to the CSV file which results in the upload of these
# files. However, this mechanism does not cater for (a) the uploading of
# directories to the IA, and (b) the ability to disable the "derive"
# mechanism for a given file. Therefore we build a Bash script containing
# a function that calls 'ia' commands which can be used instead, but only
# if $meta_only is not set.
# TODO: 2023-07-06 We want to upload the transcripts created by 'whisper'
# and placed in the 'uploads' directory. We will add these to the 'script'
# file.
#
if ( ! $meta_only && defined( $meta{links}->[$i] ) ) {
for my $key ( keys( %{ $meta{links}->[$i] } ) ) {
#
# Build a call to the 'Upload' function being explicit about the
# remote name
#
( my $rem = $meta{links}->[$i]->{$key}->{cached} )
=~ s|$config{uploads}/||;
printf $script $iauploadtemplate,
$meta{identifier}->[$i],
$meta{links}->[$i]->{$key}->{cached},
$rem,
$iauploadoptions;
$script_lines++;
}
}
}
#
# Report the output and script file names
#
unless ($silent) {
print "Output file: $outfile\n";
print "Script file: $scriptfile\n" unless ( $script_lines eq 0 );
}
close($out);
close($script);
#
# If we never wrote anything to the script file then delete it
#
if ( $script_lines == 0 ) {
print "Deleting empty '$scriptfile'\n" if $verbose;
unlink($scriptfile);
}
exit;
#=== FUNCTION ================================================================
# NAME: make_item
# PURPOSE: Build an Archive.org item name for this item
# PARAMETERS: $h - hashref returned from the database
# $testmode - Boolean denoting test mode
# RETURNS: The newly fashioned item string
# DESCRIPTION: The method for generating a unique item string for
# Archive.org is embodied in this function
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub make_item {
my ( $h, $testmode ) = @_;
return sprintf( "%shpr%04d", ( $testmode ? 'test_' : '' ), $h->{eps_id} );
}
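#
# For example, make_item( { eps_id => 42 }, 0 ) returns 'hpr0042' and
# make_item( { eps_id => 42 }, 1 ) returns 'test_hpr0042'.
#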
#=== FUNCTION ================================================================
# NAME: make_filename
# PURPOSE: Determine the filename path for the metadata
# PARAMETERS: $path - relative path to the files
# $template - template for forming the file name
# $ftypes - arrayref holding ordered file types
# $h - hashref returned from the database
# RETURNS: The path to the file
# DESCRIPTION: Forms the file path then checks that there really is a file of
# that name. Since we don't have *.wav files for every episode
# we might have one of the other formats. The array referenced
# by $ftypes contains a list of these file types, starting with
# the WAV default. At the moment we only look at the second
# choice if there is no WAV, but we could be more fancy if
# necessary.
# TODO The design of this code is not good. The earlier fetch
# stage will have determined what is available. We are
# potentially repeating these checks here, though they are
# necessary if -nofetch was specified. Needs some further
# thought.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub make_filename {
my ( $path, $template, $ftypes, $h ) = @_;
my $file;
my $ft = 0;
#
# Look for the WAV file first. If found, return it
#
$file = sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] );
if ( -e $file ) {
return $file;
}
else {
#
# Look for the alternative type of file and return that if found
#
$ft++;
$file
= sprintf( "%s/$template", $path, $h->{eps_id}, $ftypes->[$ft] );
if ( -e $file ) {
return $file;
}
else {
warn "No file found for hpr" . $h->{eps_id} . "\n";
return;
}
}
}
#=== FUNCTION ================================================================
# NAME: db_title
# PURPOSE: Return a title for the Archive.org item
# PARAMETERS: $h - hashref returned from the database
# RETURNS: The title
# DESCRIPTION: Makes the title string we want for the item
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub db_title {
my ($h) = @_;
# print STDERR "D> ", $h->{title}, "\n";
return sprintf( "hpr%04d :: %s", $h->{eps_id}, $h->{title} );
}
#=== FUNCTION ================================================================
# NAME: db_notes
# PURPOSE: Return the description for an episode
# PARAMETERS: $sourceURLtpl - printf template for making the HPR link to
# the original show (from config file)
# $h - hashref returned from the database
# $tree - parsed notes
# $asource - arrayref containing source audio file URLs
# RETURNS: The description built from the database for Archive.org
# DESCRIPTION: This routine generates the item description for Archive.org as
# HTML by concatenating the summary, series name (if there is
# one), duration, link to episode on the HPR site and the notes
# into a long string. The main work is in the formatting of the
# notes element which is stored as HTML in the database. Since
# we are making a single string from the multi-line data we have
# to take special action to preserve the contents of <pre> tags.
# See the comments in 'flatten_pre' for what is being done to
# achieve this.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub db_notes {
my ( $sourceURLtpl, $h, $tree, $asource ) = @_;
my ( $lines, $episode, $desc, $sourceurl, $notes );
#
# Ensure leading zeroes on the episode number and prepend 'hpr'
#
$episode = sprintf('hpr%04d', $h->{eps_id});
# $sourceurl = sprintf( "http://hackerpublicradio.org/eps.php?id=%04d",
# $h->{eps_id} );
$sourceurl = sprintf( $sourceURLtpl, $episode );
#
# Make the header with newlines so we can count them
# TODO: Should we use the series number 0 which denotes "no series" rather
# than the name "general"? Need to adjust the query to return this if so.
# FIXME: The "not series" 'general' is being removed and the logic will
# use NULL for an episode that is not in a series. The local copy of the
# database has had this change made, but not the live one. We have to
# cater for both. When it goes we can drop references to 'general'.
#
$desc = sprintf( "Summary: %s\n", $h->{summary} );
unless ( !defined( $h->{s_name} ) or $h->{s_name} eq 'general' ) {
$desc .= sprintf( "Series: %s\n", coalesce( $h->{s_name}, '' ) );
}
#$desc .= sprintf( "Duration: %s\n", $h->{duration} );
$desc .= sprintf( "Source: <a href=\"%s\">%s</a>\n", $sourceurl, $sourceurl );
#
# Add a pointer to the source audio if there is any (cater for multiple,
# but it's not likely we'll get more than one)
#
if (@{$asource}) {
foreach my $aurl (@{$asource}) {
$desc .= sprintf( "Original audio: <a href=\"%s\">%s</a>\n",
$aurl, $aurl );
}
}
#
# Count the lines so we can add <br/> tags at the end to make the header
# a standard size of 4 lines high to accommodate the media widget.
# TODO: Rationalise this since there is no media widget any more.
# Temporarily boosted the height to 6 lines now the Original audio has
# been added.
#
$lines = $desc =~ tr/\n//;
$desc =~ s#\n#<br />#g;
$desc .= ( "<br />" x ( 6 - $lines ) );
#
# Prepare the notes and add to the header. Actions are:
# 1. Make all <pre> nodes compatible by adding <br/>
# 2. Generate entities for all non-ASCII
# 3. Do the silly thing of double-encoding entities to counteract the IA
# bug
# 4. Remove all newlines ('ia' doesn't seem to know what to do with them)
# 5. Trim trailing spaces
#
# ----
# NOTE: Removed the double-encoding on 2017-10-07 since (unannounced) it
# looks as if the IA have fixed this bug. There is some debate over
# whether we should be encoding non-ASCII also.
# ----
# NOTE: 2017-11-25 restored the double-encoding since the bug bit again at
# the last upload. This is not the whole story though. Added a step to
# decode the notes before re-encoding the non-ASCII bits. This is to
# counteract what seems to be being done by Pandoc if we process the notes
# from plain text.
# ----
# NOTE: 2021-02-20 found a problem with the strategy of decoding
# everything and then encoding it again (selectively). The issue was seen
# in HPR show 3284 where the HTML notes contains a <pre> section with C++
# code in it which uses characters like '<' and '>'. These were encoded as
# HTML entities at shownote preparation time since they were flagged as
# bad HTML by my validator script. The original algorithm then decoded
# these characters but couldn't encode them with encode_entities because
# that would have encoded all the HTML tags. The decode/encode stuff is
# therefore HTML context dependent, and we don't have the means of
# handling this with the current script. So, the solution seems to be not
# to decode and encode at all. After running flatten_pre, simply run the
# re_encode_entities and trust that there's nothing in the notes that needs
# special action here. Anyway, the preparation stage should have handled
# whatever's necessary.
# ----
#
#$notes = flatten_pre( $h->{notes} );
$notes = flatten_pre($tree);
## $notes = decode_entities($notes);
## $notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
## $notes = re_encode_entities($notes);
$notes =~ s/\n//g;
$desc .= $notes;
$desc =~ s/\s*$//;
return $desc;
}
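#
# For a show with no series and no source audio the header part of the
# description comes out like this (illustrative values):
#
# Summary: An example show<br />Source: <a href="URL">URL</a><br /><br /><br /><br /><br />
#
# i.e. the Source line's newline plus enough '<br />' padding to make the
# header six lines high, followed by the flattened notes.
#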
#=== FUNCTION ================================================================
# NAME: db_tags
# PURPOSE: Return the episode search tags as an array for populating
# 'subject' fields on Archive.org
# PARAMETERS: $h - hashref returned from the database
# RETURNS: A reference to an array of tags or undef if there are none
# DESCRIPTION: In the database the tags are held as a comma-delimited string.
# We need to turn this into an array to populate 'subject'
# fields in the CSV file. If there are no tags the caller needs
# an undef to be returned to indicate that a blank field needs
# to be constructed in the CSV. The format of the database
# string may be messy since we rely on the show's host to submit
# this information. We split the string taking this into
# consideration.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub db_tags {
my ($h) = @_;
return $h->{tags} ? [ split( /\s*,\s*/, $h->{tags} ) ] : undef;
}
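#
# For example, a 'tags' value of 'Perl, CSV ,metadata' returns
# [ 'Perl', 'CSV', 'metadata' ], and an empty or NULL value returns undef.
#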
#=== FUNCTION ================================================================
# NAME: flatten_pre
# PURPOSE: Process notes "flattening" <pre> contents
# PARAMETERS: $tree HTML::TreeBuilder object containing parsed and
# partially processed notes
# RETURNS: Processed notes
# DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
# contain some formatting tags like <em> and <code>, but spaces
# and newlines are significant. The Internet Archive upload API
# uses HTTP headers which are text strings without newlines, so
# when these tags are uploaded through this route some
# formatting is lost. What this routine does is parse the
# contents of all <pre> sections in $notes, adding <br/> tags
# to replace newlines. It has to perform a full parse
# since the contents may include HTML tags and these need to be
# passed through intact. It calls the subroutine 'flatten_item' to
# deal with the recursive nature of HTML tags.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
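#
# For example, a fragment like:
#
# <pre><code>first line
# second line</code></pre>
#
# is rebuilt as:
#
# <pre><code>first line<br />second line</code></pre>
#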
sub flatten_pre {
my ($tree) = @_;
#
# Find all the <pre> tags
#
my @pre_tags = $tree->look_down( _tag => 'pre', );
#
# Walk the various <pre> elements in the document
#
foreach my $tag (@pre_tags) {
#
# Save the tag and empty the original
#
my $saved = $tag->clone();
$tag->delete_content();
#
# Walk the saved content and rebuild the tag into $atag using the
# nested arrayref structure permitted by HTML::Element for
# convenience (the alternative is a little nasty). See the
# documentation for 'new_from_lol' in HTML::Element.
#
my $atag;
foreach my $item ( @{ $saved->content_array_ref } ) {
push( @$atag, flatten_item($item) );
}
#
# Rebuild the tag from the arrayref we built. We treat the arrayref
# structure we just built as an array because otherwise the top level
# is interpreted as a spurious <null> tag.
#
$tag->push_content(@$atag);
}
#
# Trim out the original notes from the enclosing tags we added earlier
#
my $body = $tree->look_down( _tag => 'body' );
( my $result = $body->as_HTML( undef, ' ', {} ) )
=~ s{(^<body[^>]*>|</body>$)}{}gi;
return $result;
}
#=== FUNCTION ================================================================
# NAME: flatten_item
# PURPOSE: Recursively "flatten" items within the enclosing <pre>
# PARAMETERS: $item an HTML::Element item parsed from the original
# <pre> section
# RETURNS: An arrayref if the last seen item was a tag, otherwise a list
# DESCRIPTION: Since <pre> sections can contain inline elements which change
# the rendering of the text we need to parse these as we add
# <br/> tags. This routine does this by recursively descending
# through the contents. A common tag sequence is <pre><code> for
# scripts and the like. This routine deals with such sequences.
# It expects to receive the contents in sequence and builds the
# result as a nested arrayref structure.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub flatten_item {
my ($item) = @_;
return unless defined($item);
my ( @result, %attr );
#
# Is it a sub-tag or non-tag content?
#
if ( ref($item) ) {
#
# It's a tag. Save the tag name and any attributes and recurse into
# it. Return an arrayref
#
push( @result, $item->tag() );
%attr = $item->all_external_attr();
push( @result, \%attr ) if %attr;
for my $child ( $item->content_list() ) {
push( @result, flatten_item($child) );
}
return \@result;
}
else {
#
# It's non-tag content. Join the lines with <br/> tags. Return an
# array (since this is a simple list).
#
# Note that we split with a LIMIT of -1 which causes any trailing list
# items to be returned; default behaviour is to drop them.
#
$item =~ s/\r//g;
my @content = split( /\n/, $item, -1 );
if (@content) {
#
# Remove a leading blank line - usually the result of
# a "<pre>'NL'text" sequence
#
shift(@content) if ( $content[0] =~ /^\s*$/ );
#
# Join back the lines with <br/> tags between them.
#
foreach my $txt (@content) {
push( @result, $txt, ['br'] );
}
#
# Remove the <br/> at the end, it's spurious
#
pop(@result);
}
return (@result);
}
}
#=== FUNCTION ================================================================
# NAME: re_encode_entities
# PURPOSE: Find all encoded entities and encode them all over again
# PARAMETERS: $notes - string containing unprocessed notes
# RETURNS: Processed notes
# DESCRIPTION: Uses a brutal regular expression substitution approach, but
# since this is a very unusual requirement brought about by what
# is essentially a bug in the way the Internet Archive stores
# and processes metadata, we have no choice.
# THROWS: No exceptions
# COMMENTS: It looks as if the software on archive.org saves HTML metadata
# in an XML file by encoding it as entities. So HTML tags are
# turned into &lt;TAG&gt; sequences. Then when HTML is required
# the XML is decoded back to <TAG>. Unfortunately any existing
# entities in the HTML like '&lt;' will also be decoded in this
# phase which may result in invalid HTML sequences. In this
# routine we are converting existing entities so that the
# decoding phase turns them into valid entities. The archive.org
# software should be doing this, but it isn't, and any messages
# sent to the staff there are ignored. Of course, there may be
# a point at which this bug is corrected and the double encoding
# process here becomes redundant. Then the call to this routine
# can be omitted.
# Note also that the IA editor corrupts HTML containing entities
# so should not be used. To make a change, edit the notes on the
# HPR database and generate new metadata with this script. Use
# the -meta_only option to avoid the need for the media (unless
# it too is being updated of course) and re-submit the CSV file
# to ias3upload.pl or ia.
# SEE ALSO: N/A
#===============================================================================
sub re_encode_entities {
my ($notes) = @_;
#
# Replace any '&xxx;' sequence by '&amp;xxx;'
#
$notes =~ s#\&([^;]+;)#&amp;$1#g;
return $notes;
}
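#
# For example, 'if (a &lt; b)' becomes 'if (a &amp;lt; b)', which the IA's
# decode pass then turns back into the intended 'if (a &lt; b)'.
#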
#=== FUNCTION ================================================================
# NAME: find_links_in_notes
# PURPOSE: Finds HPR links in show notes
# PARAMETERS: $episode episode number we're dealing with
# $notes a string containing the shownotes
# $rtree reference to an HTML::TreeBuilder object built
# from the notes
# $rlinks hashref containing the links found
# $rconfig hashref containing config data
# $verbose setting controlling what reports are generated
# $silent setting controlling what reports are generated
# RETURNS: Count of the number of links
# DESCRIPTION: Parses the notes passed as an argument then scans these notes
# looking for links which relate to items on the HPR server.
# Normally these are files and images but could be anchor
# references as well. If the latter then the URL has to be
# parsed to remove the anchor. Unique links are saved in a hash
# as the path to the file that will be saved on the VPS and as
# the link to the item in the new notes. This hash will be
# passed back to the caller so it can get the file(s) and
# prepare them for upload to the IA. Finally, the link in the
# notes is modified to refer to the file that will be uploaded
# to the IA. The function returns the number of links it has
# recorded (not the number it has changed), as well as passing
# back the parsed tree and the link hash it has constructed.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub find_links_in_notes {
my ( $episode, $notes, $rtree, $rlinks, $rconfig, $verbose, $silent )
= @_;
my ( $linkre, $epstr, $uri, $slink );
my ( $oldfile, $newfile, $newURL );
_debug( $DEBUG >= 3, "Entered find_links_in_notes\n" );
#
# Initialise links
#
undef %$rlinks; # TODO: Consider whether we should do this?
#
# Create an object that can be shared downstream
#
$$rtree = HTML::TreeBuilder->new;
$$rtree->ignore_unknown(0);
$$rtree->no_expand_entities(1);
$$rtree->p_strict(1);
$$rtree->store_comments(1);
$$rtree->warn(1);
#
# Parse the notes. Die if we fail because then we know this show needs
# some urgent attention.
#
$$rtree->parse_content($notes)
or die "HTML::TreeBuilder failed to parse notes: $!\n";
#
# NOTE: No longer filtering out links not relating to the episode.
# Experience with 'upload_manager' has shown that there are some weird
# links in the HPR database which will cause problems.
#
# Regular expression to match links to the HPR server. Examples:
# http://hackerpublicradio.org/eps/hpr2153.png
# http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
# Also things like this (**Why Ken?**)
# ../eps/hpr2945/IMG_20191018_122746Z.jpg
# Don't match things like this when *not* processing 1986:
# http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
# ----------------------------------------------------------------------
# NOTE: 2024-07-08
#
# It used to be that we added a top-level hprXXXX directory to URLs
# because there wasn't one on the HPR server. This was because the
# majority of shows without assets had no files; the notes were taken from
# the database and displayed dynamically.
#
# Now all HPR shows have a top-level directory for holding the index.html
# with the pre-created notes page. So we DO NOT want to create that
# top-level part. The RE below matches but doesn't store it or we'd get
# one too many directory levels.
# ----------------------------------------------------------------------
#
$epstr = sprintf( "hpr%04d", $episode );
# my $re
# = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
$linkre = qr{
^https?://
(?:www\.)?
(?:hacker|hobby)publicradio\.org/eps/
$epstr/
(.+)$
}x;
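#
# For example, when processing episode 1234 the URL
# https://hackerpublicradio.org/eps/hpr1234/picture.png
# matches, and the capture holds just 'picture.png' - the 'hpr1234/' part is
# matched but not stored, as explained above.
#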
#
# Scan the HTML tree for links we care about
#
for ( @{ $$rtree->extract_links( 'a', 'img' ) } ) {
my ( $link, $element, $attr, $tag ) = @$_;
#
# Standardise the link (expands relative URLs, removes any fragment).
# Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
# URIs are removed.
#
local $URI::ABS_REMOTE_LEADING_DOTS = 1;
$uri = URI->new_abs( $link, $rconfig->{baseURL} );
$slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );
_debug( $DEBUG >= 3, "\$uri = $uri\n" );
_debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
if $uri->fragment;
_debug( $DEBUG >= 3, "\$slink = $slink\n" );
#
# Is it an HPR link?
#
if ( $slink =~ $linkre ) {
#
# Save the last bracketed match, without any 'fragment' if there
# is one (we want this not to be URL-related)
# NOTE: Will we ever have a fragment here?
#
( $oldfile = "$+" ) =~ s/#.*$//;
_debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );
#
# Does this file path begin with an 'hpr' prefix? If so is it the
# show id? If not we don't want to process it.
#
if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
if ( $1 ne $epstr ) {
_debug( $DEBUG >= 3, "Ignored $slink\n" );
next;
}
}
#
# The path and URL might end with a slash which means the URL is
# relying on the Web server to fill in the filename as
# 'index.html'. We have to make this explicit.
#
if ( $slink =~ /\/$/ ) {
$slink .= 'index.html';
$oldfile .= 'index.html';
}
#
# Save the original link if it's unique. We have a hashref in
# $rlinks (pointing to a global hash).
#
# We add a key made from the parsed link in which we store an
# anonymous hashref containing the name of the file we'll store in
# the cache area for upload, and the new IA-based URL we'll use in
# the notes. (Files used to be renamed with an 'hprNNNN_' prefix to
# make their show membership clear, but that renaming is no longer
# done; see the note below.)
#
unless ( exists( $rlinks->{$slink} ) ) {
#
# Originally we turned "hpr9999/file.dat" into
# "hpr9999_file.dat". We don't want to do this any more so the
# code is much simpler
#
$newfile = $rconfig->{uploads} . "/$oldfile";
$newURL = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );
#
# Save the link details as a sub-hash indexed by the
# standardised URL. Elements are:
# {
# cached => 'file path for the cache area',
# new => 'URL to be used on the IA',
# }
#
$rlinks->{$slink} = {};
$rlinks->{$slink}->{cached} = $newfile;
$rlinks->{$slink}->{new} = $newURL;
}
#
# Simply change the content of the link (in the parsed HTML) with
# the new URL built above. We know the attribute ('src' or 'href')
# from what 'extract_links' returned. Deal with any fragment we
# found as well.
#
if ( $uri->fragment ) {
$element->attr( $attr,
$rlinks->{$slink}->{new} . '#' . $uri->fragment );
}
else {
$element->attr( $attr, $rlinks->{$slink}->{new} );
}
}
}
#
# If we found any links the notes will have been changed, but since the
# tree is shared with the caller this will result in changes to the
# metadata. However, we want to look at any HTML files in those links
#
if ( scalar( keys(%$rlinks) ) > 0 ) {
#
# If any links are HTML we need to download them and recursively parse,
# record and possibly change their contents. This updates $rlinks
#
foreach my $key ( keys(%$rlinks) ) {
if ( $key =~ /\.html$/ ) {
my $linkfile = $rlinks->{$key}->{cached};
#
# Get the file unless we've already collected it
#
if ( ! -e $linkfile ) {
download_url( $key, $linkfile, $rlinks->{$key}->{new},
$verbose, $silent );
}
#
# Do the recursive parsing since we need to know about further
# links
#
find_links_in_file(
$episode, $linkfile, $rlinks,
$rconfig, $verbose, $silent
);
}
}
}
#
# Return the link count after all recursion through supplementary files
# and so on
#
return scalar( keys(%$rlinks) );
}
#=== FUNCTION ================================================================
# NAME: find_links_in_file
# PURPOSE: Finds HPR links in related files
# PARAMETERS: $episode episode number we're dealing with
# $filename file being examined
# $rlinks hashref containing the links found
# $rconfig hashref containing config data
# $verbose setting controlling what reports are generated
# $silent setting controlling what reports are generated
# RETURNS: The number of new links found and downloaded
# DESCRIPTION: The routine is related to 'find_links_in_notes' but doesn't work
# quite the same way. It is given the name of an HTML file which has
# already been downloaded. It parses this with HTML::TreeBuilder
# because it needs to be scanned and possibly edited. The HTML
# tree is scanned for <a> and <img> tags. If the URL returned is
# an HPR URL then we will want to change it to an IA one so the
# show can reference its components on archive.org and be
# independent of HPR. If a link is found which hasn't been seen
# before it is saved in the $rlinks hash keyed by the original
# URL. The link URL is replaced by the IA version in the HTML
# tree. Once the scan is complete if URL edits have been counted
# the file is rewritten from the modified tree. Then if any new
# links have been found these are checked, and if any are HTML
# they are downloaded ready to be parsed. Parsing is done by
# recursively calling this routine. Other links, which are not
# HTML exist in the $rlinks hash and are downloaded later in the
# script since they do not need to be edited.
# NOTE: As of 2017-12-10 this code has been used in a live
# situation but the recursive capabilities have not been
# invoked. This is because we have shows with the usual short
# notes which reference longer notes, and sometimes the latter
# have links to other files. However, so far none of these cases
# have referenced further HTML files needing scanning and
# editing.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub find_links_in_file {
my ( $episode, $filename, $rlinks, $rconfig, $verbose, $silent ) = @_;
my ( $linkre, $epstr, $tree, $uri, $slink );
my ( $oldfile, $newfile, $newURL );
my ( $encoding, $linkcount, $linkedits );
_debug( $DEBUG >= 3, "Entered find_links_in_file\n" );
#
# Create a tree object
#
$tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
$tree->store_comments(1);
$tree->warn(1);
#
# Parse the file using IO::HTML to grab it. Die if we fail because then we
# know this stuff needs some urgent attention.
#
$tree->parse_file( html_file($filename) )
or die "HTML::TreeBuilder failed to process $filename: $!\n";
#
# NOTE: No longer filtering out links not relating to the episode.
# Experience with 'upload_manager' has shown that there are some weird
# links in the HPR database which will cause problems.
#
# Regular expression to match links to the HPR server. Examples:
# http://hackerpublicradio.org/eps/hpr2153.png
# http://www.hackerpublicradio.org/eps/hpr1303/Music_Notes.html
# Also things like this (**Why Ken?**)
# ../eps/hpr2945/IMG_20191018_122746Z.jpg
# Don't match things like this when *not* processing 1986:
# http://hackerpublicradio.org/eps/hpr1986/full_shownotes.html#example-2
#
$epstr = sprintf( "hpr%04d", $episode );
# my $re
# = qr{^http://(?:www.)?(?:hacker|hobby)publicradio.org/eps/(?:hpr$epstr/(.+)|(hpr$epstr.+))$}x;
    $linkre = qr{
        ^https?://
        (?:www\.)?
        (?:hacker|hobby)publicradio\.org/eps/
        (.+)$
    }x;
#
# Counting new links found and stashed as well as edits made
#
$linkcount = 0;
$linkedits = 0;
#
# Scan the HTML tree for links we care about
#
for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
my ( $link, $element, $attr, $tag ) = @$_;
#
# Standardise the link (expands relative URLs, removes any fragment)
# Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
# URIs are removed.
#
local $URI::ABS_REMOTE_LEADING_DOTS = 1;
$uri = URI->new_abs( $link, $rconfig->{baseURL} );
$slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );
_debug( $DEBUG >= 3, "\$uri = $uri\n" );
_debug( $DEBUG >= 3, "\$uri->fragment = " . $uri->fragment )
if $uri->fragment;
_debug( $DEBUG >= 3, "\$slink = $slink, \n" );
#
# Is it an HPR link?
#
if ( $slink =~ $linkre ) {
#
# Save the last bracketed match, without any 'fragment' if there
# is one (we want a plain file path here, not URL components)
# NOTE: Will we ever have a fragment here?
#
( $oldfile = "$+" ) =~ s/#.*$//;
_debug( $DEBUG >= 3, "\$oldfile = $oldfile\n" );
#
# Does this file path begin with an 'hpr' prefix? If so is it the
# show id? If not we don't want to process it.
#
if ( $oldfile =~ /^(hpr[0-9]{1,4})/ ) {
if ( $1 ne $epstr ) {
_debug( $DEBUG >= 3, "Ignored $slink\n" );
next;
}
}
#
# The path and URL might end with a slash which means the URL is
# relying on the Web server to fill in the filename as
# 'index.html'. We have to make this explicit.
#
if ( $slink =~ /\/$/ ) {
$slink .= 'index.html';
$oldfile .= 'index.html';
}
#
# Save the original link if it's unique. We have a hashref in
# $rlinks (pointing to a global hash).
#
# We add a key made from the parsed link in which we store an
# anonymous hashref containing the name of the file we'll store in
# the cache area for upload, and the new IA-based URL we'll use in
# the notes. (Files used to be renamed with an 'hprNNNN_' prefix to
# make it clear they belonged to the show, but this renaming is no
# longer done; see the note below.)
#
unless ( exists( $rlinks->{$slink} ) ) {
#
# Originally we turned "hpr9999/file.dat" into
# "hpr9999_file.dat". We don't want to do this any more so the
# code is much simpler
#
$newfile = $rconfig->{uploads} . "/$oldfile";
$newURL = sprintf( $rconfig->{IAURLtemplate}, $epstr, $oldfile );
#
# Save the link details as a sub-hash indexed by the
# standardised URL. Elements are:
# {
# cached => 'file path for the cache area',
# new => 'URL to be used on the IA',
# }
#
$rlinks->{$slink} = {};
$rlinks->{$slink}->{cached} = $newfile;
$rlinks->{$slink}->{new} = $newURL;
$linkcount++;
}
#
# Simply change the content of the link (in the parsed HTML) with
# the new URL built above. We know the attribute ('src' or 'href')
# from what 'extract_links' returned. Deal with any fragment we
# found as well.
#
if ( $uri->fragment ) {
$element->attr( $attr,
$rlinks->{$slink}->{new} . '#' . $uri->fragment );
}
else {
$element->attr( $attr, $rlinks->{$slink}->{new} );
}
$linkedits++;
}
}
#
# If we found HPR links then we'll have changed them in the tree, so we
# need to update the file. We use 'as_HTML' with no entities, indentation
# by one space and no optional end tags (see HTML::Element). If the file
# gets examined after the edits there should be no eligible HPR links, so
# no action.
#
if ( $linkedits > 0 ) {
open( my $out, ">:encoding(UTF-8)", $filename )
or die "Unable to open $filename for writing: $!\n";
print $out $tree->as_HTML( undef, ' ', {} );
close($out);
print STDERR "Altered links in $filename\n" unless $silent;
}
#
# If we found new links then $linkcount will be non-zero and we need to
# download and parse them, otherwise nothing else to do.
#
if ( $linkcount > 0 ) {
#
# If any links are HTML we need to download them and recursively parse,
# record and possibly change their contents
#
foreach my $key ( keys(%$rlinks) ) {
if ( $key =~ /\.html$/ ) {
my $linkfile = $rlinks->{$key}->{cached};
#
# Get the file unless we've already collected it
#
if ( !-e $linkfile ) {
download_url( $key, $linkfile, $rlinks->{$key}->{new},
$verbose, $silent );
}
#
# Do the recursive parsing since we need to know about further
# links
#
find_links_in_file(
$episode, $linkfile, $rlinks,
$rconfig, $verbose, $silent
);
}
}
}
#
# Return the link count
#
return $linkcount;
}
#=== FUNCTION ================================================================
# NAME: download_url
# PURPOSE: Download a file from an URL
# PARAMETERS: $from the URL the file is to be collected from
# $to the path to which the file is to be written
# $upload the eventual IA URL (for information)
# $verbose setting controlling what reports are generated
# $silent setting controlling what reports are generated
# RETURNS: The status value from the download
# DESCRIPTION: If 'verbose' is true the details of the download are reported.
# We are to download from the URL specified as $from, and the
# destination is the path in $to which is in a temporary cache
# area. This may result in the original file being renamed. The
# value in $upload shows the URL the file will be available at
# on the IA once the metadata has been used for uploads. A brief
# message is written to STDERR by default then the HTTP download
# is initiated with a warning generated if the download is not
# successful.
# Moved from LWP::Simple to LWP::UserAgent to get more control
# over the download. The former failed when an HTTPS URL was
# used and the server didn't offer this type of connection,
# whereas the latter does not fail in that situation.
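# As an illustration (the path shown is hypothetical): with
# $to set to '/var/IA/uploads/hpr1303/Music_Notes.html' the
# directory '/var/IA/uploads/hpr1303' is created, if it does
# not already exist, before the download starts.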
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub download_url {
my ( $from, $to, $upload, $verbose, $silent ) = @_;
my ( $dirname, $status );
if ($verbose) {
print "Link to be downloaded: $from\n";
print " to ", $to, "\n";
print " and uploaded as ", $upload, "\n";
}
#
# Extract the directory from the path. If the directory doesn't exist then
# make it.
#
( $dirname = $to ) =~ s|/?[^/]*$||mx;
make_path($dirname) unless -d $dirname;
#
# Collect the file and save it
#
print STDERR "Downloading $from\n" unless $silent;
$status = getstore( $from, $to, $verbose );
$status == 200 or warn "Download failed: $status : $from ($to)\n";
print '~' x 80, "\n" if $verbose;
return $status;
}
#=== FUNCTION ================================================================
# NAME: getstore
# PURPOSE: Get an URL and store the result in a file
# PARAMETERS: $from the URL to download
# $to where to put the result
# $verbose setting controlling what reports are generated
# RETURNS: The status code from the 'get'
# DESCRIPTION: When using LWP::Simple an attempt to fetch an URL with an
# 'https' method failed since the server doesn't offer this
# service. Using LWP::UserAgent this is not a problem, so we are
# effectively emulating LWP::Simple in a more complex way!
# However, we now have a means of taking action when the
# download fails in some predictable way.
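# In effect this is a drop-in replacement for
# LWP::Simple's getstore($from, $to), but with an
# identifying User-Agent string and access to the full
# HTTP::Response object should more error handling be
# needed later.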
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub getstore {
my ( $from, $to, $verbose ) = @_;
#
# Create the agent, and identify it (a bit)
#
my $ua = LWP::UserAgent->new;
$ua->agent("make_metadata/$VERSION ");
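    # (The trailing space in the agent string makes LWP append its own
    # default identifier, e.g. 'libwww-perl/X.YZ')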
#
# Get the URL and store it to the requested file
#
my $res = $ua->get( $from, ':content_file' => $to );
#
# Report what happened
#
print $res->status_line, "\n" if $verbose;
#
# Return status
#
return $res->code;
}
#=== FUNCTION ================================================================
# NAME: expand_template
# PURPOSE: Turns a filename template into a filename or a default
# PARAMETERS: $option The input from the -output or -script option
# $prefix Filename prefix
# $suffix Filename suffix
# $lbound Lower bound of the episode range
# $ubound Upper bound of the episode range
# RETURNS: The expanded template or the default
# DESCRIPTION: The -output or -script options both take an optional template,
# so we need to expand it with episode numbers.
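# For example (a sketch of the actual behaviour):
# expand_template('metadata_%d-%d.csv', 'metadata', 'csv', 1500, 1510)
# returns 'metadata_1500-1510.csv', and an empty value for
# $option yields the same name via the default template.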
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub expand_template {
my ( $option, $prefix, $suffix, $lbound, $ubound ) = @_;
if ( defined($option) ) {
if ( $option =~ /^$/ ) {
#
# Build a default template depending on the episode range
#
if ( $lbound == $ubound ) {
$option = $prefix . '_%04d.' . $suffix;
$option = sprintf( $option, $lbound );
}
else {
$option = $prefix . '_%04d-%04d.' . $suffix;
$option = sprintf( $option, $lbound, $ubound );
}
}
elsif ( $option =~ /%(\d*)d/ ) {
#
# Caller specified a template. We need to check it
#
my $count = () = $option =~ /%(\d*)d/g;
die "Invalid - too many '%d' sequences in '$option'\n"
if ( $count > ( $lbound == $ubound ? 1 : 2 ) );
die "Invalid - too few '%d' sequences in '$option'\n"
if ( $count < ( $lbound == $ubound ? 1 : 2 ) );
$option =~ s/%(\d*)d/%04d/g;
$option = sprintf( $option, $lbound, $ubound );
}
}
else {
#
# The default
#
        $option = "$prefix.$suffix";
}
return $option;
}
#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the first defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument or undef if there are none
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
# returned when doing so might be a problem.
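# For example, coalesce( undef, 0, 'x' ) returns 0: the
# first *defined* value, not the first true one.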
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub coalesce {
foreach (@_) {
return $_ if defined($_);
}
return undef; ## no critic
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $message Message to print
# RETURNS: Nothing
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
# newline and then adds one in the 'print' so the caller doesn't
# have to bother. Prepends the message with 'D> ' to show it's
# a debug message.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my ( $active, $message ) = @_;
chomp($message);
print "D> $message\n" if $active;
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION: Process the options we want to offer. See the documentation
# for details
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "documentation|man",
"debug=i", "verbose!",
"silent!", "test!",
"meta_only|noaudio!", "from=i",
"to=i", "count=i",
"list=s", "output:s",
"fetch!", "ignore_missing|im!",
"assets!", "script:s",
"a_count=s", "dbconfig=s",
"config=s",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage(
-msg => "$PROG version $VERSION\n",
-exitval => 1,
-verbose => 0
);
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
make_metadata - Generate metadata from the HPR database for Archive.org
=head1 VERSION
This documentation refers to make_metadata version 0.4.14
=head1 USAGE
make_metadata [-help] [-documentation]
make_metadata -from=FROM [-to=TO] [-count=COUNT] [-output[=FILE]]
[-script[=FILE]] [-a_count=FILE] [-[no]meta_only] [-[no]fetch]
[-[no]assets] [-[no]silent] [-[no]verbose] [-[no]test]
[-[no]ignore_missing] [-config=FILE] [-dbconfig=FILE] [-debug=N]
make_metadata -list=LIST [-output[=FILE]] [-script[=FILE]]
[-[no]meta_only] [-[no]fetch] [-[no]assets] [-[no]silent]
[-[no]verbose] [-[no]test] [-[no]ignore_missing] [-config=FILE]
[-dbconfig=FILE] [-debug=N]
Examples:
make_metadata -from=1234 -nofetch
make_metadata -from=1234 -to=1235
make_metadata -from=1234 -count=10
make_metadata -from=1 -to=3 -output=metadata_1-3.csv
make_metadata -from=1500 -to=1510 -out=metadata_1500-1510.csv -verbose
make_metadata -from=1500 -to=1510 -out=metadata_%d-%d.csv -verbose
make_metadata -from=500 -to=510 -out=metadata_%04d-%04d.csv -verbose
make_metadata -from=1500 -to=1510 -out -verbose
make_metadata -from=1500 -to=1510 -out
make_metadata -from=1675 -to=1680 -out=metadata_%d-%d.csv -meta_only
make_metadata -from=1450 -test
make_metadata -list='1234,2134,2314' -out -meta_only
make_metadata -list="931,932,933,935,938,939,940" -out -meta -ignore
make_metadata -dbconf=.hpr_livedb.cfg -from=1234 -to=1235
make_metadata -from=3004 -out -meta_only -noassets
=head1 OPTIONS
=over 8
=item B<-help>
Reports brief information about how to use the script and exits. To see the
full documentation use the option B<-documentation> or B<-man>. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.
=item B<-documentation> or B<-man>
Reports full information about how to use the script and exits. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.
=item B<-debug=N>
Run in debug mode at the level specified by I<N>. Possible values are:
=over 4
=item B<0>
No debugging (the default).
=item B<1>
TBA
=item B<2>
TBA
=item B<3>
TBA
=item B<4 and above>
The metadata hash is dumped.
Each call of the function I<find_links_in_notes> is reported. On finding an
<a> or <img> tag the I<uri> value is shown, as is any fragment and the related
link. The original file is reported here.
Each call of the function I<find_links_in_file> is reported. On finding an
<a> or <img> tag the I<uri> value is shown, as is any fragment and the related
link. The original file is reported here, and if a link is to be ignored this
is reported.
=back
=item B<-from=NUMBER>
This option defines the starting episode number of a group. It is mandatory to
provide either the B<-from=NUMBER> option or the B<-list=LIST> option (see
below).
=item B<-to=NUMBER>
This option specifies the final episode number of a group. If not given the
script generates metadata for the single episode indicated by B<-from>.
The value given here must be greater than or equal to that given in the
B<-from> option. The option must not be present with the B<-count> option.
The difference between the episode numbers given by the B<-from> and B<-to>
options must not be greater than 20.
=item B<-count=NUMBER>
This option specifies the number of episodes to process (starting from the
episode number specified by the B<-from>) option. The option must not be
present with the B<-to> option.
The number of episodes specified must not be greater than 20.
=item B<-list=LIST>
This option is an alternative to B<-from=NUMBER> and its associated modifying
options. The B<LIST> is a comma-separated list of not necessarily consecutive
episode numbers, and must consist of at least one and no more than 20 numbers.
This option is useful for the case when non-sequential episode numbers are to
be uploaded, and is particularly useful when repairing elements of particular
episodes (such as adding summary fields and tags) where they have already
been uploaded.
For example, the following shows have no summary and/or tags, but the shows
are already in the IA. The missing items have been provided, so we wish to
update the HTML part of the upload:
$ ./make_metadata -list='2022,2027,2028,2029,2030,2033' -out -meta
Output file: metadata_2022-2033.csv
=item B<-output[=FILE]>
This option specifies the file to receive the generated CSV data. If omitted
the output is written to B<metadata.csv> in the current directory.
The file name may contain one or two instances of the characters '%d', with
a leading width specification if desired (such as '%04d'). These will be
substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if
B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the
appropriate endpoint (adding the count to the starting number). If neither
the B<-to=NUMBER> nor the B<-count=NUMBER> option is used then there should
only be one instance of '%d' or the script will abort.
If no value is provided to B<-output> then a suitable template will be
generated. It will be 'metadata_%04d.csv' if one episode is being processed, and
'metadata_%04d-%04d.csv' if a range has been specified.
Example:
./make_metadata -from=1430 -out=metadata_%04d.csv
the output file name will be B<metadata_1430.csv>. The same effect can be
achieved with:
./make_metadata -from=1430 -out=
or
./make_metadata -from=1430 -out
=item B<-script[=FILE]>
This option specifies the file to receive commands required to upload certain
files relating to a show. If omitted the commands are written to B<script.sh>
in the current directory.
The file name may contain one or two instances of the characters '%d', with
a leading width specification if desired (such as '%04d'). These will be
substituted by the B<-from=NUMBER> and B<-to=NUMBER> values or if
B<-from=NUMBER> and B<-count=NUMBER> are used, the second number will be the
appropriate endpoint (adding the count to the starting number). If neither
the B<-to=NUMBER> nor the B<-count=NUMBER> option is used then there should
only be one instance of '%d' or the script will abort.
If no value is provided to B<-script> then a suitable template will be
generated. It will be 'script_%04d.sh' if one episode is being processed, and
'script_%04d-%04d.sh' if a range has been specified.
Example:
./make_metadata -from=1430 -script=script_%04d.sh
the output file name will be B<script_1430.sh>. The same effect can be
achieved with:
./make_metadata -from=1430 -script=
or
./make_metadata -from=1430 -script
=item B<-a_count=FILE>
Defines a file into which the script writes details of assets downloaded
during the analysis of notes (and other HTML files associated with a show).
The listing consists of the show identifier (e.g. 'hpr3901') followed by the
number of asset links that were followed to collect files for that show.
This feature was added to allow other scripts to perform tasks with these
assets, but is now deprecated.
The feature will probably be removed in a later release of this script.
=item B<-[no]fetch>
This option controls whether the script attempts to fetch the MP3 audio file
from the HPR website should there be no WAV file in the upload area. The
default setting is B<-fetch>.
Normally the script is run as part of the workflow to upload the metadata and
audio to archive.org. The audio is expected to be a WAV file and to be in the
location referenced in the configuration file under the 'uploads' label.
However, not all of the WAV files exist for older shows.
When the WAV file is missing and B<-fetch> is selected or defaulted, the
script will attempt to download the MP3 version of the audio and will store it
in the 'uploads' area for the upload script (B<ias3upload.pl> or B<ia>) to
send to archive.org. If the MP3 file is not found then the script will abort.
If B<-fetch> is specified (or defaulted) as well as B<-meta_only> (see
below) then the audio file fetching process will not be carried out. This is
because it makes no sense to fetch this file if it's not going to be
referenced in the metadata.
=item B<-[no]assets>
This option controls the downloading of any assets that may be associated with
a show. Assets are the files held on the HPR server which are referenced by
the show. Examples might be photographs, scripts, and supplementary notes.
Normally all such assets are collected and stored in the upload area and are
then sent to the archive with the show. The notes sent to the archive are
adjusted to refer to these assets on archive.org, making the HPR episode
completely self-contained.
=item B<-[no]meta_only> (alias B<-[no]noaudio>)
This option controls whether the output file will contain a reference to the
audio file(s) or only the metadata. The default is B<-nometa_only> meaning that
the file reference(s) and the metadata are present.
Omitting the file(s) allows the metadata to be regenerated, perhaps due to
edits and corrections in the database, and the changes to be propagated to
archive.org. If the file reference(s) exist(s) in the metadata file then the
file(s) must be available at the time the uploader is run.
Note that making changes this way is highly preferable to editing the entry on
archive.org using the web-based editor. This is because there is a problem
with the way HTML entities are treated and this can cause the HTML to be
corrupted.
=item B<-[no]silent>
The option enables (B<-silent>) and disables (B<-nosilent>) I<silent mode>.
When enabled the script reports nothing on STDOUT. If the script cannot find
the audio files and downloads the MP3 version from the HPR site for upload to
archive.org then the downloads are reported on STDERR. This cannot be
disabled, though the STDERR output could be redirected to a file or to
/dev/null.
If B<-silent> is specified with B<-verbose> then the latter "wins".
The script runs with silent mode disabled by default. When B<-nosilent> is
used with B<-noverbose> the script reports the output file name and nothing
else.
=item B<-[no]verbose>
This option enables (B<-verbose>) and disables (B<-noverbose>)
I<verbose mode>. When enabled the script reports the metadata it has collected
from the database before writing it to the output file. The data is reported
in a more readable mode than examining the CSV file, although another script
B<show_metadata> is also available to help with this.
If B<-verbose> is specified with B<-silent> then the former "wins".
The script runs with verbose mode disabled by default.
=item B<-[no]ignore_missing>
The script checks each episode to ensure it has a summary and tags. If either
of these fields is missing then a warning message is printed for that episode
(unless B<-silent> has been chosen), and if any episodes are lacking this
information the script aborts without producing metadata. If the option
B<-ignore_missing> is selected then the warnings are produced (dependent on
B<-silent>) but the script runs to completion.
The default setting is B<-noignore_missing>; the script checks and aborts if
any summaries or tags are missing.
=item B<-[no]test>
DO NOT USE!
This option enables (B<-test>) and disables (B<-notest>)
I<test mode>. When enabled the script generates metadata containing various
test values.
In test mode the following changes are made:
=over 4
=item .
The item names, which normally contain 'hprnnnn' (built from the episode
number), have 'test_' prepended to them.
=item .
The collection, which is normally a list containing 'hackerpublicradio' and
'podcasts', is changed to 'test_collection'. Items in this collection are
normally deleted by Archive.org after 30 days.
=item .
The contributor, which is normally 'HackerPublicRadio' is changed to
'perlist'.
=back
B<NOTE> The test mode only works for the author!
=item B<-config=FILE>
This option allows an alternative script configuration file to be used. This
file defines various settings relating to the running of the script - things
like the place to look for the files to be uploaded. It is rare to need to use
any other file than the default since these are specific to the environment
in which the script runs. However, this has been added at the same time as an
alternative database configuration option was added.
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
If the option is omitted the default file is used: B<.make_metadata.cfg>
=item B<-dbconfig=FILE>
This option allows an alternative database configuration file to be used. This
file defines the location of the database, its port, its name and the username
and password to be used to access it. This feature was added to allow the
script to access alternative databases or the live database over an SSH
tunnel.
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
If the option is omitted the default file is used: B<.hpr_db.cfg>
=back
=head1 DESCRIPTION
This script generates metadata suitable for uploading Hacker Public Radio
episodes to the Internet Archive (archive.org).
The metadata is in comma-separated values (CSV) format suitable for
processing with an upload script. The original upload script was called
B<ias3upload.pl>, and could be obtained from
I<https://github.com/kngenie/ias3upload>. This script is no longer supported
and B<make_metadata> no longer generates output suitable for it (though it is
simple to make it compatible if necessary). The replacement, called
B<internetarchive>, is a Python tool which can also be run from the
command line. It can be found at I<https://github.com/jjjake/internetarchive>.
The B<make_metadata> script generates CSV from the HPR database. It looks up
details for each episode selected by the options, and performs various
conversions and concatenations. The goal is to prepare items for the Internet
Archive with as much detail as the format can support.
The resulting CSV file contains a header line listing the field names required
by archive.org followed by as many CSV lines of episode data as requested (up
to a limit of 20).
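For example, once the CSV file has been generated, an upload can be driven
from it using the B<ia> tool's spreadsheet mode (check I<ia help upload> for
the exact options supported by the installed version):
    ia upload --spreadsheet=metadata_1500-1510.csv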
Since the upload method uses the HTTP protocol with fields stored in headers,
there are restrictions on the way HTML can be formatted in the B<Details>
field. The script converts newlines, which are not allowed, into I<< <br/> >>
tags where necessary.
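A minimal sketch of the conversion (the code in this script is more careful
about where the substitution is applied):
    ( my $details = $notes ) =~ s/\n/<br\/>/g;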
HPR shows often have associated files, such as pictures, examples, long-form
notes and so forth. The script finds these and downloads them to the cache
area where the audio is kept and writes the necessary lines to the CSV file to
ensure they are uploaded with the show. It modifies any HTML which links to
these files to link to the archive.org copies in order to make the complete
show self-contained.
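For example, an asset referenced in the notes as
    http://hackerpublicradio.org/eps/hpr2153.png
is rewritten, using the I<IAURLtemplate> setting described below, to
    http://archive.org/download/hpr2153/hpr2153.png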
=head1 DIAGNOSTICS
=over 8
=item B<Configuration file ... not found>
One or more of the configuration files has not been found.
=item B<Path ... not found>
The path specified in the B<uploads> definition in the configuration file
B<.make_metadata.cfg> does not exist. Check the configuration file.
=item B<Configuration data missing>
While checking the configuration file(s) the script has detected that settings
are missing. Check the details specified below and provide the missing
elements.
=item B<Mis-match between @fields and %dispatch!>
An internal error in the script has been detected where the elements of the
@fields array do not match the keys of the %dispatch hash. This is probably the
result of a failed attempt to edit either of these components.
Correct the error and run the script again.
=item B<Invalid list; no elements>
There are no list elements in the B<-list=LIST> option.
=item B<Invalid list; too many elements>
There are more than the allowed 20 elements in the list specified by the
B<-list=LIST> option.
=item B<Failed to parse -list=...>
A list was specified that did not contain a CSV list of numbers.
=item B<Invalid starting episode number (...)>
The value used in the B<-from> option must be greater than 0.
=item B<Do not combine -to and -count>
Using both the B<-to> and B<-count> is not permitted (and makes no sense).
=item B<Invalid range; ... is greater than ...>
The B<-from> episode number must be less than or equal to the B<-to> number.
=item B<Invalid range; range is too big (E<gt>20)>
The difference between the starting and ending episode number is greater than
20.
=item B<Invalid - too many '%d' sequences in '...'>
There were more than two '%d' sequences in the name of the output file if
a range of episodes is being processed, or more than one if a single episode
has been specified.
=item B<Invalid - too few '%d' sequences in '...'>
There were fewer than two '%d' sequences in the name of the output file
when a range of episodes was being processed.
=item B<Unable to open ... for output: ...>
The script was unable to open the requested output file.
=item B<Unable to find or download ...>
The script has not found a I<.WAV> file in the cache area so has attempted to
download the I<MP3> copy of the audio from the HPR website. This process has
failed.
=item B<Failed to find requested episode>
An episode number could not be found in the database. This error is not fatal.
=item B<Nothing to do>
After processing the range of episodes specified the script could not find
anything to do. This is most often caused by all of the episodes in the range
being invalid.
=item B<Aborted due to missing summaries and/or tags>
One or more of the shows being processed does not have a summary or tags. The
script has been told not to ignore this so has aborted before generating
metadata.
=item B<HTML::TreeBuilder failed to parse notes: ...>
The script failed to parse the HTML in the notes of one of the episodes. This
indicates a serious problem with these notes and is fatal since these notes
need to be corrected before the episode is uploaded to the Internet Archive.
=item B<HTML::TreeBuilder failed to process ...: ...>
While parsing the HTML in a related file the parse has failed. The file being
parsed is reported as well as the error that was encountered. This is likely
due to bad HTML.
=item B<Unable to open ... for writing: ...>
The script is attempting to open an HTML file which it has downloaded to
write back edited HTML, yet the open has failed. The filename is in the error
message as is the cause of the error.
=back
=head1 CONFIGURATION AND ENVIRONMENT
This script reads two configuration files in B<Config::General> format
(similar to Apache configuration files) for the path to the files to be
uploaded and for credentials to access the HPR database. Two files are used
because the database configuration file is used by several other scripts.
=head2 SCRIPT CONFIGURATION
The general configuration file is B<.make_metadata.cfg> (although this can be
overridden through the B<-config=FILE> option) and contains the following
lines:
uploads = "<path to files>"
filetemplate = "hpr%04d.%s"
baseURL = "http://hackerpublicradio.org"
sourceURLtemplate = "https://hackerpublicradio.org/eps/%s/index.html"
IAURLtemplate = "http://archive.org/download/%s/%s"
The I<uploads> line defines where the WAV files are to be found (currently
I</var/IA/uploads> on the VPS). The same area is used to store downloaded MP3
files and any supplementary files associated with the episode.
The I<filetemplate> line defines the format of an audio file such as
I<hpr1234.wav>. This should not be changed.
The I<baseURL> line defines the common base for download URLs. It is used when
parsing and standardising URLs relating to files on the HPR server.
The I<sourceURLtemplate> line defines the format of the URL required to
access the show on the HPR site. This should not be changed except in the
unlikely event that these URLs change.
The I<IAURLtemplate> line defines the format of URLs on archive.org which is
used when generating new links in HTML notes or supplementary files.
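For illustration, these templates are expanded with I<sprintf>, so with the
settings above:
    sprintf( "hpr%04d.%s", 1234, 'wav' )
gives I<hpr1234.wav>, and
    sprintf( "http://archive.org/download/%s/%s", 'hpr1234', 'hpr1234.png' )
gives I<http://archive.org/download/hpr1234/hpr1234.png>.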
=head2 DATABASE CONFIGURATION
The database configuration file is B<.hpr_db.cfg> (although this can be
overridden through the B<-dbconfig=FILE> option).
The layout of the file should be as follows:
<database>
host = 127.0.0.1
port = PORT
name = DATABASE
user = USERNAME
password = PASSWORD
</database>
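For reference, this is a minimal sketch of how such a file can be read and
used. The MySQL/MariaDB DSN is an assumption made for the example; the
driver is not defined by the configuration file itself:
    use Config::General;
    use DBI;

    my %cfg = Config::General->new("$ENV{HOME}/HPR/IA/.hpr_db.cfg")->getall;
    my $db  = $cfg{database};
    my $dbh = DBI->connect(
        "dbi:mysql:database=$db->{name};host=$db->{host};port=$db->{port}",
        $db->{user}, $db->{password}, { RaiseError => 1 }
    );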
=head1 DEPENDENCIES
Carp
Config::General
DBI
Data::Dumper
File::Find::Rule
File::Path
Getopt::Long
HTML::Entities
HTML::TreeBuilder
HTTP::Status
IO::HTML
LWP::UserAgent
List::MoreUtils
List::Util
Pod::Usage
Text::CSV_XS
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com)
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2014-2019 Dave Morriss (Dave.Morriss@gmail.com).
All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker