1
0
forked from HPR/hpr-tools
hpr-tools/InternetArchive/upload_manager

2804 lines
92 KiB
Perl
Executable File

#!/usr/bin/env perl
#===============================================================================
#
# FILE: upload_manager
#
# USAGE: ./upload_manager [-help] [-[no]dry-run] [-[no]verbose]
# [-debug=N] -mode=MODE [-episode=N1 [-episode=N2 ...]]
# [-range=START [-range=END]] [-json=FILE] [-scandir=FILE]
# [-dbconfig=FILE]
#
# DESCRIPTION: Collect details about HPR shows and the copies on Archive.org
# and keep them in a separate SQLite database (for
# the moment). This database is used to populate the
# developing PostgreSQL Database (which is not currently
# progressing very much).
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: 2022-02-15 Had to revert to DBD::MySQL temporarily.
# 2023-06-12 This code is no longer valid. It expects the main
# HTML to point to files on the server (if there are "assets")
# and recurses through extra HTML it finds there. It currently
# looks for file details in any sub-directory "/eps/hpr1234/".
# Now there are no files, so the details of all assets
# (including the audio) needs to be in the 'assets' table. Not
# sure we are there yet.
# 2024-03-08 Smartmatch is deprecated; moved to 'match::smart'.
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.2.19
# CREATED: 2017-06-14 10:50:28
# REVISION: 2024-03-08 20:49:51
#
#===============================================================================
use 5.010;
use strict;
use warnings;
use utf8;
#use experimental 'smartmatch';
use match::smart;
use Carp;
use Getopt::Long;
use Pod::Usage;
use Config::General;
use File::Slurper qw{ read_text read_lines };
use JSON;
use Net::OpenSSH;
#use TryCatch; # Broke in late May 2020 due to a problem with Devel::Declare
use Try::Tiny;
use Log::Handler;
use HTML::TreeBuilder 5 -weak;
use HTML::Entities;
use List::Util qw{ min max };
use List::MoreUtils qw{ any };
use LWP::Simple;
use DBI;
use SQL::Abstract;
use SQL::Abstract::Plugin::InsertMulti;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.2.19';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
#( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
#$DIR = '.' unless $DIR;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/IA";
my $logdir = "$basedir/logs";
my $logfile = "$logdir/${PROG}.log";
my $configfile = "$basedir/.$PROG.cfg";
my $db1configfile = "$basedir/.hpr_db.cfg";
my $database2 = "$basedir/ia.db";
my ( $dbh1, $dbh2, $sth1, $h1 );
#
# NOTE: Program config not currently used
#
die "Configuration file $configfile not found\n" unless ( -e $configfile );
my $conf = Config::General->new(
-ConfigFile => $configfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
);
my %config = $conf->getall();
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#-------------------------------------------------------------------------------
# Options
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;
#
# Options and arguments
#
my %options;
Options( \%options );
#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
if ( $options{'help'} );
#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
if ( $options{'documentation'} );
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
my $db1cfgfile
= ( defined( $options{dbconfig} ) ? $options{dbconfig} : $db1configfile );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
my $mode = $options{mode};
die "Mode is required\n" unless defined($mode);
$mode = lc($mode);
my @episodes;
@episodes = @{ $options{episode} } if ( defined( $options{episode} ) );
if ($mode ne 'update' and @episodes) {
die "The -episode=N options are only for use in update mode\n";
}
my @range;
@range = @{ $options{range} } if ( defined( $options{range} ) );
if ($mode ne 'update' and @range) {
die "The -range=N options are only for use in update mode\n";
}
_debug( $DEBUG >= 3, '@episodes: ' . Dumper( \@episodes ) );
_debug( $DEBUG >= 3, '@range: ' . Dumper( \@range ) );
#
# Check -episode=N versus -range=N. We can't have both
#
die "Choose only -episode=N or -range=N, not both\n"
if ( defined( $options{episode} ) && defined( $options{range} ) );
#
# Only two range values
#
die "-range=N should not be used more than twice\n" if ($#range gt 2);
#
# Turn the range into a list of episodes
#
if (defined($options{range})) {
@episodes = ( min(@range) .. max(@range) );
}
@episodes = map { sprintf('%04d',$_)} @episodes;
_debug( $DEBUG >= 3, '@episodes: ' . Dumper(\@episodes) );
die "Too many episodes requested\n" if (scalar(@episodes) > 20);
my $json = $options{json};
if ( $mode eq 'json' ) {
die "JSON file is required\n" unless defined($json);
die "Unable to find file '$json'\n" unless -e $json;
}
my $dirfile = $options{scandir};
if ( $mode eq 'scandir' ) {
die "Directory listing file is required\n" unless defined($dirfile);
die "Unable to find file '$dirfile'\n" unless -e $dirfile;
}
die "Configuration file $db1cfgfile not found\n" unless ( -e $db1cfgfile );
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date. The format
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is
# the message.
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();
$log->add(
file => {
timeformat => "%Y-%m-%d %H:%M:%S",
filename => $logfile,
minlevel => 0,
maxlevel => 7,
}
);
#-------------------------------------------------------------------------------
# Load MySQL configuration data
#-------------------------------------------------------------------------------
my $db1conf = Config::General->new(
-ConfigFile => $db1cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
);
my %db1cfg = $db1conf->getall();
if ($verbose) {
print "MySQL database configuration chosen: $db1cfgfile\n";
print "Dry run\n" if $dry_run;
}
#-------------------------------------------------------------------------------
# Connect to the MariaDB database [reverted to mysql temporarily]
#-------------------------------------------------------------------------------
my $dbhost = $db1cfg{database}->{host} // '127.0.0.1';
my $dbport = $db1cfg{database}->{port} // 3306;
my $dbname = $db1cfg{database}->{name};
my $dbuser = $db1cfg{database}->{user};
my $dbpwd = $db1cfg{database}->{password};
#$dbh1 = DBI->connect( "DBI:MariaDB:database=$dbname;host=$dbhost;port=$dbport",
# $dbuser, $dbpwd, { AutoCommit => 1 } )
# or croak $DBI::errstr;
$dbh1 = DBI->connect( "DBI:mysql:database=$dbname;host=$dbhost;port=$dbport",
$dbuser, $dbpwd, { AutoCommit => 1 } )
or croak $DBI::errstr;
#
# Enable client-side UTF8 [temporary]
#
$dbh1->{mysql_enable_utf8} = 1;
#-------------------------------------------------------------------------------
# Connect to the SQLite database
#-------------------------------------------------------------------------------
$dbh2 = DBI->connect( "dbi:SQLite:dbname=$database2", "", "" );
#
# Log the starting conditions
#
$log->info("Starting $PROG version $VERSION");
if (@episodes) {
$log->info('Episodes:', join( ", ", @episodes ) );
}
else {
$log->info('Looking for new episodes');
}
$log->info("Mode: $mode");
if ($dry_run) {
$log->info('Dry run: Yes');
}
#-------------------------------------------------------------------------------
# Set up for connecting to the server with SSH
#-------------------------------------------------------------------------------
my $remhost = 'hackerpublicradio.org';
#my $remport = 22074;
my $remport = 22;
my $remuser = 'hpr';
my $rempath = '/home/hpr/www/eps/';
my $ssh = Net::OpenSSH->new( $remhost, user => $remuser, port => $remport );
$ssh->error and die "Can't ssh to $remhost: " . $ssh->error;
$log->info("Connected to ${remuser}\@${remhost}, port $remport");
#-------------------------------------------------------------------------------
# Perform the action requested by -mode=X
#-------------------------------------------------------------------------------
if ( $mode eq 'initialise' ) {
print "Initialise mode\n";
initialise( $dbh1, $dbh2 );
}
elsif ( $mode eq 'update' ) {
print "Update mode\n";
#
# Range updating is achieved by running the single-shot mode repeatedly
#
if ( @episodes ) {
foreach my $episode (@episodes) {
update_episode( $dbh1, $dbh2, $episode, $dry_run, $verbose );
}
}
else {
update_db( $dbh1, $dbh2, $dry_run, $verbose );
}
}
elsif ( $mode eq 'json' ) {
print "JSON mode\n";
load_metadata( $dbh2, $json, $dry_run, $verbose );
#load_upload_dates( $dbh2, $json );
}
elsif ( $mode eq 'scandir' ) {
print "Scan directory mode\n";
scan_dir( $dbh2, $dirfile );
}
else {
die "Invalid mode: $mode\n";
}
exit;
#=== FUNCTION ================================================================
# NAME: initialise
# PURPOSE: Initialise an empty database
# PARAMETERS: $dbh1 Handle for the MySQL database
# $dbh2 Handle for the SQLite database
# RETURNS: Nothing
# DESCRIPTION: Runs the various initialisation functions, but only if the
# database tables 'episodes' and/or 'assets' are empty
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub initialise {
my ( $dbh1, $dbh2 ) = @_;
my ($count);
#
# Initialise the 'episodes' table if it's empty
#
$count = count_rows( $dbh2, 'episodes' );
if ( $count == 0 ) {
print "Initialising the 'episodes' table\n";
initialise_episodes( $dbh1, $dbh2 );
}
else {
print "Did not initialise the 'episodes' table ($count rows)\n";
$log->info("Did not initialise the 'episodes' table ($count rows)");
}
#
# Initialise the 'assets' table if it's empty
#
$count = count_rows( $dbh2, 'assets' );
if ( $count == 0 ) {
print "Initialising the 'assets' table\n";
initialise_assets( $dbh1, $dbh2 );
}
else {
print "Did not initialise the 'assets' table ($count rows)\n";
$log->info("Did not initialise the 'assets' table ($count rows)");
}
}
#=== FUNCTION ================================================================
# NAME: initialise_episodes
# PURPOSE: Initialise an empty 'episodes' table
# PARAMETERS: $dbh1 Handle for the MySQL database
# $dbh2 Handle for the SQLite database
# RETURNS: Nothing
# DESCRIPTION: Assuming the episodes table is empty loads all episodes from
# the MariaDB database
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub initialise_episodes {
my ( $dbh1, $dbh2 ) = @_;
my ( $sql1, $sth1, $h1, $count );
$sql1 = q{
SELECT
id,
date,
title,
summary
FROM eps
};
$sth1 = $dbh1->prepare($sql1);
$sth1->execute();
if ( $dbh1->err ) {
croak $dbh1->errstr;
}
$count = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
#
# Fields are:
# id integer,
# rdate date NOT NULL,
# title varchar ( 100 ) NOT NULL,
# summary varchar ( 100 ) NOT NULL,
# uploaded integer DEFAULT 0,
# has_files INTEGER DEFAULT 0,
# with_files INTEGER DEFAULT 0,
# with_derived INTEGER DEFAULT 0,
# with_source INTEGER DEFAULT 0,
# archive_date date,
# item_last_updated integer
# IA_URL text,
# notes text,
#
#>>>
$dbh2->do(
q{INSERT INTO episodes (id, rdate, title, summary) VALUES(?,?,?,?)},
undef,
$h1->{id},
$h1->{date},
$h1->{title},
$h1->{summary},
) or die $dbh2->errstr;
#<<<
$count++;
}
print "Records added: $count\n";
$log->info("Records added to 'episodes' table: $count");
}
#=== FUNCTION ================================================================
# NAME: initialise_assets
# PURPOSE: Initialise an empty 'assets' table (during database
# initialisation)
# PARAMETERS: $dbh1 Handle for the MariaDB database
# $dbh2 Handle for the SQLite database
# RETURNS: Nothing
# DESCRIPTION: Assuming the assets table is empty loads all assets (links to
# files on the HPR server) into the table
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub initialise_assets {
my ( $dbh1, $dbh2 ) = @_;
my ( $sql1, $sth1, $h1, $count, $links, $id, $episode, $notes );
my %assets;
#
# Collect the notes for shows that have links to other HPR files
#
$sql1 = q{
SELECT
id,
notes
FROM eps
WHERE notes REGEXP 'https?://(www.)?(hacker|hobby)publicradio.org/eps/'
};
# LIMIT 20 OFFSET 30
$sth1 = $dbh1->prepare($sql1);
$sth1->execute();
if ( $dbh1->err ) {
croak $dbh1->errstr;
}
#
# Process notes one show at a time, recursing into HTML where necessary
# and building the %assets hash
#
$count = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
$links = 0;
$id = $h1->{id};
$episode = sprintf("%04d", $id);
$notes = $h1->{notes};
print "Episode: $id\n";
#$links = find_links($id,$notes,\%assets);
$links = find_assets($ssh,$episode,$rempath,\%assets);
print "Links found: $links\n";
}
#print Dumper(\%assets),"\n";
#
# Use the %assets hash to populate the 'assets' table
#
foreach my $key (sort(keys(%assets))) {
foreach my $obj (@{$assets{$key}}) {
#>>>
$dbh2->do(
q{INSERT INTO assets (episode_id, URL, filename) VALUES(?,?,?)},
undef,
$key,
$obj->{URL},
$obj->{filename},
) or die $dbh2->errstr;
#<<<
$count++;
}
}
print "Records added: $count\n";
$log->info("Records added to 'assets' table: $count");
}
#=== FUNCTION ================================================================
# NAME: update_db
# PURPOSE: Loads any additional rows from the MySQL database to the
# SQLite one.
# PARAMETERS: $dbh1 Handle for the MySQL database
# $dbh2 Handle for the SQLite database
# $dry_run Boolean dry run setting
# $verbose Boolean verbose setting
# RETURNS: Nothing
# DESCRIPTION: Finds the first gap in the 'id' field of the 'episodes' table
# in the SQLite database. This is the starting show number it
# uses when querying the MySQL database. The function loops over
# the 'eps' table in the MySQL database returning details for
# storage in the SQLite database. Before storing it double
# checks that the row in question does not exist.
# TODO
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub update_db {
my ( $dbh1, $dbh2, $dry_run, $verbose ) = @_;
my ( $sql1, $sql2, $sth1, $h1, $sth2, $h2 );
my ( $ep_count, $ffree);
my ( %assets, $asset_count, $assets_added, $assets_deleted, $assets_updated );
my ( $id, $episode, $notes, $links );
print "Updating SQLite database from MySQL\n" if $verbose;
#
# Find the first episode id gap in the SQLite database
#
$sql1 = q{
SELECT MIN (id) + 1 AS first_free
FROM episodes t1
WHERE NOT EXISTS (
SELECT *
FROM episodes t2
WHERE t1.id + 1 = t2.id)
};
$sth1 = $dbh2->prepare($sql1);
$sth1->execute();
if ( $dbh1->err ) {
croak $dbh1->errstr;
}
$h1 = $sth1->fetchrow_hashref;
$ffree = $h1->{first_free};
print "First free id: $ffree\n" if $verbose;
#
# SQL to find new rows in the MySQL database
#
$sql1 = q{
SELECT
id,
date,
title,
summary,
notes
FROM eps
WHERE id >= ?
};
#
# SQL to look for pre-existing episodes in the SQLite database
#
$sql2 = q{
SELECT * FROM episodes WHERE id = ?
};
#
# Prepare the SQL statements
#
$sth1 = $dbh1->prepare($sql1);
$sth2 = $dbh2->prepare($sql2);
#
# Query MySQL, from $ffree onwards
#
$sth1->execute($ffree);
if ( $dbh1->err ) {
croak $dbh1->errstr;
}
#
# Loop through the MySQL rows
#
$ep_count = $asset_count = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
#
# Query the SQLite database for the episode number we just got
#
$sth2->execute( $h1->{id} );
if ( $dbh2->err ) {
croak $dbh2->errstr;
}
$h2 = $sth2->fetchrow_hashref;
#
# If the row isn't already there create it
#
unless ($h2) {
unless ($dry_run) {
#>>>
$dbh2->do(
q{INSERT INTO episodes (id, rdate, title, summary) VALUES(?,?,?,?)},
undef,
$h1->{id},
$h1->{date},
$h1->{title},
$h1->{summary},
) or die $dbh2->errstr;
#<<<
printf "Added episode: %d\n", $h1->{id} if $verbose;
$ep_count++;
}
else {
printf
"Would have inserted a row into the 'episodes' table (%s)\n",
$h1->{id};
}
}
#
# Check this show for "assets" (whether we have the episode already or
# not). Even in dry-run mode this gets executed
#
$links = 0;
$id = $h1->{id};
$episode = sprintf("%04d", $id);
# $notes = $h1->{notes};
# $links = find_links($id,$notes,\%assets);
$links = find_assets($ssh,$episode,$rempath,\%assets);
$asset_count += $links;
}
#
# In debug mode show what assets were collected as we iterated through the
# new shows.
#
_debug( $DEBUG >= 2, '%assets: ' . Dumper(\%assets) );
#
# Add any assets collected from the episodes just processed
#
$assets_added = 0;
if ($asset_count > 0) {
( $assets_added, $assets_deleted, $assets_updated ) =
process_assets( $dbh2, \%assets, 1, $dry_run, $verbose );
}
printf "%-16s %d\n", "Episodes added:", $ep_count;
printf "%-16s %d\n", "Assets found:", $asset_count;
printf "%-16s %d\n", "Assets added:", coalesce( $assets_added, 0 );
printf "%-16s %d\n", "Assets deleted:", coalesce( $assets_deleted, 0 );
printf "%-16s %d\n", "Assets updated:", coalesce( $assets_updated, 0 );
$log->info("Episodes added: $ep_count");
}
#=== FUNCTION ================================================================
# NAME: process_assets
# PURPOSE: Given a collection of asset details obtained by searching
# notes for a given range of episodes determine whether they
# need to be added to the SQLite database.
# PARAMETERS: $dbh SQLite database handle
# $assets hashref pointing to the collected assets which
# have been found by parsing the notes of all
# relevant shows prior to calling this routine.
# $new Boolean flag to indicate that we're dealing with a new
# show or updating an old one
# $dry_run Boolean dry run setting
# $verbose Boolean verbose setting
# RETURNS: Number of assets added
# DESCRIPTION: The %assets hash has been accumulated by scanning all the
# notes belonging to new episodes found in the MySQL database.
# The hash is keyed by the episode number and it is scanned in
# sorted order of the key. The SQLite database is queried for
# assets with this key and the result held in a hash keyed by
# the filename. The value of each key in the %assets hash is an
# array of hashes holding the asset details, so we loop through
# these. If the filename of a particular asset hash is not found
# in the hash from the database then we create a new row in the
# 'assets' table (assuming that we're not in dry run mode where
# we simply report what we would have done). If any assets were
# added for an episode then we set the flag 'has_files' (unless
# in dry run mode). The total number of assets added to the
# database is returned by the function.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub process_assets {
my ( $dbh, $assets, $new, $dry_run, $verbose ) = @_;
my ( $sql, $sth, $db_assets, $key_field, $db_count );
my ( $asset_count, $assets_added, $assets_deleted, $assets_updated );
$key_field = 'id';
#
# We should only be called if there are assets, but just in case...
#
return ( 0, 0, 0 ) unless ($assets);
#
# SQL to look for pre-existing assets in the SQLite database
#
$sql = q{
SELECT * FROM assets WHERE episode_id = ?
};
$sth = $dbh->prepare($sql);
say 'Processing assets for ', ( $new ? 'new' : 'existing' ), ' show'
if $verbose;
#
# Use the %assets hash to populate the 'assets' table. The hash is keyed
# by episode numbers, and each entry consists of an array of hashes each
# containing the filename and the URL of the asset.
#
$assets_added = $assets_deleted = $assets_updated = 0;
foreach my $key ( sort( keys(%$assets) ) ) {
print "Checking episode $key\n" if $verbose;
#
# Count the assets for the current show
#
$asset_count = scalar( @{ $assets->{$key} } );
_debug( $DEBUG >= 2, "Episode: $key, Asset_count: $asset_count" );
#
# Query the SQLite database for the episode number we just got, looking
# for entries in the 'assets' table
#
$sth->execute($key);
if ( $dbh->err ) {
croak $dbh->errstr;
}
#
# Grab everything as a hash keyed on the 'filename' field so we can
# check it easily. Count what we got (i.e. count the number of assets
# in SQLite for this episode).
# NOTE: trying the idea of indexing $db_assets by the 'id' field (in
# $key_field)
#
# $db_assets = $sth->fetchall_hashref('filename');
$db_assets = $sth->fetchall_hashref($key_field);
$db_count = scalar( keys(%$db_assets) );
_debug( $DEBUG >= 2, "Assets in ia.db:\n" . Dumper($db_assets) );
_debug( $DEBUG >= 2, "DB count: $db_count" );
#
# Compare the number of "real" assets with what SQLite holds.
#
if ( $asset_count > $db_count ) {
#
# We need to add assets. If it's a newly added show which has
# assets this will always be necessary. If it's an existing show
# then we're repairing an anomaly and want to say so.
#
if ($new) {
say "Adding assets" if $verbose;
}
else {
say "** Found more real assets than in DB -> adding";
}
$assets_added += insert_assets( $dbh, $key, $assets, $db_assets,
$dry_run, $verbose );
}
elsif ( $asset_count < $db_count ) {
#
# We need to delete assets. This can't happen for an existing
# show, so it's always a noteworthy event.
#
say "** Found fewer real assets than in DB -> deleting";
$assets_deleted += delete_assets( $dbh, $key, $assets, $db_assets,
$dry_run, $verbose );
}
#
# Look for field differences in what we have after the inserts and
# deletes
#
if ( $assets_added > 0 || $assets_deleted > 0 ) {
$sth->execute($key);
if ( $dbh->err ) {
croak $dbh->errstr;
}
$db_assets = $sth->fetchall_hashref($key_field);
}
my @results
= synchronise_found_assets( $dbh, $key, $assets, $db_assets,
$dry_run, $verbose );
$assets_added += $results[0];
$assets_deleted += $results[1];
$assets_updated += $results[2];
_debug( $DEBUG >= 4, '@results ' . Dumper( \@results ) );
#
# If there are assets then we need to ensure the 'has_files' setting
# reflects it (unless in dry run mode)
#
if ( $asset_count > 0 ) {
unless ($dry_run) {
$dbh->do(
q{
UPDATE episodes
SET has_files = 1
WHERE id = ?
AND has_files = 0
},
undef,
$key,
) or die $dbh->errstr;
}
else {
say "Would have updated the 'episodes.has_files' ",
"field for $key (if needed)";
}
}
# $assets_added += $asset_count;
}
return ( $assets_added, $assets_deleted, $assets_updated );
}
#=== FUNCTION ================================================================
# NAME: synchronise_found_assets
# PURPOSE: Compares the assets found by searching the shownotes (and
# other files) with those in the SQLite database and updates
# what differs.
# PARAMETERS: $dbh SQLite database handle
# $episode Numeric episode
# $assets Hashref pointing to the collected assets which
# have been found by parsing the show's notes;
# keyed by episode number. Each keyed object is
# an arrayref of hashes containing the URL and
# filename of the asset.
# $db_assets Hash of assets from SQLite, keyed by filename.
# Each keyed object is a hash of columns from
# the database: id, episode_id, URL, filename,
# uploaded.
# $dry_run Boolean dry run setting
# $verbose Boolean verbose setting
# RETURNS: Number of additions, deletions and updates
# DESCRIPTION: The assets for this episode have been found by searching the
# HTML of the show notes and any linked HTML files for files on
# the HPR server. The equivalent assets have also been
# collected from the SQLite database. A comparison is made
# between these items and if they differ the definitive details
# collected from the MySQL database are used. There are three
# actions possible: (1) if the URL or filename fields differ
# then we can work out the update and apply it, (2) if the
# incoming assets don't match any in those from the database
# then we can delete the database ones, and (3) we can add the
# incoming assets we also couldn't match because they'll be the
# replecements! For all actions changes are made to the
# versions in the SQLite database using SQL::Abstract to
# construct the queries to do it.
# THROWS: No exceptions
# COMMENTS: TODO: There may be multiple asset changes for the episode.
# Does the SQL::Abstract call perform the multiple changes?
# Needs testing.
# SEE ALSO: N/A
#===============================================================================
sub synchronise_found_assets {
my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;
my ( $sth, $additions, $deletions, $updates, $update_count, $index);
my ( @fieldvals, @where, @db_rows, @asset_rows, %matches, %changes );
$additions = $deletions = $updates = 0;
$update_count = 0;
#
# Temporary?
#
if ( $DEBUG >= 4 ) {
_debug( 1, 'In subroutine synchronise_found_assets' );
_debug( 1, "\$episode: $episode" );
_debug( 1, '%$assets: ' . Dumper($assets) );
_debug( 1, '%$db_assets: ' . Dumper($db_assets) );
}
_debug( $DEBUG >= 4, 'Looking for assets that match - to exclude them' );
@db_rows = ( keys(%$db_assets) );
@asset_rows = ( 0 .. scalar( @{ $assets->{$episode} } ) - 1 );
_debug( $DEBUG >= 4, 'Initial @db_rows ' . Dumper( \@db_rows ) );
_debug( $DEBUG >= 4, 'Initial @asset_rows ' . Dumper( \@asset_rows ) );
#
# Find any asset pair matches
#
$index = 0;
foreach my $db_key ( keys(%$db_assets) ) {
foreach my $a_obj ( @{ $assets->{$episode} } ) {
if ( assets_match( $a_obj, $db_assets->{$db_key} ) == 2 ) {
$matches{$db_key} = [ $a_obj, $db_assets->{$db_key} ];
# splice( @asset_rows, $index, 1 );
@asset_rows = grep { $_ != $index } @asset_rows;
$index++;
}
}
}
_debug( $DEBUG >= 4, 'Assets to ignore: %matches ' . Dumper( \%matches ) );
_debug( $DEBUG >= 4, 'Current @asset_rows ' . Dumper( \@asset_rows ) );
#
# If everything matched then there's nothing to do, so return zero and
# leave.
#
# if (scalar(@db_rows) == scalar( keys(%matches) )) {
if ( scalar(@asset_rows) == 0 ) {
_debug( $DEBUG >= 4, 'All rows match, so nothing to do' );
return ( 0, 0, 0 );
}
_debug( $DEBUG >= 4, 'Some rows do not match' );
#
# Remove the row ids for any matches
#
@db_rows = grep { !exists( $matches{$_} ) } @db_rows;
_debug( $DEBUG >= 4, '@db_rows after matches ' . Dumper( \@db_rows ) );
_debug( $DEBUG >= 4, '@asset_rows after matches ' . Dumper( \@asset_rows ) );
_debug( $DEBUG >= 4, 'Looking for assets that match in only one attribute' );
#
# Find any cases where only one element of the asset object matches. This
# means we have a simple change.
#
$index = 0;
foreach my $db_key (@db_rows) {
foreach my $a_obj ( @{ $assets->{$episode} } ) {
if ( assets_match( $a_obj, $db_assets->{$db_key} ) == 1 ) {
$changes{$db_key} = [ $a_obj, $db_assets->{$db_key} ];
@asset_rows = grep { $_ != $index } @asset_rows;
$index++;
}
}
}
_debug( $DEBUG >= 4, 'Assets to change: %changes ' . Dumper( \%changes ) );
_debug( $DEBUG >= 4, 'Current @asset_rows ' . Dumper( \@asset_rows ) );
#
# Remove the row ids for any changes
#
@db_rows = grep { !exists( $changes{$_} ) } @db_rows;
_debug( $DEBUG >= 4, '@db_rows after changes ' . Dumper( \@db_rows ) );
_debug( $DEBUG >= 4, '@asset_rows after changes ' . Dumper( \@asset_rows ) );
#
# If there are changes, do them now
#
if (%changes) {
_debug( $DEBUG >= 4, 'Collecting changes' );
#
# We may have multiple database rows to process
#
$index = 0;
foreach my $key ( keys(%changes) ) {
$fieldvals[$index] = {};
$where[$index] = { episode_id => $episode };
#
# Compare the fields we're interested in, accumulating differences
# (expecting only one per key).
#
foreach my $fld ( 'filename', 'URL' ) {
if (!equal(
$changes{$key}->[0]->{$fld},
$changes{$key}->[1]->{$fld}
)
)
{
$update_count++;
$fieldvals[$index]->{$fld} = $changes{$key}->[0]->{$fld};
}
}
$where[$index]->{id} = $key;
$index++;
}
}
_debug( $DEBUG >= 2, '@fieldvals: ' . Dumper( \@fieldvals ) );
_debug( $DEBUG >= 2, '@where ' . Dumper( \@where ) );
if ( $update_count > 0 ) {
_debug( $DEBUG >= 4, 'Performing changes' );
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
foreach my $fv (@fieldvals) {
my ( $stmt, @bindvals )
= $sql->update( 'assets', $fv, shift(@where) );
#
# Perform the updates (unless in dry-run mode)
#
unless ($dry_run) {
$sth = $dbh->prepare($stmt);
$sth->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $episode\n", $dbh->errstr;
}
else {
print "Updated ", join( ',', keys(%$fv) ), "\n"
if $verbose;
$updates++;
}
}
else {
print "Dry run mode:\n";
print "Statement: $stmt\n";
print "Bindvals: ", join( ", ", @bindvals ), "\n";
}
}
}
#
# If anything is left in @db_rows then they will be replacements
#
if ( scalar(@db_rows) > 0 ) {
say "Replacements found: ", scalar(@db_rows);
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
#
# Every row which came from the database is to be deleted
#
$index = 0;
foreach my $db_key (@db_rows) {
$where[$index] = { episode_id => $episode };
$where[$index]->{id} = $db_key;
$index++;
}
_debug( $DEBUG >= 2, '@where ' . Dumper( \@where ) );
#
# Do deletions
#
foreach my $db_key (@db_rows) {
my ( $stmt, @bindvals )
= $sql->delete( 'assets', shift(@where) );
#
# Perform the deletions (unless in dry-run mode)
#
unless ($dry_run) {
$sth = $dbh->prepare($stmt);
$sth->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $episode\n", $dbh->errstr;
}
else {
print "Deleted ", join( ',', @db_rows ), "\n"
if $verbose;
$deletions++;
}
}
else {
print "Dry run mode:\n";
print "Statement: $stmt\n";
print "Bindvals: ", join( ", ", @bindvals ), "\n";
}
}
#
# Every object in the found assets array for this episode is to be
# added.
#
foreach my $i (@asset_rows) {
my $obj = $assets->{$episode}->[$i];
$obj->{episode_id} = $episode;
my ( $stmt, @bindvals )
= $sql->insert( 'assets', $obj );
#
# Perform the additions (unless in dry-run mode)
#
unless ($dry_run) {
$sth = $dbh->prepare($stmt);
$sth->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $episode\n", $dbh->errstr;
}
else {
print "Inserted array elements ", join( ',', @asset_rows ), "\n"
if $verbose;
$additions++;
}
}
else {
print "Dry run mode:\n";
print "Statement: $stmt\n";
print "Bindvals: ", join( ", ", @bindvals ), "\n";
}
}
}
return ( $additions, $deletions, $updates );
}
################################################################################
#
# Look for database assets to be ignored. The outer loop iterates through
# the primary key values of $db_assets where the rows all match the
# $episode value. For each of these assets the next loop iterates through
# the array of assets in $assets->{$episode} comparing the fields we want
# to check in the next inner loop. If the checked fields are all equal
# then there will be nothing to do.
#
# foreach my $db_key ( keys(%$db_assets) ) {
# foreach my $a_obj ( @{ $assets->{$episode} } ) {
# $ignore = $delete = $update = 0;
# foreach my $fld ( 'filename', 'URL' ) {
# $ignore
# += equal( $db_assets->{$db_key}->{$fld}, $a_obj->{$fld} );
# $delete
# += !equal( $db_assets->{$db_key}->{$fld}, $a_obj->{$fld} );
# }
# push( @{ $row_actions{'ignore'} }, $db_assets->{$db_key}->{id} )
# if $ignore == 2;
# push( @{ $row_actions{'delete'} }, $db_assets->{$db_key}->{id} )
# if $delete == 2;
# }
# }
#
# _debug( $DEBUG >= 4, '%row_actions ' . Dumper( \%row_actions ) );
#
#
# Scan the array of hashes obtained from the shownotes for $episode and
# record things that need changing.
#
# $index = 0;
# $updates = 0;
# foreach my $obj ( @{ $assets->{$episode} } ) {
# $fieldvals[$index] = {};
#
# #
# # Compare the fields we're interested in
# #
# foreach my $fld ( 'filename', 'URL' ) {
# if ( !equal( $obj->{$fld}, $db_assets->{$obj->{filename}}->{$fld} ) ) {
# $updates++;
# $fieldvals[$index]->{$fld} = $obj->{$fld};
# }
# }
#
# # TODO
# $fieldvals[$index]->{id} = $db_assets->{$obj->{filename}}->{id};
#
# $index++;
# }
#
# _debug( $DEBUG >= 2, '@fieldvals: ' . Dumper( \@fieldvals ) );
#
# if ($updates > 0 ) {
# #
# # Prepare to use SQL::Abstract
# #
# my $sql = SQL::Abstract->new();
#
# $where{episode_id} = $episode;
#
# foreach my $fv (@fieldvals) {
# $where{id} = $fv->{id};
# delete($fv->{id});
#
# my ( $stmt, @bindvals )
# = $sql->update( 'assets', $fv, \%where );
#
# #
# # Perform the updates (unless in dry-run mode)
# #
# unless ($dry_run) {
# $sth = $dbh->prepare($stmt);
# $sth->execute(@bindvals);
# if ( $dbh->err ) {
# warn "Processing $episode\n", $dbh->errstr;
# }
# else {
# print "Updated ", join( ',', keys(%$fv) ), "\n"
# if $verbose;
# }
# }
# else {
# print "Dry run mode:\n";
# print "Statement: $stmt\n";
# print "Bindvals: ", join( ", ", @bindvals ), "\n";
# }
# }
# }
#
################################################################################
#=== FUNCTION ================================================================
# NAME: insert_assets
# PURPOSE: Find missing assets and add them to SQLite
# PARAMETERS: $dbh SQLite database handle
# $episode Numeric episode
# $assets Hashref pointing to the collected assets which
# have been found by parsing the show's notes;
# keyed by episode number.
# $db_assets Hash of assets from SQLite, keyed by filename
# $dry_run Boolean dry run setting
# $verbose Boolean verbose setting
# RETURNS: Number of insertions
# DESCRIPTION: The assets for this episode have been found by searching the
# HTML of the show notes and any linked HTML files for files on
# the HPR server. The equivalent assets have also been
# collected from the SQLite database. The MySQL assets are
# reorganised by filename. For each file which does not already
# exist in the SQLite list details are prepared and passed to
# SQL::Abstract using the multi insert feature. The missing
# details are inserted into the assets table unless dry-run mode
# is on.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub insert_assets {
my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;
my ($sth, %db_by_file, %assets_by_file, $insertions, @multi_insert, %where);
#
# Reorder the %$db_assets hash in filename order
#
foreach my $key ( keys( %{$db_assets} ) ) {
$db_by_file{$db_assets->{$key}->{filename}} = $db_assets->{$key};
}
#
# Reorder the %assets hash in filename order
#
foreach my $obj ( @{ $assets->{$episode} } ) {
$assets_by_file{$obj->{filename}} = $obj;
}
_debug( $DEBUG >= 2, "%db_by_file: " . Dumper( \%db_by_file ) );
_debug( $DEBUG >= 2, "%assets_by_file: " . Dumper( \%assets_by_file ) );
#
# Find stuff for insertion. We need to build @multi_insert such that it
# contains a list of hashrefs, each containing the fields we want to
# insert.
# Note: We make a new %fv each time, fill it as a slice, and push it into
# the array @multi_insert. This may be rather obscure and therefore
# error-prone.
#
$insertions = 0;
foreach my $key ( keys(%assets_by_file) ) {
# if ( !exists( $db_assets->{$key} ) ) {
if ( !exists( $db_by_file{$key} ) ) {
my %fv;
$insertions++;
@fv{ 'episode_id', 'URL', 'filename' } = (
$episode, $assets_by_file{$key}->{URL},
$assets_by_file{$key}->{filename}
);
push( @multi_insert, \%fv );
}
}
_debug( $DEBUG >= 2, '@multi_insert: ' . Dumper( \@multi_insert ) );
if ($insertions > 0) {
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
my ( $stmt, @bindvals )
= $sql->insert_multi( 'assets', \@multi_insert );
#
# Perform the insertions (unless in dry-run mode)
#
unless ($dry_run) {
$sth = $dbh->prepare($stmt);
$sth->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $episode\n", $dbh->errstr;
}
else {
# print "Inserted ",join(',',keys(%where)),"\n" if $verbose;
print "Inserted $insertions assets\n" if $verbose;
}
}
else {
print "Dry run mode:\n";
print "Statement: $stmt\n";
print "Bindvals: ", join( ", ", @bindvals ), "\n";
}
}
return $insertions;
}
#=== FUNCTION ================================================================
# NAME: delete_assets
# PURPOSE: Find superfluous assets for an episode in the 'assets' table
# and delete them.
# PARAMETERS: $dbh SQLite database handle
# $episode Numeric episode
# $assets Hashref pointing to the collected assets which
# have been found by parsing the show's notes;
# keyed by episode number.
# $db_assets Hash of assets from SQLite, keyed by filename
# $dry_run Boolean dry run setting
# $verbose Boolean verbose setting
# RETURNS: Number of deletions
# DESCRIPTION: The assets for this episode have been found by searching the
# HTML of the show notes and any linked HTML files for files on
# the HPR server. The equivalent assets have also been
# collected from the SQLite database. The MySQL assets are
# reorganised by filename. For each file exists in the SQLite
# list details are prepared and passed to SQL::Abstract. The
# unwanted details are deleted from the assets table unless
# dry-run mode is on.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub delete_assets {
my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;
my ($sth, %db_by_file, %assets_by_file, $deletions, %where);
#
# Reorder the %$db_assets hash in filename order
#
foreach my $key ( keys( %{$db_assets} ) ) {
$db_by_file{$db_assets->{$key}->{filename}} = $db_assets->{$key};
}
#
# Reorder the %assets hash in filename order
#
foreach my $obj ( @{ $assets->{$episode} } ) {
$assets_by_file{$obj->{filename}} = $obj;
}
_debug( $DEBUG >= 2, "%db_by_file: " . Dumper( \%db_by_file ) );
_debug( $DEBUG >= 2, "%assets_by_file: " . Dumper( \%assets_by_file ) );
#
# Only delete assets with this value in the 'episode_id' field
#
$where{episode_id} = $episode;
#
# Find stuff for deletion. We need to build %where such that it generates
# 'WHERE filename = <value1> OR filename = <value2> ...', so we add these
# values to an (anonymous) array.
#
$deletions = 0;
foreach my $key ( keys(%$db_assets) ) {
# if ( !exists( $assets_by_file{$key} ) ) {
if ( !exists( $db_by_file{$key} ) ) {
$deletions++;
if ( !exists( $where{filename} ) ) {
$where{filename} = [$key];
}
else {
push( @{ $where{filename} }, $key );
}
}
}
_debug( $DEBUG >= 2, "%where: " . Dumper( \%where ) );
if ($deletions > 0) {
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
my ( $stmt, @bindvals )
= $sql->delete( 'assets', \%where );
#
# Perform the deletions (unless in dry-run mode)
#
unless ($dry_run) {
$sth = $dbh->prepare($stmt);
$sth->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $episode\n", $dbh->errstr;
}
else {
print "Deleted ",join(',',keys(%where)),"\n" if $verbose;
}
}
else {
print "Dry run mode:\n";
print "Statement: $stmt\n";
print "Bindvals: ", join( ", ", @bindvals ), "\n";
}
}
return $deletions;
}
#=== FUNCTION ================================================================
# NAME: update_episode
# PURPOSE: Perform a database update for a specific episode
# PARAMETERS: $dbh1 Handle for the MariaDB database
# $dbh2 Handle for the SQLite database
# $episode Target episode to update
# $dry_run Boolean dry run setting
# $verbose Boolean verbose setting
# RETURNS: Nothing
# DESCRIPTION: Performs an update for a single episode in the SQLite
# database. The usual scenario is after the Community News show
# has been posted, or a spelling mistake has been corrected for
# an episode. The episode is selected from the MySQL database
# and the SQLite one. The 'title' and 'summary' fields are
# compared across the two copies, and if they differ the
# necessary changes are set up to be executed or reported in
# 'dry run' mode.
# NOTE: No longer done this way:
# |A search for assets is made by reading the show notes and any
# |notes that are linked from these, and if there are
# |supplementary files they are accumulated and the database
# |updated as necessary (under control of the 'dry run' option).
# |This is currently the only way that the row in the SQLite
# |database can be updated after it has been loaded from the
# |MySQL database, other than when the metadata is collected and
# |applied. The metadata update process does not fix errors in
# |the 'title' or 'summary' fields.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub update_episode {
my ( $dbh1, $dbh2, $episode, $dry_run, $verbose ) = @_;
my ( $sql1, $sql2, $sth1, $h1, $sth2, $h2, $sth3 );
my ( %assets, $asset_count, $assets_added, $assets_deleted, $assets_updated );
my ( $id, $notes, $links );
my ( $updates, %fieldvals, %where );
#
# SQL to find new rows in the MySQL database
#
$sql1 = q{
SELECT
id,
date,
title,
summary,
notes
FROM eps
WHERE id = ?
};
#
# SQL to look for pre-existing episodes in the SQLite database
#
$sql2 = q{
SELECT * FROM episodes WHERE id = ?
};
#
# Prepare the SQL statements
#
$sth1 = $dbh1->prepare($sql1);
$sth2 = $dbh2->prepare($sql2);
#
# Query MySQL, for the $episode
#
$sth1->execute($episode);
if ( $dbh1->err ) {
croak $dbh1->errstr;
}
#
# Loop through the MySQL row (should only be one)
#
$asset_count = $updates = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
#
# Query the SQLite database for the episode number we just got
#
$sth2->execute( $h1->{id} );
if ( $dbh2->err ) {
croak $dbh2->errstr;
}
$h2 = $sth2->fetchrow_hashref;
#
# If the row doesn't exist we have a problem.
# TODO: Could we just add the episode at this point?
#
unless ($h2) {
warn "Row $episode is not in the SQLite database\n";
return;
}
#
# Look for title and summary changes and build a hash holding the
# changes
#
foreach my $fld ( 'title', 'summary' ) {
if ( !equal( $h1->{$fld}, $h2->{$fld} ) ) {
$updates++;
$fieldvals{$fld} = $h1->{$fld};
}
}
if ($updates > 0 ) {
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
$where{id} = $episode;
my ( $stmt, @bindvals )
= $sql->update( 'episodes', \%fieldvals, \%where );
#
# Perform the updates (unless in dry-run mode)
#
unless ($dry_run) {
$sth3 = $dbh2->prepare($stmt);
$sth3->execute(@bindvals);
if ( $dbh2->err ) {
warn "Processing $episode\n", $dbh2->errstr;
}
else {
print "Updated ",join(',',keys(%fieldvals)),"\n" if $verbose;
}
}
else {
print "Dry run mode:\n";
print "Statement: $stmt\n";
print "Bindvals: ", join( ", ", @bindvals ), "\n";
}
}
#
# Check this show for "assets". Even in dry-run mode this gets
# executed
#
$links = 0;
$id = $h1->{id};
$notes = $h1->{notes};
# $links = find_links($id,$notes,\%assets);
$links = find_assets($ssh,$episode,$rempath,\%assets);
$asset_count += $links;
}
if ( $asset_count > 0 ) {
_debug( $DEBUG >= 2, "Scanned assets:\n" . Dumper( \%assets ) );
}
else {
_debug( $DEBUG >= 2, "No assets found" );
}
#
# Add any assets collected from the episode just processed
#
$assets_added = 0;
if ($asset_count > 0) {
( $assets_added, $assets_deleted, $assets_updated ) =
process_assets( $dbh2, \%assets, 0, $dry_run, $verbose );
}
printf "%-16s %d\n", "Episode checked:", $episode;
printf "%-16s %d\n", "Episode updates:", $updates;
printf "%-16s %d\n", "Assets found:", $asset_count;
printf "%-16s %d\n", "Assets added:", coalesce( $assets_added, 0 );
printf "%-16s %d\n", "Assets deleted:", coalesce( $assets_deleted, 0 );
printf "%-16s %d\n", "Assets updated:", coalesce( $assets_updated, 0 );
$log->info("Episode number: $episode");
$log->info("Episode updates: $updates");
$log->info( 'Assets added,deleted,updated:',
coalesce( $assets_added, 0 ), ',',
coalesce( $assets_deleted, 0 ), ',',
coalesce( $assets_updated, 0 )
);
}
#=== FUNCTION ================================================================
# NAME: load_metadata
# PURPOSE: Loads values from a JSON file which has been generated by the
# 'ia metadata' command and stores them in the 'episodes' table.
# Also uses the JSON data to determine if the derived files and
# the source file have been uploaded and which assets have been
# uploaded.
# PARAMETERS: $dbh Handle for the SQLite database
# $file Name of file of JSON data
# $dry_run Boolean dry run setting
# $verbose Boolean verbose setting
# RETURNS: Nothing
# DESCRIPTION: The input file is expected to consist of many JSON complex
# objects, generated by the command 'ia metadata <identifier>'
# for a list of HPR episodes (see script 'collect_show_data').
# A count is kept of successful updates, and this is reported
# once finished.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub load_metadata {
my ( $dbh, $file, $dry_run, $verbose ) = @_;
my ( $filebuffer, @jsonbuffer, %cols, %fieldvals, %where );
my ( @deletions, $index );
my ( @stash, @assets, $orig_count, $source, $epid, $count, $updates );
my ( $sth1, $sth2, $sth3, $rv, $h );
my $template = "https://archive.org/details/hpr%04d";
my @flds = (
'uploaded', 'has_files',
'with_files', 'with_derived',
'with_source', 'archive_date',
'item_last_updated', 'IA_URL',
);
my $re = qr{^hpr\d{4}\.(flac|mp3|ogg|opus|spx|wav)$};
#
# Read the entire JSON file into a buffer
#
try {
$filebuffer = read_text($file);
}
catch {
die "Failed to read JSON file $file\n";
};
#
# Parse the JSON from the buffer
#
my $json = JSON->new;
@jsonbuffer = $json->incr_parse($filebuffer);
#print Dumper(\@jsonbuffer),"\n";
#
# Check that this is the type of JSON we need
#
unless (@jsonbuffer) {
warn "Empty JSON?\n";
return;
}
#
# Look at each top-level JSON object and check for missing components.
# It's possible for there to be missing objects, but only in very rare
# cases when a range has been requested yet there's a gap in that range
# (reserved slot perhaps). We accumulate indices of invalid objects in
# @jsonbuffer and then delete the invalid objects.
#
$index = 0;
foreach my $obj (@jsonbuffer) {
unless ( exists( $obj->{metadata} )
&& exists( $obj->{metadata}->{identifier} )
&& exists( $obj->{files} ))
{
warn "Invalid JSON? Skipped object #$index\n";
push( @deletions, $index );
next;
}
$index++;
}
#
# Delete any bad objects we found in the previous pass
#
for my $ind (@deletions) {
splice( @jsonbuffer, $ind, 1 );
print STDERR "Deleted invalid item with index $ind\n" if $verbose;
}
print "\n" if $verbose;
#
# Prepare to get the database row relating to an episode
#
$sth1 = $dbh->prepare("SELECT * FROM episodes WHERE id = ?");
#
# Prepare to get the rows relating to assets for an episode
#
$sth2 = $dbh->prepare("SELECT * FROM assets WHERE episode_id = ?");
$count = 0;
foreach my $obj (@jsonbuffer) {
if ( $obj->{metadata}->{identifier} =~ /hpr(\d{4})/ ) {
$epid = $1 + 0;
#
# Skip the episodes in the range 1-620
#
#next if ( $epid <= 620 );
#
# 2021-06-27: We don't skip any more. Shows 1-620 have been
# uploaded in blocks, but we want them on the IA in the same
# format as all the others. Today we hit 620, working backwards
# from the earliest IA upload as a project to upload everything
# with show notes and assets. The existing database entries were
# changed to make the 'notes' field hold the text 'Block upload'
# followed by the block archive date and the URL. Then the rows
# between 1-620 are then to have the new date and URL and these
# columns have been made NULL in preparation. See the Journal for
# more information.
#
if ($verbose) {
print "Processing ", $obj->{metadata}->{identifier}, "\n";
print "Publicdate: ", $obj->{metadata}->{publicdate}, "\n";
}
#
# Process the files in the metadata
#
$orig_count = 0;
$source = 0;
undef(@stash);
foreach my $f ( @{ $obj->{files} } ) {
#
# Skip anything with a 'Metatda' format or which has a source
# of 'derivative' or 'metadata' or is the newly added
# '__ia_thumb.jpg'
#
next
if ( $f->{format} eq 'Metadata'
|| $f->{source} =~ /^(derivative|metadata)$/
|| $f->{name} eq '__ia_thumb.jpg'
|| $f->{name} =~ /^history\/files\// );
#
# Count audio files which have been "derived" (by us) prior to upload,
# and those that look like 'source' files (e.g.
# hpr2472_source.flac). Anything else store in @stash because
# it's probably an asset file.
#
if ( $f->{name} =~ $re ) {
$orig_count++;
}
elsif ( $f->{name} =~ /^hpr\d{4}_source\..+$/ ) {
$source++;
}
else {
push(@stash,$f->{name});
}
printf "%s %s\n", $f->{name}, $f->{source} if $verbose;
}
#
# Check the assets before updating the episode
#
@assets = collect_assets($dbh, $sth2, $epid);
_debug( $DEBUG >= 2, '@assets: ' . Dumper(\@assets));
_debug( $DEBUG >= 2, '@stash: ' . Dumper(\@stash));
if (@assets || @stash) {
@assets = sort(@assets);
@stash = sort(@stash);
#
# Smart match the @assets and @stash arrays to look for
# difference. In @assets we have what the database holds, and
# in @stash we have stored what we found in the JSON.
# NOTE: Now that Smartmatch ('~~') is deprecated we use the
# rather clever 'match::smart' with its strange |M| operator.
# It needs arrays being matched to be offered as references
# however.
#
# unless ( @assets ~~ @stash ) {
unless ( \@assets |M| \@stash ) {
print "Difference between DB assets and IA\n";
print "DB assets:\n ",join("\n ",@assets),"\n";
print "IA assets:\n ",join("\n ",@stash),"\n";
}
#
# In live mode mark all assets belonging to this episode as
# 'uploaded'.
# TODO: Is this dangerous?
#
unless ($dry_run) {
mark_assets_uploaded($dbh,$epid);
}
else {
print "Update of 'assets' table skipped; dry run\n";
}
}
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
#
# Get the current row for this episode for comparison to find what
# has been changed.
#
$rv = $sth1->execute($epid);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h = $sth1->fetchrow_hashref;
#
# Store the changes
#
$cols{'uploaded'} = 1;
$cols{'has_files'} = ( scalar(@stash) > 0 ? 1 : 0 );
$cols{'with_files'} = ( scalar(@stash) > 0 ? 1 : 0 );
$cols{'with_derived'} = ( $orig_count >= 2 ? 1 : 0 );
$cols{'with_source'} = ( $source > 0 ? 1 : 0 );
$cols{'archive_date'} = $obj->{metadata}->{publicdate};
$cols{'item_last_updated'} = coalesce($obj->{item_last_updated},0);
$cols{'IA_URL'} = sprintf( $template, $epid );
#
# Look for differences, storing the updated values in %fieldvals
# and counting the changes.
#
$updates = 0;
undef(%fieldvals);
foreach my $fld (@flds) {
if ( !equal( $h->{$fld}, $cols{$fld} ) ) {
$updates++;
$fieldvals{$fld} = $cols{$fld};
}
}
if ( $updates > 0 ) {
#
# Build the update statement containing changes. It will be:
# "UPDATE episodes SET a = b, c = d, ... WHERE id = $epid"
#
$where{id} = $epid;
my ( $stmt, @bindvals )
= $sql->update( 'episodes', \%fieldvals, \%where );
#
# Perform the updates (unless in dry-run mode)
#
unless ($dry_run) {
$sth3 = $dbh->prepare($stmt);
$sth3->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $epid\n", $dbh->errstr;
}
else {
print "Updated\n" if $verbose;
$count++;
}
}
else {
print "Dry run mode:\n";
print "Statement: $stmt\n";
print "Bindvals: ", join( ", ", @bindvals ), "\n";
}
}
}
else {
printf "Unexpected identifier: %s\n", $obj->{metadata}->{identifier};
}
print '-' x 80, "\n" if $verbose;
}
print "Number of updates: $count\n";
$log->info("Number of updates from JSON: $count");
}
#=== FUNCTION ================================================================
# NAME: collect_assets
# PURPOSE: Collects asset names for a given episode
# PARAMETERS: $dbh Handle for the SQLite database
# $sth Statement handle for the prepared query
# $epid Episode id
# RETURNS: A list of the asset names
# DESCRIPTION: Just prepares a list of asset filenames by querying the SQLite
# 'assets' table and returns it to the caller
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub collect_assets {
my ( $dbh, $sth, $epid ) = @_;
my ( $rv, $h, @result );
#
# Query the database
#
$rv = $sth->execute($epid);
if ( $dbh->err ) {
warn $dbh->errstr;
}
while ($h = $sth->fetchrow_hashref) {
push(@result,$h->{filename});
}
return @result;
}
#=== FUNCTION ================================================================
# NAME: mark_assets_uploaded
# PURPOSE: Marks the assets relating to an episode as 'uploaded'
# PARAMETERS: $dbh Handle for the SQLite database
# $epid Episode id
# RETURNS: Number of updates
# DESCRIPTION: Mark any assets for a given episode as uploaded, if they are
# not so marked. Expected to be used during an update from JSON
# data collected from the IA.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub mark_assets_uploaded {
my ( $dbh, $epid ) = @_;
my ($rows);
$rows = $dbh->do(
q{UPDATE assets SET uploaded = 1
WHERE episode_id = ? AND uploaded = 0},
undef,
$epid,
) or die $dbh->errstr;
$rows += 0;
return $rows;
}
#=== FUNCTION ================================================================
# NAME: scan_dir
# PURPOSE: Scans a file from the HPR server generated with a simple
# 'find' command which lists all the files and directories under
# the ~/www/eps/ directory.
# PARAMETERS: $dbh Handle for the SQLite database
# $file Name of file of directory data
# RETURNS: Nothing
# DESCRIPTION: Under development
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub scan_dir {
my ( $dbh, $file ) = @_;
my ( %dirtree, $ptr, %shows, $key, $id );
#
# Open the file of paths
#
open( my $fh, "<:encoding(UTF-8)", $file )
or die "Unable to open '$file'\n";
#
# Loop through all paths. If it's one of the audio files then skip it
#
while ( my $line = <$fh> ) {
chomp($line);
next if ( $line =~ /(flac|mp3|ogg|opus|spx|wav)$/ );
#
# Point into the tree hash and add the necessary nodes from the split
# path. The result is a hierarchical has structure where the 'leaf'
# node has an empty anonymous hash as its value.
#
# %dirtree = {
# 'www' => {
# 'eps' => {
# 'hpr2356' => {
# 'index.html' => {},
# 'fix-ssh-on-pi.bash.txt' => {}
# },
# },
# },
# }
#
$ptr = \%dirtree;
$ptr = $ptr->{$_} //= {} for split( /\//, $line );
}
close($fh);
#print Dumper(\%dirtree),"\n";
#
# Build a hash indexed by IA identifier containing an array per value with
# the files relating to the show therein
#
$ptr = \%dirtree;
# TODO if there are multiple items at a level which one to test
$key = ( keys(%$ptr) )[0];
#$key = first { defined($_) } keys(%$ptr);
until ( $key =~ /^hpr[0-9]{4}/ ) {
print "$key\n";
$ptr = $ptr->{$key};
$key = ( keys(%$ptr) )[0];
}
#print join(", ",keys(%$ptr));
# TODO Doesn't deal with sub-directories
foreach my $k ( keys(%$ptr) ) {
if ( ($id) = ( $k =~ /^(hpr[0-9]{4})/ ) ) {
unless ( defined( $shows{$id} ) ) {
$shows{$id} = [];
}
push( @{ $shows{$id} }, $k );
}
}
print Dumper( \%shows ), "\n";
}
#=== FUNCTION ================================================================
# NAME: find_assets
# PURPOSE: An alternative way to look for assets; connect to the server
# with SSH and use a 'find' command for matching files and
# directories.
# PARAMETERS: $ssh Net::OpenSSH object previously set up to
# connect to the server
# $episode episode number we're dealing with
# $rempath path to search on the server
# $rfiles hashref to receive the files (assets) found
# RETURNS: Number of files found
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub find_assets {
my ($ssh, $episode, $rempath, $rfiles) = @_;
my $urlbase = 'https://hackerpublicradio.org/eps/';
my (@result, @row, $URL, $file, $assetcount);
$rempath .= '/' unless ($rempath =~ /\/$/);
@result = $ssh->capture({ timeout => 10 },
"find $rempath -regextype egrep -regex '.*/hpr${episode}.*' -printf '%y,%p,%s\n'");
$ssh->error and warn "remote 'find' command failed: " . $ssh->error;
# d,/home/hpr/www/eps/hpr3775,85
# f,/home/hpr/www/eps/hpr3775/hpr3775_source.wav,75006546
# f,/home/hpr/www/eps/hpr3775/hpr3775.vtt,13127
# f,/home/hpr/www/eps/hpr3775/hpr3775.srt,13830
# f,/home/hpr/www/eps/hpr3775/hpr3775.txt,11119
# f,/home/hpr/www/eps/hpr3775.mp3,7449559
# f,/home/hpr/www/eps/hpr3775.ogg,10631998
# f,/home/hpr/www/eps/hpr3775.spx,3516736
$assetcount = 0;
$rfiles->{$episode} = [];
for my $file (@result) {
@row = split( ',', $file );
if ( $row[0] eq 'f' ) {
unless ( $row[1] =~ /$episode\.(mp3|ogg|spx)$/ ) {
( $URL = $row[1] ) =~ s/$rempath/$urlbase/;
( $file = $row[1] ) =~ s/$rempath//;
push(
@{ $rfiles->{$episode} },
{ filename => $file, URL => $URL }
);
$assetcount++;
}
}
}
_debug( $DEBUG >= 2, "Results from 'find':\n" . Dumper( \@result ) );
#
# Return the asset count
#
return $assetcount;
}
#=== FUNCTION ================================================================
# NAME: find_links
# PURPOSE: Parses the HTML in a string for links so that a list of assets
# relating to this show can be made
# PARAMETERS: $episode episode number we're dealing with
# $html string containing HTML
# $rlinks hashref to receive the links found
# RETURNS: Number of links found
# DESCRIPTION: Given HTML from the main notes or a subsidiary file the
# function parses this looking for links in 'a' or 'img' tags.
# Links are standardised, making them absolute if relative and
# removing any 'fragment'. The links need to be to files on the
# HPR website to be of interest. If so the filename part is
# extracted. If it follows the format 'hpr9999' then it's
# checked to see if it's for the current show. If not it's
# ignored. If the filename ends with a '/' then it's assumed
# it's shorthand for 'index.html' so this name is appended. Then
# the link and filename are stashed in the hash referenced by
# $rlinks. If the filename ends with '.html' then we need to
# parse it in turn, so we get the contents of the link and
# recurse to parse it. We return the number of HPR links found
# in the pass through the HTML.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub find_links {
my ( $episode, $html, $rlinks ) = @_;
my ($tree, $epstr, $linkre, $re2, $filepath,
$uri, $slink, $linkcount, $content
);
_debug( $DEBUG >= 3, "find_links enter\n" );
#
# Create a tree object
#
$tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
$tree->store_comments(1);
$tree->warn(1);
$tree->parse_content($html)
or die "HTML::TreeBuilder failed to parse notes: $!\n";
my $baseURL = "https://hackerpublicradio.org";
$epstr = sprintf( "hpr%04d", $episode );
$linkre = qr{
^https?://
(?:www.)?
(?:hacker|hobby)publicradio.org/eps/
(.+)$
}x;
#(?:(hpr$epstr/.+)|(hpr$epstr.+)|(.+))$
#
# Counting new links found and stashed
#
$linkcount = 0;
#
# Scan for links
#
for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
my ( $link, $element, $attr, $tag ) = @$_;
#
# Standardise the link (expands relative URLs, removes any fragment).
# Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
# URIs are removed.
#
local $URI::ABS_REMOTE_LEADING_DOTS = 1;
$uri = URI->new_abs( $link, $baseURL );
$slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );
#
# Is it an HPR link?
#
if ( $slink =~ $linkre ) {
#
# The URL we found might be a link into an HTML file with an
# '#anchor' component ("fragment"). Save the last bracketed match,
# without any 'fragment' if there is one to get a clean filename
# or path.
#
( $filepath = "$+" ) =~ s/#.*$//;
_debug( $DEBUG >= 3, "Link: $slink\n" );
_debug( $DEBUG >= 3, "File path: $filepath\n" );
#
# Does this file path begin with an 'hpr' prefix? If so is it the
# show id? If not we don't want to process it.
#
if ( $filepath =~ /^(hpr[0-9]{1,4})/ ) {
if ( $1 ne $epstr ) {
_debug( $DEBUG >= 3, "Ignored $slink\n" );
next;
}
}
#
# The path and URL might end with a slash which means the URL is
# relying on the Web server to fill in the filename as
# 'index.html'. We have to make this explicit.
#
if ( $slink =~ /\/$/ ) {
$slink .= 'index.html';
$filepath .= 'index.html';
}
#
# Initialise this hash element if needed
#
unless ( exists( $rlinks->{$episode} ) ) {
$rlinks->{$episode} = [];
}
#
# Stash this filename if it's not already stashed, and if it's
# HTML get the link and recurse
#
unless (
any { $_->{filename} eq $filepath } @{ $rlinks->{$episode} }
)
{
_debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" );
push(
@{ $rlinks->{$episode} },
{ filename => $filepath, URL => $slink }
);
$linkcount++;
#
# An HTML file has to be investigated
#
if ( $filepath =~ /\.html$/ ) {
$content = get($slink);
unless ( defined($content) ) {
carp "Link $slink returned nothing\n";
}
else {
$linkcount
+= find_links( $episode, $content, $rlinks );
}
}
}
}
}
_debug( $DEBUG >= 3, "find_links exiting with $linkcount links\n" );
#
# Return the link count
#
return $linkcount;
}
#=== FUNCTION ================================================================
# NAME: count_rows
# PURPOSE: Counts the rows in a given SQLite table
# PARAMETERS: $dbh Handle for the SQLite database
# $table Name of table
# RETURNS: Number of rows
# DESCRIPTION: Builds SQL from a template including the selected $table.
# Returns the number of rows once the query has been run.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub count_rows {
my ( $dbh, $table ) = @_;
my ( $sql, $sth, $h, $count );
#
# Count rows in the selected SQLite table
#
$sql = "SELECT count(*) AS rowcount FROM $table";
$sth = $dbh->prepare($sql);
$sth->execute();
if ( $dbh->err ) {
croak $dbh->errstr;
}
$count = 0;
if ($h = $sth->fetchrow_hashref) {
$count = $h->{rowcount};
}
return $count;
}
#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the first defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument or undef if there are none
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
# returned when doing so might be a problem.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub coalesce {
foreach (@_) {
return $_ if defined($_);
}
return undef; ## no critic
}
#=== FUNCTION ================================================================
# NAME: assets_match
# PURPOSE: To compare found assets and those in the database
# PARAMETERS: $found_obj an asset object (hashref) found by parsing show
# notes
# $db_obj an asset object (hashref) from the database
# RETURNS: Value 1 or 2 if they match, otherwise zero
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub assets_match {
my ( $found_obj, $db_obj ) = @_;
return 0 if ( !defined($found_obj) || !defined($db_obj) );
my ( $t1, $t2 ) = (
( $found_obj->{filename} eq $db_obj->{filename} ),
( $found_obj->{URL} eq $db_obj->{URL} )
);
return 2 if ( $t1 && $t2 );
return 1 if ( $t1 || $t2 );
}
#=== FUNCTION ================================================================
# NAME: equal
# PURPOSE: Compare two strings even if undefined
# PARAMETERS: $s1 The first string
# $s2 The second string
# RETURNS: True if both strings are undefined, false if one isn't
# defined, otherwise the result of comparing them.
# DESCRIPTION: Works on the principle that two undefined strings are equal,
# a defined and an undefined string are not, and otherwise they
# are equal if they are equal!
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub equal {
my ( $s1, $s2 ) = @_;
return 1 if ( !defined($s1) && !defined($s2) );
return 0 if ( !defined($s1) || !defined($s2) );
return ( $s1 eq $s2 );
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $message Message to print
# RETURNS: Nothing
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
# newline and then adds one in the 'print' to the caller doesn't
# have to bother. Prepends the message with 'D> ' to show it's
# a debug message.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my ( $active, $message ) = @_;
chomp($message);
print "D> $message\n" if $active;
}
#=== FUNCTION ================================================================
# NAME: Usage
# PURPOSE: Displays a usage message and exit
# PARAMETERS: None
# RETURNS: To command line level with exit value 1
# DESCRIPTION: Builds the usage message using global values
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
#sub Usage {
# print STDERR <<EOD;
#
#Usage: $PROG [options] directory
#
#$PROG v$VERSION
#
# -help Display this information
# -[no]dry-run Enable/disable dry run mode (default off)
# -[no]verbose Enable/disable verbose mode (default off)
# -debug=N Set the debug level
# -mode=MODE Selects the mode from the following list:
# - initialise: rebuild the SQLite database from MySQL
# (deletes any data already uploaded)
# - update: updates the SQLite database from updates
# found in the MySQL database
# - json: load the 'archive_date' field from a JSON
# file. Also sets the 'uploaded' flag and populates
# the 'IA_URL' field
# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
# - scandir: scans a file captured from the HPR server
# by running a 'find' command on the 'eps' directory
# where all the show files reside. This file is
# scanned to determine which shows have files other
# than the audio so we can keep a tally of which shows
# have had these uploaded to the IA
# ** Not fully implemented **
# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
# -episode=N1 [-episode=N2 ...]
# In -mode=update defines one or more single episodes to
# process otherwise, if this option is not given, all
# new episodes in the database are processed. The
# episodes are expected to have been already added if
# the option is used, and the 'title' and 'summary'
# fields and the assets (if any) are to be processed.
# -range=N1 [-range=N2]
# An alternative way of defining existing shows to be
# processed in -mode=update. The option can only be
# repeated twice. The smaller number defines the start
# of the range, and the larger one the end of the range.
# So -range=834 -range=840 defines the range 836-840. If
# one number is given (or two the same) this means the
# same as defining one episode. The episodes are
# expected to have been already added and the 'title'
# and 'summary' fields and the assets (if any) are to be
# processed.
# -json=FILE Defines the file to use for -mode=json
# -scandir=FILE Defines a file to use for -mode=scandir
# -dbconfig=FILE Defines an alternative MySQL configuration file
#
#EOD
# exit(1);
#}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION: Process the options we want to offer. See the documentation
# for details
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "documentation|man",
"debug=i", "mode=s",
"json=s", "scandir=s",
"episode=i@", "range=i@",
"dry-run!", "verbose!",
"dbconfig=s",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage(
-msg => "$PROG version $VERSION\n",
-exitval => 1,
-verbose => 0
);
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
upload_manager - manages a SQLite database of details of shows on archive.org
=head1 VERSION
This documentation refers to upload_manager version 0.2.19
=head1 USAGE
./upload_manager [-help] [-documentation] [-[no]dry-run] [-[no]verbose]
[-debug=N] -mode=MODE [-episode=N1 [-episode=N2 ...]] [-range=START
[-range=END]] [-json=FILE] [-scandir=FILE] [-dbconfig=FILE]
=head1 OPTIONS
=over 8
=item B<-help>
Display a brief summary of how to use this script.
=item B<-documentation> or B<-man>
Display the entirety of the internal documentation in the form of a manual
page.
=item B<-[no]dry-run>
Enable/disable dry run mode (default off)
=item B<-[no]verbose>
Enable/disable verbose mode (default off)
=item B<-debug=N>
Sets the debug level:
=over 4
=item B<0>
This is the default value, and causes no debug output to be generated.
=item B<1>
TBA
=item B<2>
Produces quite a lot of asset-related output. All of the output from the lower debug levels is
displayed as well as:
=over 4
=item .
Contents of the %assets hash. This is generated when scanning a show in the
MySQL database in order to find any associated assets.
=item .
The episode number being checked and the number of assets found while
processing assets. A dump of the assets from the SQLite database is shown as
well as the count of assets found therre.
=item .
If there are assets related to a show and there are already some in the SQLite
database, a comparison is made to see if any updates are needed. These
differences are listed.
=item .
If new assets are being added, these are reported from the hash
%assets_by_file.
=item .
Assets are added using a multiple INSERT statement, and details of what is to
be inserted is shown in the array @multiple_insert.
=back
=item B<3>
Produces the most output. All of the output from the lower debug levels is
displayed as well as:
=over 4
=item .
Contents of the @episodes array. This is generated by using the B<-episode=N>
option once or many times. It is shown before and after the removal of
duplicates.
=item .
Contents of the @range array. This is generated by using the B<-range=N>
option once or twice.
=back
=back
=item B<-mode=MODE>
Selects the mode of operation of the script from the following list:
=over 4
=item B<initialise>
Rebuilds the SQLite database from the MySQL data. Use with caution, though
the script will not initialise tables which contain data.
=item B<update>
Updates the SQLite database from updates found in the MySQL database. Exactly
how this is done is controlled by other options. See below for details.
=item B<json>
Load the 'archive_date' field from a JSON file. Also sets the 'uploaded' flag
and populates the 'IA_URL' and 'item_last_updated' fields.
=item B<scandir>
Scans a file captured from the HPR server by running a 'find' command on the
'eps' directory where all the show files reside. This file is scanned to
determine which shows have files other than the audio so we can keep a tally
of which shows have had these uploaded to the IA
** Not fully implemented; may be phased out **
=back
=item B<-episode=N1 [-episode=N2 ...]>
In B<-mode=update> defines one or more single episodes to process otherwise, if
this option is not given, all new episodes in the database are processed. The
episodes are expected to have been already added if this option is used, and
the 'title' and 'summary' fields and the assets (if any) are to be processed.
=item B<-range=N1 [-range=N2]>
An alternative way of defining existing shows to be processed in
B<-mode=update>. The option can only be repeated twice. The smaller number
defines the start of the range, and the larger one the end of the range. So
B<-range=834 -range=840> defines the range 836-840. If one number is given (or
two the same) this means the same as defining one episode. The episodes are
expected to have been already added and the 'title' and 'summary' fields and
the assets (if any) are to be processed.
=item B<-json=FILE>
Defines the file to use for B<-mode=json>. The contents of the file are
expected to be those produced by running B<collect_show_data>.
=item B<-scandir=FILE>
Defines a file to use for B<-mode=scandir>. This feature is not fully
implemented at the moment.
=item B<-dbconfig=FILE>
Defines an alternative MySQL configuration file
=back
=head1 DESCRIPTION
A full description of the application and its features.
May include numerous subsections (i.e. =head2, =head3, etc.)
=head1 DIAGNOSTICS
A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.
=head1 CONFIGURATION AND ENVIRONMENT
A full explanation of any configuration system(s) used by the application,
including the names and locations of any configuration files, and the
meaning of any environment variables or properties that can be set. These
descriptions must also include details of any configuration language used
=head1 DEPENDENCIES
A list of all the other modules that this module relies upon, including any
restrictions on versions, and an indication whether these required modules are
part of the standard Perl distribution, part of the module's distribution,
or must be installed separately.
=head1 INCOMPATIBILITIES
A list of any modules that this module cannot be used in conjunction with.
This may be due to name conflicts in the interface, or competition for
system or program resources, or due to internal limitations of Perl
(for example, many modules that use source code filters are mutually
incompatible).
=head1 BUGS AND LIMITATIONS
A list of known problems with the module, together with some indication
whether they are likely to be fixed in an upcoming release.
Also a list of restrictions on the features the module does provide:
data types that cannot be handled, performance issues and the circumstances
in which they may arise, practical limitations on the size of data sets,
special cases that are not (yet) handled, etc.
The initial template usually just has:
There are no known bugs in this module.
Please report problems to <Maintainer name(s)> (<contact address>)
Patches are welcome.
=head1 AUTHOR
<Author name(s)> (<contact address>)
=head1 LICENCE AND COPYRIGHT
Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
Followed by whatever licence you wish to release it under.
For Perl code that is often just:
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker