2804 lines
92 KiB
Plaintext
2804 lines
92 KiB
Plaintext
|
#!/usr/bin/env perl
|
||
|
#===============================================================================
|
||
|
#
|
||
|
# FILE: upload_manager
|
||
|
#
|
||
|
# USAGE: ./upload_manager [-help] [-[no]dry-run] [-[no]verbose]
|
||
|
# [-debug=N] -mode=MODE [-episode=N1 [-episode=N2 ...]]
|
||
|
# [-range=START [-range=END]] [-json=FILE] [-scandir=FILE]
|
||
|
# [-dbconfig=FILE]
|
||
|
#
|
||
|
# DESCRIPTION: Collect details about HPR shows and the copies on Archive.org
|
||
|
# and keep them in a separate SQLite database (for
|
||
|
# the moment). This database is used to populate the
|
||
|
# developing PostgreSQL Database (which is not currently
|
||
|
# progressing very much).
|
||
|
#
|
||
|
# OPTIONS: ---
|
||
|
# REQUIREMENTS: ---
|
||
|
# BUGS: ---
|
||
|
# NOTES: 2022-02-15 Had to revert to DBD::MySQL temporarily.
|
||
|
# 2023-06-12 This code is no longer valid. It expects the main
|
||
|
# HTML to point to files on the server (if there are "assets")
|
||
|
# and recurses through extra HTML it finds there. It currently
|
||
|
# looks for file details in any sub-directory "/eps/hpr1234/".
|
||
|
# Now there are no files, so the details of all assets
|
||
|
# (including the audio) needs to be in the 'assets' table. Not
|
||
|
# sure we are there yet.
|
||
|
# 2024-03-08 Smartmatch is deprecated; moved to 'match::smart'.
|
||
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||
|
# VERSION: 0.2.19
|
||
|
# CREATED: 2017-06-14 10:50:28
|
||
|
# REVISION: 2024-03-08 20:49:51
|
||
|
#
|
||
|
#===============================================================================
|
||
|
|
||
|
use 5.010;
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use utf8;
|
||
|
#use experimental 'smartmatch';
|
||
|
|
||
|
use match::smart;
|
||
|
|
||
|
use Carp;
|
||
|
use Getopt::Long;
|
||
|
use Pod::Usage;
|
||
|
|
||
|
use Config::General;
|
||
|
use File::Slurper qw{ read_text read_lines };
|
||
|
use JSON;
|
||
|
use Net::OpenSSH;
|
||
|
#use TryCatch; # Broke in late May 2020 due to a problem with Devel::Declare
|
||
|
use Try::Tiny;
|
||
|
use Log::Handler;
|
||
|
|
||
|
use HTML::TreeBuilder 5 -weak;
|
||
|
use HTML::Entities;
|
||
|
use List::Util qw{ min max };
|
||
|
use List::MoreUtils qw{ any };
|
||
|
use LWP::Simple;
|
||
|
|
||
|
use DBI;
|
||
|
use SQL::Abstract;
|
||
|
use SQL::Abstract::Plugin::InsertMulti;
|
||
|
|
||
|
use Data::Dumper;
|
||
|
|
||
|
#
|
||
|
# Version number (manually incremented)
|
||
|
#
|
||
|
our $VERSION = '0.2.19';
|
||
|
|
||
|
#
|
||
|
# Script and directory names
|
||
|
#
|
||
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
||
|
#( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
||
|
#$DIR = '.' unless $DIR;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Declarations
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Constants and other declarations
|
||
|
#
|
||
|
my $basedir = "$ENV{HOME}/HPR/IA";
|
||
|
my $logdir = "$basedir/logs";
|
||
|
my $logfile = "$logdir/${PROG}.log";
|
||
|
my $configfile = "$basedir/.$PROG.cfg";
|
||
|
my $db1configfile = "$basedir/.hpr_db.cfg";
|
||
|
my $database2 = "$basedir/ia.db";
|
||
|
|
||
|
my ( $dbh1, $dbh2, $sth1, $h1 );
|
||
|
|
||
|
#
|
||
|
# NOTE: Program config not currently used
|
||
|
#
|
||
|
die "Configuration file $configfile not found\n" unless ( -e $configfile );
|
||
|
|
||
|
my $conf = Config::General->new(
|
||
|
-ConfigFile => $configfile,
|
||
|
-InterPolateVars => 1,
|
||
|
-ExtendedAccess => 1,
|
||
|
);
|
||
|
my %config = $conf->getall();
|
||
|
|
||
|
#
|
||
|
# Enable Unicode mode
|
||
|
#
|
||
|
binmode STDOUT, ":encoding(UTF-8)";
|
||
|
binmode STDERR, ":encoding(UTF-8)";
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Options
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Option defaults
|
||
|
#
|
||
|
my $DEFDEBUG = 0;
|
||
|
|
||
|
#
|
||
|
# Options and arguments
|
||
|
#
|
||
|
my %options;
|
||
|
Options( \%options );
|
||
|
|
||
|
#
|
||
|
# Default help shows minimal information
|
||
|
#
|
||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
|
||
|
if ( $options{'help'} );
|
||
|
|
||
|
#
|
||
|
# The -documentation or -man option shows the full POD documentation through
|
||
|
# a pager for convenience
|
||
|
#
|
||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
|
||
|
if ( $options{'documentation'} );
|
||
|
|
||
|
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
|
||
|
|
||
|
my $db1cfgfile
|
||
|
= ( defined( $options{dbconfig} ) ? $options{dbconfig} : $db1configfile );
|
||
|
|
||
|
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
|
||
|
my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
|
||
|
|
||
|
my $mode = $options{mode};
|
||
|
die "Mode is required\n" unless defined($mode);
|
||
|
$mode = lc($mode);
|
||
|
|
||
|
my @episodes;
|
||
|
@episodes = @{ $options{episode} } if ( defined( $options{episode} ) );
|
||
|
if ($mode ne 'update' and @episodes) {
|
||
|
die "The -episode=N options are only for use in update mode\n";
|
||
|
}
|
||
|
|
||
|
my @range;
|
||
|
@range = @{ $options{range} } if ( defined( $options{range} ) );
|
||
|
if ($mode ne 'update' and @range) {
|
||
|
die "The -range=N options are only for use in update mode\n";
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 3, '@episodes: ' . Dumper( \@episodes ) );
|
||
|
_debug( $DEBUG >= 3, '@range: ' . Dumper( \@range ) );
|
||
|
|
||
|
#
|
||
|
# Check -episode=N versus -range=N. We can't have both
|
||
|
#
|
||
|
die "Choose only -episode=N or -range=N, not both\n"
|
||
|
if ( defined( $options{episode} ) && defined( $options{range} ) );
|
||
|
|
||
|
#
|
||
|
# Only two range values
|
||
|
#
|
||
|
die "-range=N should not be used more than twice\n" if ($#range gt 2);
|
||
|
|
||
|
#
|
||
|
# Turn the range into a list of episodes
|
||
|
#
|
||
|
if (defined($options{range})) {
|
||
|
@episodes = ( min(@range) .. max(@range) );
|
||
|
}
|
||
|
|
||
|
@episodes = map { sprintf('%04d',$_)} @episodes;
|
||
|
|
||
|
_debug( $DEBUG >= 3, '@episodes: ' . Dumper(\@episodes) );
|
||
|
|
||
|
die "Too many episodes requested\n" if (scalar(@episodes) > 20);
|
||
|
|
||
|
my $json = $options{json};
|
||
|
if ( $mode eq 'json' ) {
|
||
|
die "JSON file is required\n" unless defined($json);
|
||
|
die "Unable to find file '$json'\n" unless -e $json;
|
||
|
}
|
||
|
|
||
|
my $dirfile = $options{scandir};
|
||
|
if ( $mode eq 'scandir' ) {
|
||
|
die "Directory listing file is required\n" unless defined($dirfile);
|
||
|
die "Unable to find file '$dirfile'\n" unless -e $dirfile;
|
||
|
}
|
||
|
|
||
|
die "Configuration file $db1cfgfile not found\n" unless ( -e $db1cfgfile );
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Set up logging keeping the default log layout except for the date. The format
|
||
|
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is
|
||
|
# the message.
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $log = Log::Handler->new();
|
||
|
|
||
|
$log->add(
|
||
|
file => {
|
||
|
timeformat => "%Y-%m-%d %H:%M:%S",
|
||
|
filename => $logfile,
|
||
|
minlevel => 0,
|
||
|
maxlevel => 7,
|
||
|
}
|
||
|
);
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Load MySQL configuration data
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $db1conf = Config::General->new(
|
||
|
-ConfigFile => $db1cfgfile,
|
||
|
-InterPolateVars => 1,
|
||
|
-ExtendedAccess => 1,
|
||
|
);
|
||
|
my %db1cfg = $db1conf->getall();
|
||
|
|
||
|
if ($verbose) {
|
||
|
print "MySQL database configuration chosen: $db1cfgfile\n";
|
||
|
print "Dry run\n" if $dry_run;
|
||
|
}
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Connect to the MariaDB database [reverted to mysql temporarily]
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $dbhost = $db1cfg{database}->{host} // '127.0.0.1';
|
||
|
my $dbport = $db1cfg{database}->{port} // 3306;
|
||
|
my $dbname = $db1cfg{database}->{name};
|
||
|
my $dbuser = $db1cfg{database}->{user};
|
||
|
my $dbpwd = $db1cfg{database}->{password};
|
||
|
#$dbh1 = DBI->connect( "DBI:MariaDB:database=$dbname;host=$dbhost;port=$dbport",
|
||
|
# $dbuser, $dbpwd, { AutoCommit => 1 } )
|
||
|
# or croak $DBI::errstr;
|
||
|
$dbh1 = DBI->connect( "DBI:mysql:database=$dbname;host=$dbhost;port=$dbport",
|
||
|
$dbuser, $dbpwd, { AutoCommit => 1 } )
|
||
|
or croak $DBI::errstr;
|
||
|
|
||
|
#
|
||
|
# Enable client-side UTF8 [temporary]
|
||
|
#
|
||
|
$dbh1->{mysql_enable_utf8} = 1;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Connect to the SQLite database
|
||
|
#-------------------------------------------------------------------------------
|
||
|
$dbh2 = DBI->connect( "dbi:SQLite:dbname=$database2", "", "" );
|
||
|
|
||
|
#
|
||
|
# Log the starting conditions
|
||
|
#
|
||
|
$log->info("Starting $PROG version $VERSION");
|
||
|
if (@episodes) {
|
||
|
$log->info('Episodes:', join( ", ", @episodes ) );
|
||
|
}
|
||
|
else {
|
||
|
$log->info('Looking for new episodes');
|
||
|
}
|
||
|
$log->info("Mode: $mode");
|
||
|
if ($dry_run) {
|
||
|
$log->info('Dry run: Yes');
|
||
|
}
|
||
|
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Set up for connecting to the server with SSH
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $remhost = 'hackerpublicradio.org';
|
||
|
#my $remport = 22074;
|
||
|
my $remport = 22;
|
||
|
my $remuser = 'hpr';
|
||
|
my $rempath = '/home/hpr/www/eps/';
|
||
|
|
||
|
my $ssh = Net::OpenSSH->new( $remhost, user => $remuser, port => $remport );
|
||
|
$ssh->error and die "Can't ssh to $remhost: " . $ssh->error;
|
||
|
|
||
|
$log->info("Connected to ${remuser}\@${remhost}, port $remport");
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Perform the action requested by -mode=X
|
||
|
#-------------------------------------------------------------------------------
|
||
|
if ( $mode eq 'initialise' ) {
|
||
|
print "Initialise mode\n";
|
||
|
initialise( $dbh1, $dbh2 );
|
||
|
}
|
||
|
elsif ( $mode eq 'update' ) {
|
||
|
print "Update mode\n";
|
||
|
|
||
|
#
|
||
|
# Range updating is achieved by running the single-shot mode repeatedly
|
||
|
#
|
||
|
if ( @episodes ) {
|
||
|
foreach my $episode (@episodes) {
|
||
|
update_episode( $dbh1, $dbh2, $episode, $dry_run, $verbose );
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
update_db( $dbh1, $dbh2, $dry_run, $verbose );
|
||
|
}
|
||
|
}
|
||
|
elsif ( $mode eq 'json' ) {
|
||
|
print "JSON mode\n";
|
||
|
load_metadata( $dbh2, $json, $dry_run, $verbose );
|
||
|
#load_upload_dates( $dbh2, $json );
|
||
|
}
|
||
|
elsif ( $mode eq 'scandir' ) {
|
||
|
print "Scan directory mode\n";
|
||
|
scan_dir( $dbh2, $dirfile );
|
||
|
}
|
||
|
else {
|
||
|
die "Invalid mode: $mode\n";
|
||
|
}
|
||
|
|
||
|
exit;
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: initialise
|
||
|
# PURPOSE: Initialise an empty database
|
||
|
# PARAMETERS: $dbh1 Handle for the MySQL database
|
||
|
# $dbh2 Handle for the SQLite database
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Runs the various initialisation functions, but only if the
|
||
|
# database tables 'episodes' and/or 'assets' are empty
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub initialise {
|
||
|
my ( $dbh1, $dbh2 ) = @_;
|
||
|
|
||
|
my ($count);
|
||
|
|
||
|
#
|
||
|
# Initialise the 'episodes' table if it's empty
|
||
|
#
|
||
|
$count = count_rows( $dbh2, 'episodes' );
|
||
|
if ( $count == 0 ) {
|
||
|
print "Initialising the 'episodes' table\n";
|
||
|
initialise_episodes( $dbh1, $dbh2 );
|
||
|
}
|
||
|
else {
|
||
|
print "Did not initialise the 'episodes' table ($count rows)\n";
|
||
|
$log->info("Did not initialise the 'episodes' table ($count rows)");
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Initialise the 'assets' table if it's empty
|
||
|
#
|
||
|
$count = count_rows( $dbh2, 'assets' );
|
||
|
if ( $count == 0 ) {
|
||
|
print "Initialising the 'assets' table\n";
|
||
|
initialise_assets( $dbh1, $dbh2 );
|
||
|
}
|
||
|
else {
|
||
|
print "Did not initialise the 'assets' table ($count rows)\n";
|
||
|
$log->info("Did not initialise the 'assets' table ($count rows)");
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: initialise_episodes
|
||
|
# PURPOSE: Initialise an empty 'episodes' table
|
||
|
# PARAMETERS: $dbh1 Handle for the MySQL database
|
||
|
# $dbh2 Handle for the SQLite database
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Assuming the episodes table is empty loads all episodes from
|
||
|
# the MariaDB database
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub initialise_episodes {
|
||
|
my ( $dbh1, $dbh2 ) = @_;
|
||
|
|
||
|
my ( $sql1, $sth1, $h1, $count );
|
||
|
|
||
|
$sql1 = q{
|
||
|
SELECT
|
||
|
id,
|
||
|
date,
|
||
|
title,
|
||
|
summary
|
||
|
FROM eps
|
||
|
};
|
||
|
|
||
|
$sth1 = $dbh1->prepare($sql1);
|
||
|
|
||
|
$sth1->execute();
|
||
|
if ( $dbh1->err ) {
|
||
|
croak $dbh1->errstr;
|
||
|
}
|
||
|
|
||
|
$count = 0;
|
||
|
while ( $h1 = $sth1->fetchrow_hashref ) {
|
||
|
#
|
||
|
# Fields are:
|
||
|
# id integer,
|
||
|
# rdate date NOT NULL,
|
||
|
# title varchar ( 100 ) NOT NULL,
|
||
|
# summary varchar ( 100 ) NOT NULL,
|
||
|
# uploaded integer DEFAULT 0,
|
||
|
# has_files INTEGER DEFAULT 0,
|
||
|
# with_files INTEGER DEFAULT 0,
|
||
|
# with_derived INTEGER DEFAULT 0,
|
||
|
# with_source INTEGER DEFAULT 0,
|
||
|
# archive_date date,
|
||
|
# item_last_updated integer
|
||
|
# IA_URL text,
|
||
|
# notes text,
|
||
|
#
|
||
|
#>>>
|
||
|
$dbh2->do(
|
||
|
q{INSERT INTO episodes (id, rdate, title, summary) VALUES(?,?,?,?)},
|
||
|
undef,
|
||
|
$h1->{id},
|
||
|
$h1->{date},
|
||
|
$h1->{title},
|
||
|
$h1->{summary},
|
||
|
) or die $dbh2->errstr;
|
||
|
#<<<
|
||
|
|
||
|
$count++;
|
||
|
}
|
||
|
|
||
|
print "Records added: $count\n";
|
||
|
$log->info("Records added to 'episodes' table: $count");
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: initialise_assets
|
||
|
# PURPOSE: Initialise an empty 'assets' table (during database
|
||
|
# initialisation)
|
||
|
# PARAMETERS: $dbh1 Handle for the MariaDB database
|
||
|
# $dbh2 Handle for the SQLite database
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Assuming the assets table is empty loads all assets (links to
|
||
|
# files on the HPR server) into the table
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub initialise_assets {
|
||
|
my ( $dbh1, $dbh2 ) = @_;
|
||
|
|
||
|
my ( $sql1, $sth1, $h1, $count, $links, $id, $episode, $notes );
|
||
|
my %assets;
|
||
|
|
||
|
#
|
||
|
# Collect the notes for shows that have links to other HPR files
|
||
|
#
|
||
|
$sql1 = q{
|
||
|
SELECT
|
||
|
id,
|
||
|
notes
|
||
|
FROM eps
|
||
|
WHERE notes REGEXP 'https?://(www.)?(hacker|hobby)publicradio.org/eps/'
|
||
|
};
|
||
|
# LIMIT 20 OFFSET 30
|
||
|
|
||
|
$sth1 = $dbh1->prepare($sql1);
|
||
|
|
||
|
$sth1->execute();
|
||
|
if ( $dbh1->err ) {
|
||
|
croak $dbh1->errstr;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Process notes one show at a time, recursing into HTML where necessary
|
||
|
# and building the %assets hash
|
||
|
#
|
||
|
$count = 0;
|
||
|
while ( $h1 = $sth1->fetchrow_hashref ) {
|
||
|
$links = 0;
|
||
|
|
||
|
$id = $h1->{id};
|
||
|
$episode = sprintf("%04d", $id);
|
||
|
$notes = $h1->{notes};
|
||
|
|
||
|
print "Episode: $id\n";
|
||
|
#$links = find_links($id,$notes,\%assets);
|
||
|
$links = find_assets($ssh,$episode,$rempath,\%assets);
|
||
|
print "Links found: $links\n";
|
||
|
}
|
||
|
|
||
|
#print Dumper(\%assets),"\n";
|
||
|
|
||
|
#
|
||
|
# Use the %assets hash to populate the 'assets' table
|
||
|
#
|
||
|
foreach my $key (sort(keys(%assets))) {
|
||
|
foreach my $obj (@{$assets{$key}}) {
|
||
|
#>>>
|
||
|
$dbh2->do(
|
||
|
q{INSERT INTO assets (episode_id, URL, filename) VALUES(?,?,?)},
|
||
|
undef,
|
||
|
$key,
|
||
|
$obj->{URL},
|
||
|
$obj->{filename},
|
||
|
) or die $dbh2->errstr;
|
||
|
#<<<
|
||
|
|
||
|
$count++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
print "Records added: $count\n";
|
||
|
$log->info("Records added to 'assets' table: $count");
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: update_db
|
||
|
# PURPOSE: Loads any additional rows from the MySQL database to the
|
||
|
# SQLite one.
|
||
|
# PARAMETERS: $dbh1 Handle for the MySQL database
|
||
|
# $dbh2 Handle for the SQLite database
|
||
|
# $dry_run Boolean dry run setting
|
||
|
# $verbose Boolean verbose setting
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Finds the first gap in the 'id' field of the 'episodes' table
|
||
|
# in the SQLite database. This is the starting show number it
|
||
|
# uses when querying the MySQL database. The function loops over
|
||
|
# the 'eps' table in the MySQL database returning details for
|
||
|
# storage in the SQLite database. Before storing it double
|
||
|
# checks that the row in question does not exist.
|
||
|
# TODO
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub update_db {
|
||
|
my ( $dbh1, $dbh2, $dry_run, $verbose ) = @_;
|
||
|
|
||
|
my ( $sql1, $sql2, $sth1, $h1, $sth2, $h2 );
|
||
|
my ( $ep_count, $ffree);
|
||
|
my ( %assets, $asset_count, $assets_added, $assets_deleted, $assets_updated );
|
||
|
my ( $id, $episode, $notes, $links );
|
||
|
|
||
|
print "Updating SQLite database from MySQL\n" if $verbose;
|
||
|
|
||
|
#
|
||
|
# Find the first episode id gap in the SQLite database
|
||
|
#
|
||
|
$sql1 = q{
|
||
|
SELECT MIN (id) + 1 AS first_free
|
||
|
FROM episodes t1
|
||
|
WHERE NOT EXISTS (
|
||
|
SELECT *
|
||
|
FROM episodes t2
|
||
|
WHERE t1.id + 1 = t2.id)
|
||
|
};
|
||
|
|
||
|
$sth1 = $dbh2->prepare($sql1);
|
||
|
|
||
|
$sth1->execute();
|
||
|
if ( $dbh1->err ) {
|
||
|
croak $dbh1->errstr;
|
||
|
}
|
||
|
|
||
|
$h1 = $sth1->fetchrow_hashref;
|
||
|
$ffree = $h1->{first_free};
|
||
|
print "First free id: $ffree\n" if $verbose;
|
||
|
|
||
|
#
|
||
|
# SQL to find new rows in the MySQL database
|
||
|
#
|
||
|
$sql1 = q{
|
||
|
SELECT
|
||
|
id,
|
||
|
date,
|
||
|
title,
|
||
|
summary,
|
||
|
notes
|
||
|
FROM eps
|
||
|
WHERE id >= ?
|
||
|
};
|
||
|
|
||
|
#
|
||
|
# SQL to look for pre-existing episodes in the SQLite database
|
||
|
#
|
||
|
$sql2 = q{
|
||
|
SELECT * FROM episodes WHERE id = ?
|
||
|
};
|
||
|
|
||
|
#
|
||
|
# Prepare the SQL statements
|
||
|
#
|
||
|
$sth1 = $dbh1->prepare($sql1);
|
||
|
$sth2 = $dbh2->prepare($sql2);
|
||
|
|
||
|
#
|
||
|
# Query MySQL, from $ffree onwards
|
||
|
#
|
||
|
$sth1->execute($ffree);
|
||
|
if ( $dbh1->err ) {
|
||
|
croak $dbh1->errstr;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Loop through the MySQL rows
|
||
|
#
|
||
|
$ep_count = $asset_count = 0;
|
||
|
while ( $h1 = $sth1->fetchrow_hashref ) {
|
||
|
#
|
||
|
# Query the SQLite database for the episode number we just got
|
||
|
#
|
||
|
$sth2->execute( $h1->{id} );
|
||
|
if ( $dbh2->err ) {
|
||
|
croak $dbh2->errstr;
|
||
|
}
|
||
|
$h2 = $sth2->fetchrow_hashref;
|
||
|
|
||
|
#
|
||
|
# If the row isn't already there create it
|
||
|
#
|
||
|
unless ($h2) {
|
||
|
unless ($dry_run) {
|
||
|
#>>>
|
||
|
$dbh2->do(
|
||
|
q{INSERT INTO episodes (id, rdate, title, summary) VALUES(?,?,?,?)},
|
||
|
undef,
|
||
|
$h1->{id},
|
||
|
$h1->{date},
|
||
|
$h1->{title},
|
||
|
$h1->{summary},
|
||
|
) or die $dbh2->errstr;
|
||
|
#<<<
|
||
|
|
||
|
printf "Added episode: %d\n", $h1->{id} if $verbose;
|
||
|
|
||
|
$ep_count++;
|
||
|
}
|
||
|
else {
|
||
|
printf
|
||
|
"Would have inserted a row into the 'episodes' table (%s)\n",
|
||
|
$h1->{id};
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Check this show for "assets" (whether we have the episode already or
|
||
|
# not). Even in dry-run mode this gets executed
|
||
|
#
|
||
|
$links = 0;
|
||
|
|
||
|
$id = $h1->{id};
|
||
|
$episode = sprintf("%04d", $id);
|
||
|
# $notes = $h1->{notes};
|
||
|
# $links = find_links($id,$notes,\%assets);
|
||
|
$links = find_assets($ssh,$episode,$rempath,\%assets);
|
||
|
|
||
|
$asset_count += $links;
|
||
|
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# In debug mode show what assets were collected as we iterated through the
|
||
|
# new shows.
|
||
|
#
|
||
|
_debug( $DEBUG >= 2, '%assets: ' . Dumper(\%assets) );
|
||
|
|
||
|
#
|
||
|
# Add any assets collected from the episodes just processed
|
||
|
#
|
||
|
$assets_added = 0;
|
||
|
if ($asset_count > 0) {
|
||
|
( $assets_added, $assets_deleted, $assets_updated ) =
|
||
|
process_assets( $dbh2, \%assets, 1, $dry_run, $verbose );
|
||
|
}
|
||
|
|
||
|
printf "%-16s %d\n", "Episodes added:", $ep_count;
|
||
|
printf "%-16s %d\n", "Assets found:", $asset_count;
|
||
|
printf "%-16s %d\n", "Assets added:", coalesce( $assets_added, 0 );
|
||
|
printf "%-16s %d\n", "Assets deleted:", coalesce( $assets_deleted, 0 );
|
||
|
printf "%-16s %d\n", "Assets updated:", coalesce( $assets_updated, 0 );
|
||
|
|
||
|
$log->info("Episodes added: $ep_count");
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: process_assets
|
||
|
# PURPOSE: Given a collection of asset details obtained by searching
|
||
|
# notes for a given range of episodes determine whether they
|
||
|
# need to be added to the SQLite database.
|
||
|
# PARAMETERS: $dbh SQLite database handle
|
||
|
# $assets hashref pointing to the collected assets which
|
||
|
# have been found by parsing the notes of all
|
||
|
# relevant shows prior to calling this routine.
|
||
|
# $new Boolean flag to indicate that we're dealing with a new
|
||
|
# show or updating an old one
|
||
|
# $dry_run Boolean dry run setting
|
||
|
# $verbose Boolean verbose setting
|
||
|
# RETURNS: Number of assets added
|
||
|
# DESCRIPTION: The %assets hash has been accumulated by scanning all the
|
||
|
# notes belonging to new episodes found in the MySQL database.
|
||
|
# The hash is keyed by the episode number and it is scanned in
|
||
|
# sorted order of the key. The SQLite database is queried for
|
||
|
# assets with this key and the result held in a hash keyed by
|
||
|
# the filename. The value of each key in the %assets hash is an
|
||
|
# array of hashes holding the asset details, so we loop through
|
||
|
# these. If the filename of a particular asset hash is not found
|
||
|
# in the hash from the database then we create a new row in the
|
||
|
# 'assets' table (assuming that we're not in dry run mode where
|
||
|
# we simply report what we would have done). If any assets were
|
||
|
# added for an episode then we set the flag 'has_files' (unless
|
||
|
# in dry run mode). The total number of assets added to the
|
||
|
# database is returned by the function.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub process_assets {
|
||
|
my ( $dbh, $assets, $new, $dry_run, $verbose ) = @_;
|
||
|
|
||
|
my ( $sql, $sth, $db_assets, $key_field, $db_count );
|
||
|
my ( $asset_count, $assets_added, $assets_deleted, $assets_updated );
|
||
|
|
||
|
$key_field = 'id';
|
||
|
|
||
|
#
|
||
|
# We should only be called if there are assets, but just in case...
|
||
|
#
|
||
|
return ( 0, 0, 0 ) unless ($assets);
|
||
|
|
||
|
#
|
||
|
# SQL to look for pre-existing assets in the SQLite database
|
||
|
#
|
||
|
$sql = q{
|
||
|
SELECT * FROM assets WHERE episode_id = ?
|
||
|
};
|
||
|
$sth = $dbh->prepare($sql);
|
||
|
|
||
|
say 'Processing assets for ', ( $new ? 'new' : 'existing' ), ' show'
|
||
|
if $verbose;
|
||
|
|
||
|
#
|
||
|
# Use the %assets hash to populate the 'assets' table. The hash is keyed
|
||
|
# by episode numbers, and each entry consists of an array of hashes each
|
||
|
# containing the filename and the URL of the asset.
|
||
|
#
|
||
|
$assets_added = $assets_deleted = $assets_updated = 0;
|
||
|
foreach my $key ( sort( keys(%$assets) ) ) {
|
||
|
print "Checking episode $key\n" if $verbose;
|
||
|
|
||
|
#
|
||
|
# Count the assets for the current show
|
||
|
#
|
||
|
$asset_count = scalar( @{ $assets->{$key} } );
|
||
|
|
||
|
_debug( $DEBUG >= 2, "Episode: $key, Asset_count: $asset_count" );
|
||
|
|
||
|
#
|
||
|
# Query the SQLite database for the episode number we just got, looking
|
||
|
# for entries in the 'assets' table
|
||
|
#
|
||
|
$sth->execute($key);
|
||
|
if ( $dbh->err ) {
|
||
|
croak $dbh->errstr;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Grab everything as a hash keyed on the 'filename' field so we can
|
||
|
# check it easily. Count what we got (i.e. count the number of assets
|
||
|
# in SQLite for this episode).
|
||
|
# NOTE: trying the idea of indexing $db_assets by the 'id' field (in
|
||
|
# $key_field)
|
||
|
#
|
||
|
# $db_assets = $sth->fetchall_hashref('filename');
|
||
|
$db_assets = $sth->fetchall_hashref($key_field);
|
||
|
$db_count = scalar( keys(%$db_assets) );
|
||
|
|
||
|
_debug( $DEBUG >= 2, "Assets in ia.db:\n" . Dumper($db_assets) );
|
||
|
_debug( $DEBUG >= 2, "DB count: $db_count" );
|
||
|
|
||
|
#
|
||
|
# Compare the number of "real" assets with what SQLite holds.
|
||
|
#
|
||
|
if ( $asset_count > $db_count ) {
|
||
|
#
|
||
|
# We need to add assets. If it's a newly added show which has
|
||
|
# assets this will always be necessary. If it's an existing show
|
||
|
# then we're repairing an anomaly and want to say so.
|
||
|
#
|
||
|
if ($new) {
|
||
|
say "Adding assets" if $verbose;
|
||
|
}
|
||
|
else {
|
||
|
say "** Found more real assets than in DB -> adding";
|
||
|
}
|
||
|
$assets_added += insert_assets( $dbh, $key, $assets, $db_assets,
|
||
|
$dry_run, $verbose );
|
||
|
}
|
||
|
elsif ( $asset_count < $db_count ) {
|
||
|
#
|
||
|
# We need to delete assets. This can't happen for an existing
|
||
|
# show, so it's always a noteworthy event.
|
||
|
#
|
||
|
say "** Found fewer real assets than in DB -> deleting";
|
||
|
$assets_deleted += delete_assets( $dbh, $key, $assets, $db_assets,
|
||
|
$dry_run, $verbose );
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Look for field differences in what we have after the inserts and
|
||
|
# deletes
|
||
|
#
|
||
|
if ( $assets_added > 0 || $assets_deleted > 0 ) {
|
||
|
$sth->execute($key);
|
||
|
if ( $dbh->err ) {
|
||
|
croak $dbh->errstr;
|
||
|
}
|
||
|
$db_assets = $sth->fetchall_hashref($key_field);
|
||
|
}
|
||
|
my @results
|
||
|
= synchronise_found_assets( $dbh, $key, $assets, $db_assets,
|
||
|
$dry_run, $verbose );
|
||
|
|
||
|
$assets_added += $results[0];
|
||
|
$assets_deleted += $results[1];
|
||
|
$assets_updated += $results[2];
|
||
|
|
||
|
_debug( $DEBUG >= 4, '@results ' . Dumper( \@results ) );
|
||
|
|
||
|
#
|
||
|
# If there are assets then we need to ensure the 'has_files' setting
|
||
|
# reflects it (unless in dry run mode)
|
||
|
#
|
||
|
if ( $asset_count > 0 ) {
|
||
|
unless ($dry_run) {
|
||
|
$dbh->do(
|
||
|
q{
|
||
|
UPDATE episodes
|
||
|
SET has_files = 1
|
||
|
WHERE id = ?
|
||
|
AND has_files = 0
|
||
|
},
|
||
|
undef,
|
||
|
$key,
|
||
|
) or die $dbh->errstr;
|
||
|
}
|
||
|
else {
|
||
|
say "Would have updated the 'episodes.has_files' ",
|
||
|
"field for $key (if needed)";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
# $assets_added += $asset_count;
|
||
|
|
||
|
}
|
||
|
|
||
|
return ( $assets_added, $assets_deleted, $assets_updated );
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: synchronise_found_assets
|
||
|
# PURPOSE: Compares the assets found by searching the shownotes (and
|
||
|
# other files) with those in the SQLite database and updates
|
||
|
# what differs.
|
||
|
# PARAMETERS: $dbh SQLite database handle
|
||
|
# $episode Numeric episode
|
||
|
# $assets Hashref pointing to the collected assets which
|
||
|
# have been found by parsing the show's notes;
|
||
|
# keyed by episode number. Each keyed object is
|
||
|
# an arrayref of hashes containing the URL and
|
||
|
# filename of the asset.
|
||
|
# $db_assets Hash of assets from SQLite, keyed by filename.
|
||
|
# Each keyed object is a hash of columns from
|
||
|
# the database: id, episode_id, URL, filename,
|
||
|
# uploaded.
|
||
|
# $dry_run Boolean dry run setting
|
||
|
# $verbose Boolean verbose setting
|
||
|
# RETURNS: Number of additions, deletions and updates
|
||
|
# DESCRIPTION: The assets for this episode have been found by searching the
|
||
|
# HTML of the show notes and any linked HTML files for files on
|
||
|
# the HPR server. The equivalent assets have also been
|
||
|
# collected from the SQLite database. A comparison is made
|
||
|
# between these items and if they differ the definitive details
|
||
|
# collected from the MySQL database are used. There are three
|
||
|
# actions possible: (1) if the URL or filename fields differ
|
||
|
# then we can work out the update and apply it, (2) if the
|
||
|
# incoming assets don't match any in those from the database
|
||
|
# then we can delete the database ones, and (3) we can add the
|
||
|
# incoming assets we also couldn't match because they'll be the
|
||
|
# replecements! For all actions changes are made to the
|
||
|
# versions in the SQLite database using SQL::Abstract to
|
||
|
# construct the queries to do it.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: TODO: There may be multiple asset changes for the episode.
|
||
|
# Does the SQL::Abstract call perform the multiple changes?
|
||
|
# Needs testing.
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub synchronise_found_assets {
|
||
|
my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;
|
||
|
|
||
|
my ( $sth, $additions, $deletions, $updates, $update_count, $index);
|
||
|
my ( @fieldvals, @where, @db_rows, @asset_rows, %matches, %changes );
|
||
|
|
||
|
$additions = $deletions = $updates = 0;
|
||
|
$update_count = 0;
|
||
|
|
||
|
#
|
||
|
# Temporary?
|
||
|
#
|
||
|
if ( $DEBUG >= 4 ) {
|
||
|
_debug( 1, 'In subroutine synchronise_found_assets' );
|
||
|
_debug( 1, "\$episode: $episode" );
|
||
|
_debug( 1, '%$assets: ' . Dumper($assets) );
|
||
|
_debug( 1, '%$db_assets: ' . Dumper($db_assets) );
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 4, 'Looking for assets that match - to exclude them' );
|
||
|
|
||
|
@db_rows = ( keys(%$db_assets) );
|
||
|
@asset_rows = ( 0 .. scalar( @{ $assets->{$episode} } ) - 1 );
|
||
|
_debug( $DEBUG >= 4, 'Initial @db_rows ' . Dumper( \@db_rows ) );
|
||
|
_debug( $DEBUG >= 4, 'Initial @asset_rows ' . Dumper( \@asset_rows ) );
|
||
|
|
||
|
#
|
||
|
# Find any asset pair matches
|
||
|
#
|
||
|
$index = 0;
|
||
|
foreach my $db_key ( keys(%$db_assets) ) {
|
||
|
foreach my $a_obj ( @{ $assets->{$episode} } ) {
|
||
|
if ( assets_match( $a_obj, $db_assets->{$db_key} ) == 2 ) {
|
||
|
$matches{$db_key} = [ $a_obj, $db_assets->{$db_key} ];
|
||
|
# splice( @asset_rows, $index, 1 );
|
||
|
@asset_rows = grep { $_ != $index } @asset_rows;
|
||
|
$index++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
_debug( $DEBUG >= 4, 'Assets to ignore: %matches ' . Dumper( \%matches ) );
|
||
|
_debug( $DEBUG >= 4, 'Current @asset_rows ' . Dumper( \@asset_rows ) );
|
||
|
|
||
|
#
|
||
|
# If everything matched then there's nothing to do, so return zero and
|
||
|
# leave.
|
||
|
#
|
||
|
# if (scalar(@db_rows) == scalar( keys(%matches) )) {
|
||
|
if ( scalar(@asset_rows) == 0 ) {
|
||
|
_debug( $DEBUG >= 4, 'All rows match, so nothing to do' );
|
||
|
return ( 0, 0, 0 );
|
||
|
}
|
||
|
_debug( $DEBUG >= 4, 'Some rows do not match' );
|
||
|
|
||
|
#
|
||
|
# Remove the row ids for any matches
|
||
|
#
|
||
|
@db_rows = grep { !exists( $matches{$_} ) } @db_rows;
|
||
|
_debug( $DEBUG >= 4, '@db_rows after matches ' . Dumper( \@db_rows ) );
|
||
|
_debug( $DEBUG >= 4, '@asset_rows after matches ' . Dumper( \@asset_rows ) );
|
||
|
|
||
|
_debug( $DEBUG >= 4, 'Looking for assets that match in only one attribute' );
|
||
|
|
||
|
#
|
||
|
# Find any cases where only one element of the asset object matches. This
|
||
|
# means we have a simple change.
|
||
|
#
|
||
|
$index = 0;
|
||
|
foreach my $db_key (@db_rows) {
|
||
|
foreach my $a_obj ( @{ $assets->{$episode} } ) {
|
||
|
if ( assets_match( $a_obj, $db_assets->{$db_key} ) == 1 ) {
|
||
|
$changes{$db_key} = [ $a_obj, $db_assets->{$db_key} ];
|
||
|
@asset_rows = grep { $_ != $index } @asset_rows;
|
||
|
$index++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
_debug( $DEBUG >= 4, 'Assets to change: %changes ' . Dumper( \%changes ) );
|
||
|
_debug( $DEBUG >= 4, 'Current @asset_rows ' . Dumper( \@asset_rows ) );
|
||
|
|
||
|
#
|
||
|
# Remove the row ids for any changes
|
||
|
#
|
||
|
@db_rows = grep { !exists( $changes{$_} ) } @db_rows;
|
||
|
_debug( $DEBUG >= 4, '@db_rows after changes ' . Dumper( \@db_rows ) );
|
||
|
_debug( $DEBUG >= 4, '@asset_rows after changes ' . Dumper( \@asset_rows ) );
|
||
|
|
||
|
#
|
||
|
# If there are changes, do them now
|
||
|
#
|
||
|
if (%changes) {
|
||
|
_debug( $DEBUG >= 4, 'Collecting changes' );
|
||
|
|
||
|
#
|
||
|
# We may have multiple database rows to process
|
||
|
#
|
||
|
$index = 0;
|
||
|
|
||
|
foreach my $key ( keys(%changes) ) {
|
||
|
$fieldvals[$index] = {};
|
||
|
$where[$index] = { episode_id => $episode };
|
||
|
|
||
|
#
|
||
|
# Compare the fields we're interested in, accumulating differences
|
||
|
# (expecting only one per key).
|
||
|
#
|
||
|
foreach my $fld ( 'filename', 'URL' ) {
|
||
|
if (!equal(
|
||
|
$changes{$key}->[0]->{$fld},
|
||
|
$changes{$key}->[1]->{$fld}
|
||
|
)
|
||
|
)
|
||
|
{
|
||
|
$update_count++;
|
||
|
$fieldvals[$index]->{$fld} = $changes{$key}->[0]->{$fld};
|
||
|
}
|
||
|
}
|
||
|
|
||
|
$where[$index]->{id} = $key;
|
||
|
|
||
|
$index++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 2, '@fieldvals: ' . Dumper( \@fieldvals ) );
|
||
|
_debug( $DEBUG >= 2, '@where ' . Dumper( \@where ) );
|
||
|
|
||
|
if ( $update_count > 0 ) {
|
||
|
_debug( $DEBUG >= 4, 'Performing changes' );
|
||
|
#
|
||
|
# Prepare to use SQL::Abstract
|
||
|
#
|
||
|
my $sql = SQL::Abstract->new();
|
||
|
|
||
|
foreach my $fv (@fieldvals) {
|
||
|
my ( $stmt, @bindvals )
|
||
|
= $sql->update( 'assets', $fv, shift(@where) );
|
||
|
|
||
|
#
|
||
|
# Perform the updates (unless in dry-run mode)
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
$sth = $dbh->prepare($stmt);
|
||
|
$sth->execute(@bindvals);
|
||
|
if ( $dbh->err ) {
|
||
|
warn "Processing $episode\n", $dbh->errstr;
|
||
|
}
|
||
|
else {
|
||
|
print "Updated ", join( ',', keys(%$fv) ), "\n"
|
||
|
if $verbose;
|
||
|
$updates++;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
print "Dry run mode:\n";
|
||
|
print "Statement: $stmt\n";
|
||
|
print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# If anything is left in @db_rows then they will be replacements
|
||
|
#
|
||
|
if ( scalar(@db_rows) > 0 ) {
|
||
|
say "Replacements found: ", scalar(@db_rows);
|
||
|
|
||
|
#
|
||
|
# Prepare to use SQL::Abstract
|
||
|
#
|
||
|
my $sql = SQL::Abstract->new();
|
||
|
|
||
|
#
|
||
|
# Every row which came from the database is to be deleted
|
||
|
#
|
||
|
$index = 0;
|
||
|
foreach my $db_key (@db_rows) {
|
||
|
$where[$index] = { episode_id => $episode };
|
||
|
$where[$index]->{id} = $db_key;
|
||
|
$index++;
|
||
|
}
|
||
|
_debug( $DEBUG >= 2, '@where ' . Dumper( \@where ) );
|
||
|
|
||
|
#
|
||
|
# Do deletions
|
||
|
#
|
||
|
foreach my $db_key (@db_rows) {
|
||
|
my ( $stmt, @bindvals )
|
||
|
= $sql->delete( 'assets', shift(@where) );
|
||
|
|
||
|
#
|
||
|
# Perform the deletions (unless in dry-run mode)
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
$sth = $dbh->prepare($stmt);
|
||
|
$sth->execute(@bindvals);
|
||
|
if ( $dbh->err ) {
|
||
|
warn "Processing $episode\n", $dbh->errstr;
|
||
|
}
|
||
|
else {
|
||
|
print "Deleted ", join( ',', @db_rows ), "\n"
|
||
|
if $verbose;
|
||
|
$deletions++;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
print "Dry run mode:\n";
|
||
|
print "Statement: $stmt\n";
|
||
|
print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Every object in the found assets array for this episode is to be
|
||
|
# added.
|
||
|
#
|
||
|
foreach my $i (@asset_rows) {
|
||
|
my $obj = $assets->{$episode}->[$i];
|
||
|
$obj->{episode_id} = $episode;
|
||
|
|
||
|
my ( $stmt, @bindvals )
|
||
|
= $sql->insert( 'assets', $obj );
|
||
|
|
||
|
#
|
||
|
# Perform the additions (unless in dry-run mode)
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
$sth = $dbh->prepare($stmt);
|
||
|
$sth->execute(@bindvals);
|
||
|
if ( $dbh->err ) {
|
||
|
warn "Processing $episode\n", $dbh->errstr;
|
||
|
}
|
||
|
else {
|
||
|
print "Inserted array elements ", join( ',', @asset_rows ), "\n"
|
||
|
if $verbose;
|
||
|
$additions++;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
print "Dry run mode:\n";
|
||
|
print "Statement: $stmt\n";
|
||
|
print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
return ( $additions, $deletions, $updates );
|
||
|
}
|
||
|
|
||
|
################################################################################
|
||
|
#
|
||
|
# Look for database assets to be ignored. The outer loop iterates through
|
||
|
# the primary key values of $db_assets where the rows all match the
|
||
|
# $episode value. For each of these assets the next loop iterates through
|
||
|
# the array of assets in $assets->{$episode} comparing the fields we want
|
||
|
# to check in the next inner loop. If the checked fields are all equal
|
||
|
# then there will be nothing to do.
|
||
|
#
|
||
|
# foreach my $db_key ( keys(%$db_assets) ) {
|
||
|
# foreach my $a_obj ( @{ $assets->{$episode} } ) {
|
||
|
# $ignore = $delete = $update = 0;
|
||
|
# foreach my $fld ( 'filename', 'URL' ) {
|
||
|
# $ignore
|
||
|
# += equal( $db_assets->{$db_key}->{$fld}, $a_obj->{$fld} );
|
||
|
# $delete
|
||
|
# += !equal( $db_assets->{$db_key}->{$fld}, $a_obj->{$fld} );
|
||
|
# }
|
||
|
# push( @{ $row_actions{'ignore'} }, $db_assets->{$db_key}->{id} )
|
||
|
# if $ignore == 2;
|
||
|
# push( @{ $row_actions{'delete'} }, $db_assets->{$db_key}->{id} )
|
||
|
# if $delete == 2;
|
||
|
# }
|
||
|
# }
|
||
|
#
|
||
|
# _debug( $DEBUG >= 4, '%row_actions ' . Dumper( \%row_actions ) );
|
||
|
#
|
||
|
#
|
||
|
# Scan the array of hashes obtained from the shownotes for $episode and
|
||
|
# record things that need changing.
|
||
|
#
|
||
|
# $index = 0;
|
||
|
# $updates = 0;
|
||
|
# foreach my $obj ( @{ $assets->{$episode} } ) {
|
||
|
# $fieldvals[$index] = {};
|
||
|
#
|
||
|
# #
|
||
|
# # Compare the fields we're interested in
|
||
|
# #
|
||
|
# foreach my $fld ( 'filename', 'URL' ) {
|
||
|
# if ( !equal( $obj->{$fld}, $db_assets->{$obj->{filename}}->{$fld} ) ) {
|
||
|
# $updates++;
|
||
|
# $fieldvals[$index]->{$fld} = $obj->{$fld};
|
||
|
# }
|
||
|
# }
|
||
|
#
|
||
|
# # TODO
|
||
|
# $fieldvals[$index]->{id} = $db_assets->{$obj->{filename}}->{id};
|
||
|
#
|
||
|
# $index++;
|
||
|
# }
|
||
|
#
|
||
|
# _debug( $DEBUG >= 2, '@fieldvals: ' . Dumper( \@fieldvals ) );
|
||
|
#
|
||
|
# if ($updates > 0 ) {
|
||
|
# #
|
||
|
# # Prepare to use SQL::Abstract
|
||
|
# #
|
||
|
# my $sql = SQL::Abstract->new();
|
||
|
#
|
||
|
# $where{episode_id} = $episode;
|
||
|
#
|
||
|
# foreach my $fv (@fieldvals) {
|
||
|
# $where{id} = $fv->{id};
|
||
|
# delete($fv->{id});
|
||
|
#
|
||
|
# my ( $stmt, @bindvals )
|
||
|
# = $sql->update( 'assets', $fv, \%where );
|
||
|
#
|
||
|
# #
|
||
|
# # Perform the updates (unless in dry-run mode)
|
||
|
# #
|
||
|
# unless ($dry_run) {
|
||
|
# $sth = $dbh->prepare($stmt);
|
||
|
# $sth->execute(@bindvals);
|
||
|
# if ( $dbh->err ) {
|
||
|
# warn "Processing $episode\n", $dbh->errstr;
|
||
|
# }
|
||
|
# else {
|
||
|
# print "Updated ", join( ',', keys(%$fv) ), "\n"
|
||
|
# if $verbose;
|
||
|
# }
|
||
|
# }
|
||
|
# else {
|
||
|
# print "Dry run mode:\n";
|
||
|
# print "Statement: $stmt\n";
|
||
|
# print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
# }
|
||
|
# }
|
||
|
# }
|
||
|
#
|
||
|
################################################################################
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: insert_assets
|
||
|
# PURPOSE: Find missing assets and add them to SQLite
|
||
|
# PARAMETERS: $dbh SQLite database handle
|
||
|
# $episode Numeric episode
|
||
|
# $assets Hashref pointing to the collected assets which
|
||
|
# have been found by parsing the show's notes;
|
||
|
# keyed by episode number.
|
||
|
# $db_assets Hash of assets from SQLite, keyed by filename
|
||
|
# $dry_run Boolean dry run setting
|
||
|
# $verbose Boolean verbose setting
|
||
|
# RETURNS: Number of insertions
|
||
|
# DESCRIPTION: The assets for this episode have been found by searching the
|
||
|
# HTML of the show notes and any linked HTML files for files on
|
||
|
# the HPR server. The equivalent assets have also been
|
||
|
# collected from the SQLite database. The MySQL assets are
|
||
|
# reorganised by filename. For each file which does not already
|
||
|
# exist in the SQLite list details are prepared and passed to
|
||
|
# SQL::Abstract using the multi insert feature. The missing
|
||
|
# details are inserted into the assets table unless dry-run mode
|
||
|
# is on.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub insert_assets {
|
||
|
my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;
|
||
|
|
||
|
my ($sth, %db_by_file, %assets_by_file, $insertions, @multi_insert, %where);
|
||
|
|
||
|
#
|
||
|
# Reorder the %$db_assets hash in filename order
|
||
|
#
|
||
|
foreach my $key ( keys( %{$db_assets} ) ) {
|
||
|
$db_by_file{$db_assets->{$key}->{filename}} = $db_assets->{$key};
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Reorder the %assets hash in filename order
|
||
|
#
|
||
|
foreach my $obj ( @{ $assets->{$episode} } ) {
|
||
|
$assets_by_file{$obj->{filename}} = $obj;
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 2, "%db_by_file: " . Dumper( \%db_by_file ) );
|
||
|
_debug( $DEBUG >= 2, "%assets_by_file: " . Dumper( \%assets_by_file ) );
|
||
|
|
||
|
#
|
||
|
# Find stuff for insertion. We need to build @multi_insert such that it
|
||
|
# contains a list of hashrefs, each containing the fields we want to
|
||
|
# insert.
|
||
|
# Note: We make a new %fv each time, fill it as a slice, and push it into
|
||
|
# the array @multi_insert. This may be rather obscure and therefore
|
||
|
# error-prone.
|
||
|
#
|
||
|
$insertions = 0;
|
||
|
foreach my $key ( keys(%assets_by_file) ) {
|
||
|
# if ( !exists( $db_assets->{$key} ) ) {
|
||
|
if ( !exists( $db_by_file{$key} ) ) {
|
||
|
my %fv;
|
||
|
$insertions++;
|
||
|
@fv{ 'episode_id', 'URL', 'filename' } = (
|
||
|
$episode, $assets_by_file{$key}->{URL},
|
||
|
$assets_by_file{$key}->{filename}
|
||
|
);
|
||
|
push( @multi_insert, \%fv );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 2, '@multi_insert: ' . Dumper( \@multi_insert ) );
|
||
|
|
||
|
if ($insertions > 0) {
|
||
|
#
|
||
|
# Prepare to use SQL::Abstract
|
||
|
#
|
||
|
my $sql = SQL::Abstract->new();
|
||
|
|
||
|
my ( $stmt, @bindvals )
|
||
|
= $sql->insert_multi( 'assets', \@multi_insert );
|
||
|
|
||
|
#
|
||
|
# Perform the insertions (unless in dry-run mode)
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
$sth = $dbh->prepare($stmt);
|
||
|
$sth->execute(@bindvals);
|
||
|
if ( $dbh->err ) {
|
||
|
warn "Processing $episode\n", $dbh->errstr;
|
||
|
}
|
||
|
else {
|
||
|
# print "Inserted ",join(',',keys(%where)),"\n" if $verbose;
|
||
|
print "Inserted $insertions assets\n" if $verbose;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
print "Dry run mode:\n";
|
||
|
print "Statement: $stmt\n";
|
||
|
print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return $insertions;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: delete_assets
|
||
|
# PURPOSE: Find superfluous assets for an episode in the 'assets' table
|
||
|
# and delete them.
|
||
|
# PARAMETERS: $dbh SQLite database handle
|
||
|
# $episode Numeric episode
|
||
|
# $assets Hashref pointing to the collected assets which
|
||
|
# have been found by parsing the show's notes;
|
||
|
# keyed by episode number.
|
||
|
# $db_assets Hash of assets from SQLite, keyed by filename
|
||
|
# $dry_run Boolean dry run setting
|
||
|
# $verbose Boolean verbose setting
|
||
|
# RETURNS: Number of deletions
|
||
|
# DESCRIPTION: The assets for this episode have been found by searching the
|
||
|
# HTML of the show notes and any linked HTML files for files on
|
||
|
# the HPR server. The equivalent assets have also been
|
||
|
# collected from the SQLite database. The MySQL assets are
|
||
|
# reorganised by filename. For each file exists in the SQLite
|
||
|
# list details are prepared and passed to SQL::Abstract. The
|
||
|
# unwanted details are deleted from the assets table unless
|
||
|
# dry-run mode is on.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub delete_assets {
|
||
|
my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;
|
||
|
|
||
|
my ($sth, %db_by_file, %assets_by_file, $deletions, %where);
|
||
|
|
||
|
#
|
||
|
# Reorder the %$db_assets hash in filename order
|
||
|
#
|
||
|
foreach my $key ( keys( %{$db_assets} ) ) {
|
||
|
$db_by_file{$db_assets->{$key}->{filename}} = $db_assets->{$key};
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Reorder the %assets hash in filename order
|
||
|
#
|
||
|
foreach my $obj ( @{ $assets->{$episode} } ) {
|
||
|
$assets_by_file{$obj->{filename}} = $obj;
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 2, "%db_by_file: " . Dumper( \%db_by_file ) );
|
||
|
_debug( $DEBUG >= 2, "%assets_by_file: " . Dumper( \%assets_by_file ) );
|
||
|
|
||
|
#
|
||
|
# Only delete assets with this value in the 'episode_id' field
|
||
|
#
|
||
|
$where{episode_id} = $episode;
|
||
|
|
||
|
#
|
||
|
# Find stuff for deletion. We need to build %where such that it generates
|
||
|
# 'WHERE filename = <value1> OR filename = <value2> ...', so we add these
|
||
|
# values to an (anonymous) array.
|
||
|
#
|
||
|
$deletions = 0;
|
||
|
foreach my $key ( keys(%$db_assets) ) {
|
||
|
# if ( !exists( $assets_by_file{$key} ) ) {
|
||
|
if ( !exists( $db_by_file{$key} ) ) {
|
||
|
$deletions++;
|
||
|
if ( !exists( $where{filename} ) ) {
|
||
|
$where{filename} = [$key];
|
||
|
}
|
||
|
else {
|
||
|
push( @{ $where{filename} }, $key );
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 2, "%where: " . Dumper( \%where ) );
|
||
|
|
||
|
if ($deletions > 0) {
|
||
|
#
|
||
|
# Prepare to use SQL::Abstract
|
||
|
#
|
||
|
my $sql = SQL::Abstract->new();
|
||
|
|
||
|
my ( $stmt, @bindvals )
|
||
|
= $sql->delete( 'assets', \%where );
|
||
|
|
||
|
#
|
||
|
# Perform the deletions (unless in dry-run mode)
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
$sth = $dbh->prepare($stmt);
|
||
|
$sth->execute(@bindvals);
|
||
|
if ( $dbh->err ) {
|
||
|
warn "Processing $episode\n", $dbh->errstr;
|
||
|
}
|
||
|
else {
|
||
|
print "Deleted ",join(',',keys(%where)),"\n" if $verbose;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
print "Dry run mode:\n";
|
||
|
print "Statement: $stmt\n";
|
||
|
print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return $deletions;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: update_episode
|
||
|
# PURPOSE: Perform a database update for a specific episode
|
||
|
# PARAMETERS: $dbh1 Handle for the MariaDB database
|
||
|
# $dbh2 Handle for the SQLite database
|
||
|
# $episode Target episode to update
|
||
|
# $dry_run Boolean dry run setting
|
||
|
# $verbose Boolean verbose setting
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Performs an update for a single episode in the SQLite
|
||
|
# database. The usual scenario is after the Community News show
|
||
|
# has been posted, or a spelling mistake has been corrected for
|
||
|
# an episode. The episode is selected from the MySQL database
|
||
|
# and the SQLite one. The 'title' and 'summary' fields are
|
||
|
# compared across the two copies, and if they differ the
|
||
|
# necessary changes are set up to be executed or reported in
|
||
|
# 'dry run' mode.
|
||
|
# NOTE: No longer done this way:
|
||
|
# |A search for assets is made by reading the show notes and any
|
||
|
# |notes that are linked from these, and if there are
|
||
|
# |supplementary files they are accumulated and the database
|
||
|
# |updated as necessary (under control of the 'dry run' option).
|
||
|
# |This is currently the only way that the row in the SQLite
|
||
|
# |database can be updated after it has been loaded from the
|
||
|
# |MySQL database, other than when the metadata is collected and
|
||
|
# |applied. The metadata update process does not fix errors in
|
||
|
# |the 'title' or 'summary' fields.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub update_episode {
|
||
|
my ( $dbh1, $dbh2, $episode, $dry_run, $verbose ) = @_;
|
||
|
|
||
|
my ( $sql1, $sql2, $sth1, $h1, $sth2, $h2, $sth3 );
|
||
|
my ( %assets, $asset_count, $assets_added, $assets_deleted, $assets_updated );
|
||
|
my ( $id, $notes, $links );
|
||
|
my ( $updates, %fieldvals, %where );
|
||
|
|
||
|
#
|
||
|
# SQL to find new rows in the MySQL database
|
||
|
#
|
||
|
$sql1 = q{
|
||
|
SELECT
|
||
|
id,
|
||
|
date,
|
||
|
title,
|
||
|
summary,
|
||
|
notes
|
||
|
FROM eps
|
||
|
WHERE id = ?
|
||
|
};
|
||
|
|
||
|
#
|
||
|
# SQL to look for pre-existing episodes in the SQLite database
|
||
|
#
|
||
|
$sql2 = q{
|
||
|
SELECT * FROM episodes WHERE id = ?
|
||
|
};
|
||
|
|
||
|
#
|
||
|
# Prepare the SQL statements
|
||
|
#
|
||
|
$sth1 = $dbh1->prepare($sql1);
|
||
|
$sth2 = $dbh2->prepare($sql2);
|
||
|
|
||
|
#
|
||
|
# Query MySQL, for the $episode
|
||
|
#
|
||
|
$sth1->execute($episode);
|
||
|
if ( $dbh1->err ) {
|
||
|
croak $dbh1->errstr;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Loop through the MySQL row (should only be one)
|
||
|
#
|
||
|
$asset_count = $updates = 0;
|
||
|
while ( $h1 = $sth1->fetchrow_hashref ) {
|
||
|
#
|
||
|
# Query the SQLite database for the episode number we just got
|
||
|
#
|
||
|
$sth2->execute( $h1->{id} );
|
||
|
if ( $dbh2->err ) {
|
||
|
croak $dbh2->errstr;
|
||
|
}
|
||
|
$h2 = $sth2->fetchrow_hashref;
|
||
|
|
||
|
#
|
||
|
# If the row doesn't exist we have a problem.
|
||
|
# TODO: Could we just add the episode at this point?
|
||
|
#
|
||
|
unless ($h2) {
|
||
|
warn "Row $episode is not in the SQLite database\n";
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Look for title and summary changes and build a hash holding the
|
||
|
# changes
|
||
|
#
|
||
|
foreach my $fld ( 'title', 'summary' ) {
|
||
|
if ( !equal( $h1->{$fld}, $h2->{$fld} ) ) {
|
||
|
$updates++;
|
||
|
$fieldvals{$fld} = $h1->{$fld};
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ($updates > 0 ) {
|
||
|
#
|
||
|
# Prepare to use SQL::Abstract
|
||
|
#
|
||
|
my $sql = SQL::Abstract->new();
|
||
|
|
||
|
$where{id} = $episode;
|
||
|
my ( $stmt, @bindvals )
|
||
|
= $sql->update( 'episodes', \%fieldvals, \%where );
|
||
|
|
||
|
#
|
||
|
# Perform the updates (unless in dry-run mode)
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
$sth3 = $dbh2->prepare($stmt);
|
||
|
$sth3->execute(@bindvals);
|
||
|
if ( $dbh2->err ) {
|
||
|
warn "Processing $episode\n", $dbh2->errstr;
|
||
|
}
|
||
|
else {
|
||
|
print "Updated ",join(',',keys(%fieldvals)),"\n" if $verbose;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
print "Dry run mode:\n";
|
||
|
print "Statement: $stmt\n";
|
||
|
print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Check this show for "assets". Even in dry-run mode this gets
|
||
|
# executed
|
||
|
#
|
||
|
$links = 0;
|
||
|
|
||
|
$id = $h1->{id};
|
||
|
$notes = $h1->{notes};
|
||
|
# $links = find_links($id,$notes,\%assets);
|
||
|
$links = find_assets($ssh,$episode,$rempath,\%assets);
|
||
|
|
||
|
$asset_count += $links;
|
||
|
|
||
|
}
|
||
|
|
||
|
if ( $asset_count > 0 ) {
|
||
|
_debug( $DEBUG >= 2, "Scanned assets:\n" . Dumper( \%assets ) );
|
||
|
}
|
||
|
else {
|
||
|
_debug( $DEBUG >= 2, "No assets found" );
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Add any assets collected from the episode just processed
|
||
|
#
|
||
|
$assets_added = 0;
|
||
|
if ($asset_count > 0) {
|
||
|
( $assets_added, $assets_deleted, $assets_updated ) =
|
||
|
process_assets( $dbh2, \%assets, 0, $dry_run, $verbose );
|
||
|
}
|
||
|
|
||
|
printf "%-16s %d\n", "Episode checked:", $episode;
|
||
|
printf "%-16s %d\n", "Episode updates:", $updates;
|
||
|
printf "%-16s %d\n", "Assets found:", $asset_count;
|
||
|
printf "%-16s %d\n", "Assets added:", coalesce( $assets_added, 0 );
|
||
|
printf "%-16s %d\n", "Assets deleted:", coalesce( $assets_deleted, 0 );
|
||
|
printf "%-16s %d\n", "Assets updated:", coalesce( $assets_updated, 0 );
|
||
|
|
||
|
$log->info("Episode number: $episode");
|
||
|
$log->info("Episode updates: $updates");
|
||
|
$log->info( 'Assets added,deleted,updated:',
|
||
|
coalesce( $assets_added, 0 ), ',',
|
||
|
coalesce( $assets_deleted, 0 ), ',',
|
||
|
coalesce( $assets_updated, 0 )
|
||
|
);
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: load_metadata
|
||
|
# PURPOSE: Loads values from a JSON file which has been generated by the
|
||
|
# 'ia metadata' command and stores them in the 'episodes' table.
|
||
|
# Also uses the JSON data to determine if the derived files and
|
||
|
# the source file have been uploaded and which assets have been
|
||
|
# uploaded.
|
||
|
# PARAMETERS: $dbh Handle for the SQLite database
|
||
|
# $file Name of file of JSON data
|
||
|
# $dry_run Boolean dry run setting
|
||
|
# $verbose Boolean verbose setting
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: The input file is expected to consist of many JSON complex
|
||
|
# objects, generated by the command 'ia metadata <identifier>'
|
||
|
# for a list of HPR episodes (see script 'collect_show_data').
|
||
|
# A count is kept of successful updates, and this is reported
|
||
|
# once finished.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub load_metadata {
|
||
|
my ( $dbh, $file, $dry_run, $verbose ) = @_;
|
||
|
|
||
|
my ( $filebuffer, @jsonbuffer, %cols, %fieldvals, %where );
|
||
|
my ( @deletions, $index );
|
||
|
my ( @stash, @assets, $orig_count, $source, $epid, $count, $updates );
|
||
|
my ( $sth1, $sth2, $sth3, $rv, $h );
|
||
|
my $template = "https://archive.org/details/hpr%04d";
|
||
|
my @flds = (
|
||
|
'uploaded', 'has_files',
|
||
|
'with_files', 'with_derived',
|
||
|
'with_source', 'archive_date',
|
||
|
'item_last_updated', 'IA_URL',
|
||
|
);
|
||
|
my $re = qr{^hpr\d{4}\.(flac|mp3|ogg|opus|spx|wav)$};
|
||
|
|
||
|
#
|
||
|
# Read the entire JSON file into a buffer
|
||
|
#
|
||
|
try {
|
||
|
$filebuffer = read_text($file);
|
||
|
}
|
||
|
|
||
|
catch {
|
||
|
die "Failed to read JSON file $file\n";
|
||
|
};
|
||
|
|
||
|
#
|
||
|
# Parse the JSON from the buffer
|
||
|
#
|
||
|
my $json = JSON->new;
|
||
|
@jsonbuffer = $json->incr_parse($filebuffer);
|
||
|
|
||
|
#print Dumper(\@jsonbuffer),"\n";
|
||
|
|
||
|
#
|
||
|
# Check that this is the type of JSON we need
|
||
|
#
|
||
|
unless (@jsonbuffer) {
|
||
|
warn "Empty JSON?\n";
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Look at each top-level JSON object and check for missing components.
|
||
|
# It's possible for there to be missing objects, but only in very rare
|
||
|
# cases when a range has been requested yet there's a gap in that range
|
||
|
# (reserved slot perhaps). We accumulate indices of invalid objects in
|
||
|
# @jsonbuffer and then delete the invalid objects.
|
||
|
#
|
||
|
$index = 0;
|
||
|
foreach my $obj (@jsonbuffer) {
|
||
|
unless ( exists( $obj->{metadata} )
|
||
|
&& exists( $obj->{metadata}->{identifier} )
|
||
|
&& exists( $obj->{files} ))
|
||
|
{
|
||
|
warn "Invalid JSON? Skipped object #$index\n";
|
||
|
push( @deletions, $index );
|
||
|
next;
|
||
|
}
|
||
|
$index++;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Delete any bad objects we found in the previous pass
|
||
|
#
|
||
|
for my $ind (@deletions) {
|
||
|
splice( @jsonbuffer, $ind, 1 );
|
||
|
print STDERR "Deleted invalid item with index $ind\n" if $verbose;
|
||
|
}
|
||
|
print "\n" if $verbose;
|
||
|
|
||
|
#
|
||
|
# Prepare to get the database row relating to an episode
|
||
|
#
|
||
|
$sth1 = $dbh->prepare("SELECT * FROM episodes WHERE id = ?");
|
||
|
|
||
|
#
|
||
|
# Prepare to get the rows relating to assets for an episode
|
||
|
#
|
||
|
$sth2 = $dbh->prepare("SELECT * FROM assets WHERE episode_id = ?");
|
||
|
|
||
|
$count = 0;
|
||
|
foreach my $obj (@jsonbuffer) {
|
||
|
if ( $obj->{metadata}->{identifier} =~ /hpr(\d{4})/ ) {
|
||
|
$epid = $1 + 0;
|
||
|
|
||
|
#
|
||
|
# Skip the episodes in the range 1-620
|
||
|
#
|
||
|
#next if ( $epid <= 620 );
|
||
|
#
|
||
|
# 2021-06-27: We don't skip any more. Shows 1-620 have been
|
||
|
# uploaded in blocks, but we want them on the IA in the same
|
||
|
# format as all the others. Today we hit 620, working backwards
|
||
|
# from the earliest IA upload as a project to upload everything
|
||
|
# with show notes and assets. The existing database entries were
|
||
|
# changed to make the 'notes' field hold the text 'Block upload'
|
||
|
# followed by the block archive date and the URL. Then the rows
|
||
|
# between 1-620 are then to have the new date and URL and these
|
||
|
# columns have been made NULL in preparation. See the Journal for
|
||
|
# more information.
|
||
|
#
|
||
|
|
||
|
if ($verbose) {
|
||
|
print "Processing ", $obj->{metadata}->{identifier}, "\n";
|
||
|
print "Publicdate: ", $obj->{metadata}->{publicdate}, "\n";
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Process the files in the metadata
|
||
|
#
|
||
|
$orig_count = 0;
|
||
|
$source = 0;
|
||
|
undef(@stash);
|
||
|
foreach my $f ( @{ $obj->{files} } ) {
|
||
|
#
|
||
|
# Skip anything with a 'Metatda' format or which has a source
|
||
|
# of 'derivative' or 'metadata' or is the newly added
|
||
|
# '__ia_thumb.jpg'
|
||
|
#
|
||
|
next
|
||
|
if ( $f->{format} eq 'Metadata'
|
||
|
|| $f->{source} =~ /^(derivative|metadata)$/
|
||
|
|| $f->{name} eq '__ia_thumb.jpg'
|
||
|
|| $f->{name} =~ /^history\/files\// );
|
||
|
|
||
|
#
|
||
|
# Count audio files which have been "derived" (by us) prior to upload,
|
||
|
# and those that look like 'source' files (e.g.
|
||
|
# hpr2472_source.flac). Anything else store in @stash because
|
||
|
# it's probably an asset file.
|
||
|
#
|
||
|
if ( $f->{name} =~ $re ) {
|
||
|
$orig_count++;
|
||
|
}
|
||
|
elsif ( $f->{name} =~ /^hpr\d{4}_source\..+$/ ) {
|
||
|
$source++;
|
||
|
}
|
||
|
else {
|
||
|
push(@stash,$f->{name});
|
||
|
}
|
||
|
|
||
|
printf "%s %s\n", $f->{name}, $f->{source} if $verbose;
|
||
|
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Check the assets before updating the episode
|
||
|
#
|
||
|
@assets = collect_assets($dbh, $sth2, $epid);
|
||
|
_debug( $DEBUG >= 2, '@assets: ' . Dumper(\@assets));
|
||
|
_debug( $DEBUG >= 2, '@stash: ' . Dumper(\@stash));
|
||
|
|
||
|
if (@assets || @stash) {
|
||
|
@assets = sort(@assets);
|
||
|
@stash = sort(@stash);
|
||
|
|
||
|
#
|
||
|
# Smart match the @assets and @stash arrays to look for
|
||
|
# difference. In @assets we have what the database holds, and
|
||
|
# in @stash we have stored what we found in the JSON.
|
||
|
# NOTE: Now that Smartmatch ('~~') is deprecated we use the
|
||
|
# rather clever 'match::smart' with its strange |M| operator.
|
||
|
# It needs arrays being matched to be offered as references
|
||
|
# however.
|
||
|
#
|
||
|
# unless ( @assets ~~ @stash ) {
|
||
|
unless ( \@assets |M| \@stash ) {
|
||
|
print "Difference between DB assets and IA\n";
|
||
|
print "DB assets:\n ",join("\n ",@assets),"\n";
|
||
|
print "IA assets:\n ",join("\n ",@stash),"\n";
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# In live mode mark all assets belonging to this episode as
|
||
|
# 'uploaded'.
|
||
|
# TODO: Is this dangerous?
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
mark_assets_uploaded($dbh,$epid);
|
||
|
}
|
||
|
else {
|
||
|
print "Update of 'assets' table skipped; dry run\n";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Prepare to use SQL::Abstract
|
||
|
#
|
||
|
my $sql = SQL::Abstract->new();
|
||
|
|
||
|
#
|
||
|
# Get the current row for this episode for comparison to find what
|
||
|
# has been changed.
|
||
|
#
|
||
|
$rv = $sth1->execute($epid);
|
||
|
if ( $dbh->err ) {
|
||
|
warn $dbh->errstr;
|
||
|
}
|
||
|
|
||
|
$h = $sth1->fetchrow_hashref;
|
||
|
|
||
|
#
|
||
|
# Store the changes
|
||
|
#
|
||
|
$cols{'uploaded'} = 1;
|
||
|
$cols{'has_files'} = ( scalar(@stash) > 0 ? 1 : 0 );
|
||
|
$cols{'with_files'} = ( scalar(@stash) > 0 ? 1 : 0 );
|
||
|
$cols{'with_derived'} = ( $orig_count >= 2 ? 1 : 0 );
|
||
|
$cols{'with_source'} = ( $source > 0 ? 1 : 0 );
|
||
|
$cols{'archive_date'} = $obj->{metadata}->{publicdate};
|
||
|
$cols{'item_last_updated'} = coalesce($obj->{item_last_updated},0);
|
||
|
$cols{'IA_URL'} = sprintf( $template, $epid );
|
||
|
|
||
|
#
|
||
|
# Look for differences, storing the updated values in %fieldvals
|
||
|
# and counting the changes.
|
||
|
#
|
||
|
$updates = 0;
|
||
|
undef(%fieldvals);
|
||
|
foreach my $fld (@flds) {
|
||
|
if ( !equal( $h->{$fld}, $cols{$fld} ) ) {
|
||
|
$updates++;
|
||
|
$fieldvals{$fld} = $cols{$fld};
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ( $updates > 0 ) {
|
||
|
#
|
||
|
# Build the update statement containing changes. It will be:
|
||
|
# "UPDATE episodes SET a = b, c = d, ... WHERE id = $epid"
|
||
|
#
|
||
|
$where{id} = $epid;
|
||
|
my ( $stmt, @bindvals )
|
||
|
= $sql->update( 'episodes', \%fieldvals, \%where );
|
||
|
|
||
|
#
|
||
|
# Perform the updates (unless in dry-run mode)
|
||
|
#
|
||
|
unless ($dry_run) {
|
||
|
$sth3 = $dbh->prepare($stmt);
|
||
|
$sth3->execute(@bindvals);
|
||
|
if ( $dbh->err ) {
|
||
|
warn "Processing $epid\n", $dbh->errstr;
|
||
|
}
|
||
|
else {
|
||
|
print "Updated\n" if $verbose;
|
||
|
$count++;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
print "Dry run mode:\n";
|
||
|
print "Statement: $stmt\n";
|
||
|
print "Bindvals: ", join( ", ", @bindvals ), "\n";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
else {
|
||
|
printf "Unexpected identifier: %s\n", $obj->{metadata}->{identifier};
|
||
|
}
|
||
|
|
||
|
print '-' x 80, "\n" if $verbose;
|
||
|
}
|
||
|
|
||
|
print "Number of updates: $count\n";
|
||
|
$log->info("Number of updates from JSON: $count");
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: collect_assets
|
||
|
# PURPOSE: Collects asset names for a given episode
|
||
|
# PARAMETERS: $dbh Handle for the SQLite database
|
||
|
# $sth Statement handle for the prepared query
|
||
|
# $epid Episode id
|
||
|
# RETURNS: A list of the asset names
|
||
|
# DESCRIPTION: Just prepares a list of asset filenames by querying the SQLite
|
||
|
# 'assets' table and returns it to the caller
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub collect_assets {
|
||
|
my ( $dbh, $sth, $epid ) = @_;
|
||
|
|
||
|
my ( $rv, $h, @result );
|
||
|
|
||
|
#
|
||
|
# Query the database
|
||
|
#
|
||
|
$rv = $sth->execute($epid);
|
||
|
if ( $dbh->err ) {
|
||
|
warn $dbh->errstr;
|
||
|
}
|
||
|
|
||
|
while ($h = $sth->fetchrow_hashref) {
|
||
|
push(@result,$h->{filename});
|
||
|
}
|
||
|
|
||
|
return @result;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: mark_assets_uploaded
|
||
|
# PURPOSE: Marks the assets relating to an episode as 'uploaded'
|
||
|
# PARAMETERS: $dbh Handle for the SQLite database
|
||
|
# $epid Episode id
|
||
|
# RETURNS: Number of updates
|
||
|
# DESCRIPTION: Mark any assets for a given episode as uploaded, if they are
|
||
|
# not so marked. Expected to be used during an update from JSON
|
||
|
# data collected from the IA.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub mark_assets_uploaded {
|
||
|
my ( $dbh, $epid ) = @_;
|
||
|
|
||
|
my ($rows);
|
||
|
|
||
|
$rows = $dbh->do(
|
||
|
q{UPDATE assets SET uploaded = 1
|
||
|
WHERE episode_id = ? AND uploaded = 0},
|
||
|
undef,
|
||
|
$epid,
|
||
|
) or die $dbh->errstr;
|
||
|
|
||
|
$rows += 0;
|
||
|
|
||
|
return $rows;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: scan_dir
|
||
|
# PURPOSE: Scans a file from the HPR server generated with a simple
|
||
|
# 'find' command which lists all the files and directories under
|
||
|
# the ~/www/eps/ directory.
|
||
|
# PARAMETERS: $dbh Handle for the SQLite database
|
||
|
# $file Name of file of directory data
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Under development
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub scan_dir {
|
||
|
my ( $dbh, $file ) = @_;
|
||
|
|
||
|
my ( %dirtree, $ptr, %shows, $key, $id );
|
||
|
|
||
|
#
|
||
|
# Open the file of paths
|
||
|
#
|
||
|
open( my $fh, "<:encoding(UTF-8)", $file )
|
||
|
or die "Unable to open '$file'\n";
|
||
|
|
||
|
#
|
||
|
# Loop through all paths. If it's one of the audio files then skip it
|
||
|
#
|
||
|
while ( my $line = <$fh> ) {
|
||
|
chomp($line);
|
||
|
|
||
|
next if ( $line =~ /(flac|mp3|ogg|opus|spx|wav)$/ );
|
||
|
|
||
|
#
|
||
|
# Point into the tree hash and add the necessary nodes from the split
|
||
|
# path. The result is a hierarchical has structure where the 'leaf'
|
||
|
# node has an empty anonymous hash as its value.
|
||
|
#
|
||
|
# %dirtree = {
|
||
|
# 'www' => {
|
||
|
# 'eps' => {
|
||
|
# 'hpr2356' => {
|
||
|
# 'index.html' => {},
|
||
|
# 'fix-ssh-on-pi.bash.txt' => {}
|
||
|
# },
|
||
|
# },
|
||
|
# },
|
||
|
# }
|
||
|
#
|
||
|
$ptr = \%dirtree;
|
||
|
$ptr = $ptr->{$_} //= {} for split( /\//, $line );
|
||
|
|
||
|
}
|
||
|
|
||
|
close($fh);
|
||
|
|
||
|
#print Dumper(\%dirtree),"\n";
|
||
|
|
||
|
#
|
||
|
# Build a hash indexed by IA identifier containing an array per value with
|
||
|
# the files relating to the show therein
|
||
|
#
|
||
|
$ptr = \%dirtree;
|
||
|
# TODO if there are multiple items at a level which one to test
|
||
|
$key = ( keys(%$ptr) )[0];
|
||
|
#$key = first { defined($_) } keys(%$ptr);
|
||
|
until ( $key =~ /^hpr[0-9]{4}/ ) {
|
||
|
print "$key\n";
|
||
|
$ptr = $ptr->{$key};
|
||
|
$key = ( keys(%$ptr) )[0];
|
||
|
}
|
||
|
#print join(", ",keys(%$ptr));
|
||
|
|
||
|
# TODO Doesn't deal with sub-directories
|
||
|
foreach my $k ( keys(%$ptr) ) {
|
||
|
if ( ($id) = ( $k =~ /^(hpr[0-9]{4})/ ) ) {
|
||
|
unless ( defined( $shows{$id} ) ) {
|
||
|
$shows{$id} = [];
|
||
|
}
|
||
|
push( @{ $shows{$id} }, $k );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
print Dumper( \%shows ), "\n";
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: find_assets
|
||
|
# PURPOSE: An alternative way to look for assets; connect to the server
|
||
|
# with SSH and use a 'find' command for matching files and
|
||
|
# directories.
|
||
|
# PARAMETERS: $ssh Net::OpenSSH object previously set up to
|
||
|
# connect to the server
|
||
|
# $episode episode number we're dealing with
|
||
|
# $rempath path to search on the server
|
||
|
# $rfiles hashref to receive the files (assets) found
|
||
|
# RETURNS: Number of files found
|
||
|
# DESCRIPTION:
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub find_assets {
|
||
|
my ($ssh, $episode, $rempath, $rfiles) = @_;
|
||
|
|
||
|
my $urlbase = 'https://hackerpublicradio.org/eps/';
|
||
|
my (@result, @row, $URL, $file, $assetcount);
|
||
|
|
||
|
$rempath .= '/' unless ($rempath =~ /\/$/);
|
||
|
|
||
|
@result = $ssh->capture({ timeout => 10 },
|
||
|
"find $rempath -regextype egrep -regex '.*/hpr${episode}.*' -printf '%y,%p,%s\n'");
|
||
|
$ssh->error and warn "remote 'find' command failed: " . $ssh->error;
|
||
|
|
||
|
# d,/home/hpr/www/eps/hpr3775,85
|
||
|
# f,/home/hpr/www/eps/hpr3775/hpr3775_source.wav,75006546
|
||
|
# f,/home/hpr/www/eps/hpr3775/hpr3775.vtt,13127
|
||
|
# f,/home/hpr/www/eps/hpr3775/hpr3775.srt,13830
|
||
|
# f,/home/hpr/www/eps/hpr3775/hpr3775.txt,11119
|
||
|
# f,/home/hpr/www/eps/hpr3775.mp3,7449559
|
||
|
# f,/home/hpr/www/eps/hpr3775.ogg,10631998
|
||
|
# f,/home/hpr/www/eps/hpr3775.spx,3516736
|
||
|
|
||
|
$assetcount = 0;
|
||
|
$rfiles->{$episode} = [];
|
||
|
for my $file (@result) {
|
||
|
@row = split( ',', $file );
|
||
|
if ( $row[0] eq 'f' ) {
|
||
|
unless ( $row[1] =~ /$episode\.(mp3|ogg|spx)$/ ) {
|
||
|
( $URL = $row[1] ) =~ s/$rempath/$urlbase/;
|
||
|
( $file = $row[1] ) =~ s/$rempath//;
|
||
|
|
||
|
push(
|
||
|
@{ $rfiles->{$episode} },
|
||
|
{ filename => $file, URL => $URL }
|
||
|
);
|
||
|
|
||
|
$assetcount++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 2, "Results from 'find':\n" . Dumper( \@result ) );
|
||
|
|
||
|
#
|
||
|
# Return the asset count
|
||
|
#
|
||
|
return $assetcount;
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: find_links
|
||
|
# PURPOSE: Parses the HTML in a string for links so that a list of assets
|
||
|
# relating to this show can be made
|
||
|
# PARAMETERS: $episode episode number we're dealing with
|
||
|
# $html string containing HTML
|
||
|
# $rlinks hashref to receive the links found
|
||
|
# RETURNS: Number of links found
|
||
|
# DESCRIPTION: Given HTML from the main notes or a subsidiary file the
|
||
|
# function parses this looking for links in 'a' or 'img' tags.
|
||
|
# Links are standardised, making them absolute if relative and
|
||
|
# removing any 'fragment'. The links need to be to files on the
|
||
|
# HPR website to be of interest. If so the filename part is
|
||
|
# extracted. If it follows the format 'hpr9999' then it's
|
||
|
# checked to see if it's for the current show. If not it's
|
||
|
# ignored. If the filename ends with a '/' then it's assumed
|
||
|
# it's shorthand for 'index.html' so this name is appended. Then
|
||
|
# the link and filename are stashed in the hash referenced by
|
||
|
# $rlinks. If the filename ends with '.html' then we need to
|
||
|
# parse it in turn, so we get the contents of the link and
|
||
|
# recurse to parse it. We return the number of HPR links found
|
||
|
# in the pass through the HTML.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub find_links {
|
||
|
my ( $episode, $html, $rlinks ) = @_;
|
||
|
|
||
|
my ($tree, $epstr, $linkre, $re2, $filepath,
|
||
|
$uri, $slink, $linkcount, $content
|
||
|
);
|
||
|
|
||
|
_debug( $DEBUG >= 3, "find_links enter\n" );
|
||
|
|
||
|
#
|
||
|
# Create a tree object
|
||
|
#
|
||
|
$tree = HTML::TreeBuilder->new;
|
||
|
$tree->ignore_unknown(0);
|
||
|
$tree->no_expand_entities(1);
|
||
|
$tree->p_strict(1);
|
||
|
$tree->store_comments(1);
|
||
|
$tree->warn(1);
|
||
|
|
||
|
$tree->parse_content($html)
|
||
|
or die "HTML::TreeBuilder failed to parse notes: $!\n";
|
||
|
|
||
|
my $baseURL = "https://hackerpublicradio.org";
|
||
|
|
||
|
$epstr = sprintf( "hpr%04d", $episode );
|
||
|
$linkre = qr{
|
||
|
^https?://
|
||
|
(?:www.)?
|
||
|
(?:hacker|hobby)publicradio.org/eps/
|
||
|
(.+)$
|
||
|
}x;
|
||
|
#(?:(hpr$epstr/.+)|(hpr$epstr.+)|(.+))$
|
||
|
|
||
|
#
|
||
|
# Counting new links found and stashed
|
||
|
#
|
||
|
$linkcount = 0;
|
||
|
|
||
|
#
|
||
|
# Scan for links
|
||
|
#
|
||
|
for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
|
||
|
my ( $link, $element, $attr, $tag ) = @$_;
|
||
|
|
||
|
#
|
||
|
# Standardise the link (expands relative URLs, removes any fragment).
|
||
|
# Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
|
||
|
# URIs are removed.
|
||
|
#
|
||
|
local $URI::ABS_REMOTE_LEADING_DOTS = 1;
|
||
|
$uri = URI->new_abs( $link, $baseURL );
|
||
|
$slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );
|
||
|
|
||
|
#
|
||
|
# Is it an HPR link?
|
||
|
#
|
||
|
if ( $slink =~ $linkre ) {
|
||
|
#
|
||
|
# The URL we found might be a link into an HTML file with an
|
||
|
# '#anchor' component ("fragment"). Save the last bracketed match,
|
||
|
# without any 'fragment' if there is one to get a clean filename
|
||
|
# or path.
|
||
|
#
|
||
|
( $filepath = "$+" ) =~ s/#.*$//;
|
||
|
|
||
|
_debug( $DEBUG >= 3, "Link: $slink\n" );
|
||
|
_debug( $DEBUG >= 3, "File path: $filepath\n" );
|
||
|
|
||
|
#
|
||
|
# Does this file path begin with an 'hpr' prefix? If so is it the
|
||
|
# show id? If not we don't want to process it.
|
||
|
#
|
||
|
if ( $filepath =~ /^(hpr[0-9]{1,4})/ ) {
|
||
|
if ( $1 ne $epstr ) {
|
||
|
_debug( $DEBUG >= 3, "Ignored $slink\n" );
|
||
|
next;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# The path and URL might end with a slash which means the URL is
|
||
|
# relying on the Web server to fill in the filename as
|
||
|
# 'index.html'. We have to make this explicit.
|
||
|
#
|
||
|
if ( $slink =~ /\/$/ ) {
|
||
|
$slink .= 'index.html';
|
||
|
$filepath .= 'index.html';
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Initialise this hash element if needed
|
||
|
#
|
||
|
unless ( exists( $rlinks->{$episode} ) ) {
|
||
|
$rlinks->{$episode} = [];
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Stash this filename if it's not already stashed, and if it's
|
||
|
# HTML get the link and recurse
|
||
|
#
|
||
|
unless (
|
||
|
any { $_->{filename} eq $filepath } @{ $rlinks->{$episode} }
|
||
|
)
|
||
|
{
|
||
|
_debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" );
|
||
|
|
||
|
push(
|
||
|
@{ $rlinks->{$episode} },
|
||
|
{ filename => $filepath, URL => $slink }
|
||
|
);
|
||
|
$linkcount++;
|
||
|
|
||
|
#
|
||
|
# An HTML file has to be investigated
|
||
|
#
|
||
|
if ( $filepath =~ /\.html$/ ) {
|
||
|
$content = get($slink);
|
||
|
unless ( defined($content) ) {
|
||
|
carp "Link $slink returned nothing\n";
|
||
|
}
|
||
|
else {
|
||
|
$linkcount
|
||
|
+= find_links( $episode, $content, $rlinks );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
_debug( $DEBUG >= 3, "find_links exiting with $linkcount links\n" );
|
||
|
|
||
|
#
|
||
|
# Return the link count
|
||
|
#
|
||
|
return $linkcount;
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: count_rows
|
||
|
# PURPOSE: Counts the rows in a given SQLite table
|
||
|
# PARAMETERS: $dbh Handle for the SQLite database
|
||
|
# $table Name of table
|
||
|
# RETURNS: Number of rows
|
||
|
# DESCRIPTION: Builds SQL from a template including the selected $table.
|
||
|
# Returns the number of rows once the query has been run.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub count_rows {
|
||
|
my ( $dbh, $table ) = @_;
|
||
|
|
||
|
my ( $sql, $sth, $h, $count );
|
||
|
|
||
|
#
|
||
|
# Count rows in the selected SQLite table
|
||
|
#
|
||
|
$sql = "SELECT count(*) AS rowcount FROM $table";
|
||
|
$sth = $dbh->prepare($sql);
|
||
|
$sth->execute();
|
||
|
if ( $dbh->err ) {
|
||
|
croak $dbh->errstr;
|
||
|
}
|
||
|
|
||
|
$count = 0;
|
||
|
if ($h = $sth->fetchrow_hashref) {
|
||
|
$count = $h->{rowcount};
|
||
|
}
|
||
|
|
||
|
return $count;
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: coalesce
|
||
|
# PURPOSE: To find the first defined argument and return it
|
||
|
# PARAMETERS: Arbitrary number of arguments
|
||
|
# RETURNS: The first defined argument or undef if there are none
|
||
|
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
|
||
|
# returned when doing so might be a problem.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub coalesce {
|
||
|
foreach (@_) {
|
||
|
return $_ if defined($_);
|
||
|
}
|
||
|
return undef; ## no critic
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: assets_match
|
||
|
# PURPOSE: To compare found assets and those in the database
|
||
|
# PARAMETERS: $found_obj an asset object (hashref) found by parsing show
|
||
|
# notes
|
||
|
# $db_obj an asset object (hashref) from the database
|
||
|
# RETURNS: Value 1 or 2 if they match, otherwise zero
|
||
|
# DESCRIPTION:
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub assets_match {
|
||
|
my ( $found_obj, $db_obj ) = @_;
|
||
|
|
||
|
return 0 if ( !defined($found_obj) || !defined($db_obj) );
|
||
|
|
||
|
my ( $t1, $t2 ) = (
|
||
|
( $found_obj->{filename} eq $db_obj->{filename} ),
|
||
|
( $found_obj->{URL} eq $db_obj->{URL} )
|
||
|
);
|
||
|
|
||
|
return 2 if ( $t1 && $t2 );
|
||
|
return 1 if ( $t1 || $t2 );
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: equal
|
||
|
# PURPOSE: Compare two strings even if undefined
|
||
|
# PARAMETERS: $s1 The first string
|
||
|
# $s2 The second string
|
||
|
# RETURNS: True if both strings are undefined, false if one isn't
|
||
|
# defined, otherwise the result of comparing them.
|
||
|
# DESCRIPTION: Works on the principle that two undefined strings are equal,
|
||
|
# a defined and an undefined string are not, and otherwise they
|
||
|
# are equal if they are equal!
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub equal {
|
||
|
my ( $s1, $s2 ) = @_;
|
||
|
|
||
|
return 1 if ( !defined($s1) && !defined($s2) );
|
||
|
return 0 if ( !defined($s1) || !defined($s2) );
|
||
|
|
||
|
return ( $s1 eq $s2 );
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: _debug
|
||
|
# PURPOSE: Prints debug reports
|
||
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
||
|
# $message Message to print
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
|
||
|
# newline and then adds one in the 'print' to the caller doesn't
|
||
|
# have to bother. Prepends the message with 'D> ' to show it's
|
||
|
# a debug message.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub _debug {
|
||
|
my ( $active, $message ) = @_;
|
||
|
|
||
|
chomp($message);
|
||
|
print "D> $message\n" if $active;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: Usage
|
||
|
# PURPOSE: Displays a usage message and exit
|
||
|
# PARAMETERS: None
|
||
|
# RETURNS: To command line level with exit value 1
|
||
|
# DESCRIPTION: Builds the usage message using global values
|
||
|
# THROWS: no exceptions
|
||
|
# COMMENTS: none
|
||
|
# SEE ALSO: n/a
|
||
|
#===============================================================================
|
||
|
#sub Usage {
|
||
|
# print STDERR <<EOD;
|
||
|
#
|
||
|
#Usage: $PROG [options] directory
|
||
|
#
|
||
|
#$PROG v$VERSION
|
||
|
#
|
||
|
# -help Display this information
|
||
|
# -[no]dry-run Enable/disable dry run mode (default off)
|
||
|
# -[no]verbose Enable/disable verbose mode (default off)
|
||
|
# -debug=N Set the debug level
|
||
|
# -mode=MODE Selects the mode from the following list:
|
||
|
# - initialise: rebuild the SQLite database from MySQL
|
||
|
# (deletes any data already uploaded)
|
||
|
# - update: updates the SQLite database from updates
|
||
|
# found in the MySQL database
|
||
|
# - json: load the 'archive_date' field from a JSON
|
||
|
# file. Also sets the 'uploaded' flag and populates
|
||
|
# the 'IA_URL' field
|
||
|
# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
|
||
|
# - scandir: scans a file captured from the HPR server
|
||
|
# by running a 'find' command on the 'eps' directory
|
||
|
# where all the show files reside. This file is
|
||
|
# scanned to determine which shows have files other
|
||
|
# than the audio so we can keep a tally of which shows
|
||
|
# have had these uploaded to the IA
|
||
|
# ** Not fully implemented **
|
||
|
# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
|
||
|
# -episode=N1 [-episode=N2 ...]
|
||
|
# In -mode=update defines one or more single episodes to
|
||
|
# process otherwise, if this option is not given, all
|
||
|
# new episodes in the database are processed. The
|
||
|
# episodes are expected to have been already added if
|
||
|
# the option is used, and the 'title' and 'summary'
|
||
|
# fields and the assets (if any) are to be processed.
|
||
|
# -range=N1 [-range=N2]
|
||
|
# An alternative way of defining existing shows to be
|
||
|
# processed in -mode=update. The option can only be
|
||
|
# repeated twice. The smaller number defines the start
|
||
|
# of the range, and the larger one the end of the range.
|
||
|
# So -range=834 -range=840 defines the range 836-840. If
|
||
|
# one number is given (or two the same) this means the
|
||
|
# same as defining one episode. The episodes are
|
||
|
# expected to have been already added and the 'title'
|
||
|
# and 'summary' fields and the assets (if any) are to be
|
||
|
# processed.
|
||
|
# -json=FILE Defines the file to use for -mode=json
|
||
|
# -scandir=FILE Defines a file to use for -mode=scandir
|
||
|
# -dbconfig=FILE Defines an alternative MySQL configuration file
|
||
|
#
|
||
|
#EOD
|
||
|
# exit(1);
|
||
|
#}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: Options
|
||
|
# PURPOSE: Processes command-line options
|
||
|
# PARAMETERS: $optref Hash reference to hold the options
|
||
|
# RETURNS: Undef
|
||
|
# DESCRIPTION: Process the options we want to offer. See the documentation
|
||
|
# for details
|
||
|
# THROWS: no exceptions
|
||
|
# COMMENTS: none
|
||
|
# SEE ALSO: n/a
|
||
|
#===============================================================================
|
||
|
sub Options {
|
||
|
my ($optref) = @_;
|
||
|
|
||
|
my @options = (
|
||
|
"help", "documentation|man",
|
||
|
"debug=i", "mode=s",
|
||
|
"json=s", "scandir=s",
|
||
|
"episode=i@", "range=i@",
|
||
|
"dry-run!", "verbose!",
|
||
|
"dbconfig=s",
|
||
|
);
|
||
|
|
||
|
if ( !GetOptions( $optref, @options ) ) {
|
||
|
pod2usage(
|
||
|
-msg => "$PROG version $VERSION\n",
|
||
|
-exitval => 1,
|
||
|
-verbose => 0
|
||
|
);
|
||
|
}
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
__END__
|
||
|
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
# Application Documentation
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
#{{{
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
upload_manager - manages a SQLite database of details of shows on archive.org
|
||
|
|
||
|
=head1 VERSION
|
||
|
|
||
|
This documentation refers to upload_manager version 0.2.19
|
||
|
|
||
|
=head1 USAGE
|
||
|
|
||
|
./upload_manager [-help] [-documentation] [-[no]dry-run] [-[no]verbose]
|
||
|
[-debug=N] -mode=MODE [-episode=N1 [-episode=N2 ...]] [-range=START
|
||
|
[-range=END]] [-json=FILE] [-scandir=FILE] [-dbconfig=FILE]
|
||
|
|
||
|
|
||
|
=head1 OPTIONS
|
||
|
|
||
|
=over 8
|
||
|
|
||
|
=item B<-help>
|
||
|
|
||
|
Display a brief summary of how to use this script.
|
||
|
|
||
|
=item B<-documentation> or B<-man>
|
||
|
|
||
|
Display the entirety of the internal documentation in the form of a manual
|
||
|
page.
|
||
|
|
||
|
=item B<-[no]dry-run>
|
||
|
|
||
|
Enable/disable dry run mode (default off)
|
||
|
|
||
|
=item B<-[no]verbose>
|
||
|
|
||
|
Enable/disable verbose mode (default off)
|
||
|
|
||
|
=item B<-debug=N>
|
||
|
|
||
|
Sets the debug level:
|
||
|
|
||
|
=over 4
|
||
|
|
||
|
=item B<0>
|
||
|
|
||
|
This is the default value, and causes no debug output to be generated.
|
||
|
|
||
|
=item B<1>
|
||
|
|
||
|
TBA
|
||
|
|
||
|
=item B<2>
|
||
|
|
||
|
Produces quite a lot of asset-related output. All of the output from the lower debug levels is
|
||
|
displayed as well as:
|
||
|
|
||
|
=over 4
|
||
|
|
||
|
=item .
|
||
|
|
||
|
Contents of the %assets hash. This is generated when scanning a show in the
|
||
|
MySQL database in order to find any associated assets.
|
||
|
|
||
|
=item .
|
||
|
|
||
|
The episode number being checked and the number of assets found while
|
||
|
processing assets. A dump of the assets from the SQLite database is shown as
|
||
|
well as the count of assets found therre.
|
||
|
|
||
|
=item .
|
||
|
|
||
|
If there are assets related to a show and there are already some in the SQLite
|
||
|
database, a comparison is made to see if any updates are needed. These
|
||
|
differences are listed.
|
||
|
|
||
|
=item .
|
||
|
|
||
|
If new assets are being added, these are reported from the hash
|
||
|
%assets_by_file.
|
||
|
|
||
|
=item .
|
||
|
|
||
|
Assets are added using a multiple INSERT statement, and details of what is to
|
||
|
be inserted is shown in the array @multiple_insert.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=item B<3>
|
||
|
|
||
|
Produces the most output. All of the output from the lower debug levels is
|
||
|
displayed as well as:
|
||
|
|
||
|
=over 4
|
||
|
|
||
|
=item .
|
||
|
|
||
|
Contents of the @episodes array. This is generated by using the B<-episode=N>
|
||
|
option once or many times. It is shown before and after the removal of
|
||
|
duplicates.
|
||
|
|
||
|
=item .
|
||
|
|
||
|
Contents of the @range array. This is generated by using the B<-range=N>
|
||
|
option once or twice.
|
||
|
|
||
|
|
||
|
=back
|
||
|
|
||
|
=back
|
||
|
|
||
|
=item B<-mode=MODE>
|
||
|
|
||
|
Selects the mode of operation of the script from the following list:
|
||
|
|
||
|
=over 4
|
||
|
|
||
|
=item B<initialise>
|
||
|
|
||
|
Rebuilds the SQLite database from the MySQL data. Use with caution, though
|
||
|
the script will not initialise tables which contain data.
|
||
|
|
||
|
=item B<update>
|
||
|
|
||
|
Updates the SQLite database from updates found in the MySQL database. Exactly
|
||
|
how this is done is controlled by other options. See below for details.
|
||
|
|
||
|
=item B<json>
|
||
|
|
||
|
Load the 'archive_date' field from a JSON file. Also sets the 'uploaded' flag
|
||
|
and populates the 'IA_URL' and 'item_last_updated' fields.
|
||
|
|
||
|
=item B<scandir>
|
||
|
|
||
|
Scans a file captured from the HPR server by running a 'find' command on the
|
||
|
'eps' directory where all the show files reside. This file is scanned to
|
||
|
determine which shows have files other than the audio so we can keep a tally
|
||
|
of which shows have had these uploaded to the IA
|
||
|
|
||
|
** Not fully implemented; may be phased out **
|
||
|
|
||
|
=back
|
||
|
|
||
|
=item B<-episode=N1 [-episode=N2 ...]>
|
||
|
|
||
|
In B<-mode=update> defines one or more single episodes to process otherwise, if
|
||
|
this option is not given, all new episodes in the database are processed. The
|
||
|
episodes are expected to have been already added if this option is used, and
|
||
|
the 'title' and 'summary' fields and the assets (if any) are to be processed.
|
||
|
|
||
|
=item B<-range=N1 [-range=N2]>
|
||
|
|
||
|
An alternative way of defining existing shows to be processed in
|
||
|
B<-mode=update>. The option can only be repeated twice. The smaller number
|
||
|
defines the start of the range, and the larger one the end of the range. So
|
||
|
B<-range=834 -range=840> defines the range 836-840. If one number is given (or
|
||
|
two the same) this means the same as defining one episode. The episodes are
|
||
|
expected to have been already added and the 'title' and 'summary' fields and
|
||
|
the assets (if any) are to be processed.
|
||
|
|
||
|
=item B<-json=FILE>
|
||
|
|
||
|
Defines the file to use for B<-mode=json>. The contents of the file are
|
||
|
expected to be those produced by running B<collect_show_data>.
|
||
|
|
||
|
=item B<-scandir=FILE>
|
||
|
|
||
|
Defines a file to use for B<-mode=scandir>. This feature is not fully
|
||
|
implemented at the moment.
|
||
|
|
||
|
=item B<-dbconfig=FILE>
|
||
|
|
||
|
Defines an alternative MySQL configuration file
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head1 DESCRIPTION
|
||
|
|
||
|
A full description of the application and its features.
|
||
|
May include numerous subsections (i.e. =head2, =head3, etc.)
|
||
|
|
||
|
|
||
|
=head1 DIAGNOSTICS
|
||
|
|
||
|
A list of every error and warning message that the application can generate
|
||
|
(even the ones that will "never happen"), with a full explanation of each
|
||
|
problem, one or more likely causes, and any suggested remedies. If the
|
||
|
application generates exit status codes (e.g. under Unix) then list the exit
|
||
|
status associated with each error.
|
||
|
|
||
|
|
||
|
=head1 CONFIGURATION AND ENVIRONMENT
|
||
|
|
||
|
A full explanation of any configuration system(s) used by the application,
|
||
|
including the names and locations of any configuration files, and the
|
||
|
meaning of any environment variables or properties that can be set. These
|
||
|
descriptions must also include details of any configuration language used
|
||
|
|
||
|
|
||
|
=head1 DEPENDENCIES
|
||
|
|
||
|
A list of all the other modules that this module relies upon, including any
|
||
|
restrictions on versions, and an indication whether these required modules are
|
||
|
part of the standard Perl distribution, part of the module's distribution,
|
||
|
or must be installed separately.
|
||
|
|
||
|
|
||
|
=head1 INCOMPATIBILITIES
|
||
|
|
||
|
A list of any modules that this module cannot be used in conjunction with.
|
||
|
This may be due to name conflicts in the interface, or competition for
|
||
|
system or program resources, or due to internal limitations of Perl
|
||
|
(for example, many modules that use source code filters are mutually
|
||
|
incompatible).
|
||
|
|
||
|
|
||
|
=head1 BUGS AND LIMITATIONS
|
||
|
|
||
|
A list of known problems with the module, together with some indication
|
||
|
whether they are likely to be fixed in an upcoming release.
|
||
|
|
||
|
Also a list of restrictions on the features the module does provide:
|
||
|
data types that cannot be handled, performance issues and the circumstances
|
||
|
in which they may arise, practical limitations on the size of data sets,
|
||
|
special cases that are not (yet) handled, etc.
|
||
|
|
||
|
The initial template usually just has:
|
||
|
|
||
|
There are no known bugs in this module.
|
||
|
Please report problems to <Maintainer name(s)> (<contact address>)
|
||
|
Patches are welcome.
|
||
|
|
||
|
=head1 AUTHOR
|
||
|
|
||
|
<Author name(s)> (<contact address>)
|
||
|
|
||
|
|
||
|
=head1 LICENCE AND COPYRIGHT
|
||
|
|
||
|
Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
|
||
|
|
||
|
Followed by whatever licence you wish to release it under.
|
||
|
For Perl code that is often just:
|
||
|
|
||
|
This module is free software; you can redistribute it and/or
|
||
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
||
|
|
||
|
This program is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||
|
|
||
|
=cut
|
||
|
|
||
|
#}}}
|
||
|
|
||
|
# [zo to open fold, zc to close]
|
||
|
|
||
|
|
||
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
||
|
|