forked from HPR/hpr-tools
		
InternetArchive/future_upload: Added logging and debugging
InternetArchive/ia_db.sql: Added new tables
InternetArchive/recover_transcripts: New script to run on 'borg' and
    copy missing files from the backup disk to the IA
InternetArchive/repair_assets: More comments, including one about a bug in the design.
InternetArchive/repair_item: Fix relating to octal numbers (if there are
    leading zeroes in a number). '_DEBUG' is now in the function
    library. Added comments to explain obscure stuff.
InternetArchive/snapshot_metadata: New Bash script (to run on my
    desktop) which collects metadata for a show and stores it in the
    '~/HPR/IA/assets' directory. Runs 'view_derivatives' on it to find
    derivative files for deletion.
InternetArchive/tidy_uploaded: Moves files and directories containing
    uploaded files into a holding area for later backup. Added
    debugging, logging and a 'force' mode.
InternetArchive/upload_manager: Manages 'ia.db' (on my workstation); a
    sketch of its main tables appears after this list. Needs many updates,
    which have just started to be added.
InternetArchive/weekly_upload: Old script, now obsolete.
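
upload_manager works against two tables in the SQLite file 'ia.db'. As a quick
orientation, here is a minimal, hypothetical Perl/DBI sketch of those tables,
reconstructed from the column lists in the script's own comments (the
'episodes' columns mirror the comment in initialise_episodes; the 'assets'
columns are those referenced by the INSERT statements and the
synchronise_found_assets header). The key, type and default details are
assumptions; ia_db.sql remains the authoritative definition.

    # Hypothetical sketch only: column names come from comments in
    # upload_manager; types, keys and defaults are assumptions, and
    # ia_db.sql is the authoritative source.
    use strict;
    use warnings;
    use DBI;

    my $dbh = DBI->connect( "dbi:SQLite:dbname=$ENV{HOME}/HPR/IA/ia.db",
        "", "", { RaiseError => 1 } );

    $dbh->do(q{
        CREATE TABLE IF NOT EXISTS episodes (
            id                  integer PRIMARY KEY,
            rdate               date NOT NULL,
            title               varchar ( 100 ) NOT NULL,
            summary             varchar ( 100 ) NOT NULL,
            uploaded            integer DEFAULT 0,
            has_files           integer DEFAULT 0,
            with_files          integer DEFAULT 0,
            with_derived        integer DEFAULT 0,
            with_source         integer DEFAULT 0,
            archive_date        date,
            item_last_updated   integer,
            IA_URL              text,
            notes               text
        )
    });

    $dbh->do(q{
        CREATE TABLE IF NOT EXISTS assets (
            id          integer PRIMARY KEY,
            episode_id  integer REFERENCES episodes (id),
            URL         text,
            filename    text,
            uploaded    integer DEFAULT 0
        )
    });

    $dbh->disconnect;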
		
	
		
			
				
	
	
		
upload_manager: 2805 lines, 92 KiB, Perl, Executable File

#!/usr/bin/env perl
#===============================================================================
#
#         FILE: upload_manager
#
#        USAGE: ./upload_manager [-help] [-[no]dry-run] [-[no]verbose]
#               [-debug=N] -mode=MODE [-episode=N1 [-episode=N2 ...]]
#               [-range=START [-range=END]] [-json=FILE] [-scandir=FILE]
#               [-dbconfig=FILE]
#
#  DESCRIPTION: Collect details about HPR shows and the copies on Archive.org
#               and keep them in a separate SQLite database (for
#               the moment). This database is used to populate the
#               developing PostgreSQL Database (which is not currently
#               progressing very much).
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: 2022-02-15 Had to revert to DBD::MySQL temporarily.
#               2023-06-12 This code is no longer valid. It expects the main
#               HTML to point to files on the server (if there are "assets")
#               and recurses through extra HTML it finds there. It currently
#               looks for file details in any sub-directory "/eps/hpr1234/".
#               Now there are no files, so the details of all assets
#               (including the audio) need to be in the 'assets' table. Not
#               sure we are there yet.
#               2024-03-08 Smartmatch is deprecated; moved to 'match::smart'.
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.2.19
#      CREATED: 2017-06-14 10:50:28
#     REVISION: 2024-03-08 20:49:51
#
#===============================================================================

use v5.36;
use strict;
use warnings;
use utf8;
#use experimental 'smartmatch';
# TODO: use experimental::try;

use match::smart;

use Carp;
use Getopt::Long;
use Pod::Usage;

use Config::General;
use File::Slurper qw{ read_text read_lines };
use JSON;
use Net::OpenSSH;
#use TryCatch; # Broke in late May 2020 due to a problem with Devel::Declare
use Try::Tiny;
use Log::Handler;

use HTML::TreeBuilder 5 -weak;
use HTML::Entities;
use List::Util qw{ min max };
use List::MoreUtils qw{ any };
use LWP::Simple;

use DBI;
use SQL::Abstract;
use SQL::Abstract::Plugin::InsertMulti;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.2.19';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
#( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
#$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir       = "$ENV{HOME}/HPR/IA";
my $logdir        = "$basedir/logs";
my $logfile       = "$logdir/${PROG}.log";
my $configfile    = "$basedir/.$PROG.cfg";
my $db1configfile = "$basedir/.hpr_db.cfg";
my $database2     = "$basedir/ia.db";

my ( $dbh1, $dbh2, $sth1, $h1 );

#
# NOTE: Program config not currently used
#
die "Configuration file $configfile not found\n" unless ( -e $configfile );

my $conf = Config::General->new(
    -ConfigFile      => $configfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %config = $conf->getall();

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Options
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

#
# Options and arguments
#
my %options;
Options( \%options );

#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );

my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );

my $db1cfgfile
    = ( defined( $options{dbconfig} ) ? $options{dbconfig} : $db1configfile );

my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );

my $mode = $options{mode};
die "Mode is required\n" unless defined($mode);
$mode = lc($mode);

my @episodes;
@episodes = @{ $options{episode} } if ( defined( $options{episode} ) );
if ($mode ne 'update' and @episodes) {
    die "The -episode=N options are only for use in update mode\n";
}

my @range;
@range = @{ $options{range} } if ( defined( $options{range} ) );
if ($mode ne 'update' and @range) {
    die "The -range=N options are only for use in update mode\n";
}

_debug( $DEBUG >= 3, '@episodes: ' . Dumper( \@episodes ) );
_debug( $DEBUG >= 3, '@range: ' . Dumper( \@range ) );

#
# Check -episode=N versus -range=N. We can't have both
#
die "Choose only -episode=N or -range=N, not both\n"
    if ( defined( $options{episode} ) && defined( $options{range} ) );

#
# Only two range values
#
die "-range=N should not be used more than twice\n" if ( scalar(@range) > 2 );

#
# Turn the range into a list of episodes
#
if (defined($options{range})) {
    @episodes = ( min(@range) .. max(@range) );
}

@episodes = map { sprintf('%04d',$_)} @episodes;

_debug( $DEBUG >= 3, '@episodes: ' . Dumper(\@episodes) );

die "Too many episodes requested\n" if (scalar(@episodes) > 20);

my $json = $options{json};
if ( $mode eq 'json' ) {
    die "JSON file is required\n" unless defined($json);
    die "Unable to find file '$json'\n" unless -e $json;
}

my $dirfile = $options{scandir};
if ( $mode eq 'scandir' ) {
    die "Directory listing file is required\n" unless defined($dirfile);
    die "Unable to find file '$dirfile'\n" unless -e $dirfile;
}

die "Configuration file $db1cfgfile not found\n" unless ( -e $db1cfgfile );

#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date. The format
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m' is
# the message.
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

$log->add(
    file => {
        timeformat => "%Y-%m-%d %H:%M:%S",
        filename   => $logfile,
        minlevel   => 0,
        maxlevel   => 7,
    }
);

#-------------------------------------------------------------------------------
# Load MySQL configuration data
#-------------------------------------------------------------------------------
my $db1conf = Config::General->new(
    -ConfigFile      => $db1cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %db1cfg = $db1conf->getall();

if ($verbose) {
    print "MySQL database configuration chosen: $db1cfgfile\n";
    print "Dry run\n" if $dry_run;
}

#-------------------------------------------------------------------------------
# Connect to the MariaDB database [reverted to mysql temporarily]
#-------------------------------------------------------------------------------
my $dbhost = $db1cfg{database}->{host} // '127.0.0.1';
my $dbport = $db1cfg{database}->{port} // 3306;
my $dbname = $db1cfg{database}->{name};
my $dbuser = $db1cfg{database}->{user};
my $dbpwd  = $db1cfg{database}->{password};
#$dbh1 = DBI->connect( "DBI:MariaDB:database=$dbname;host=$dbhost;port=$dbport",
#    $dbuser, $dbpwd, { AutoCommit => 1 } )
#    or croak $DBI::errstr;
$dbh1 = DBI->connect( "DBI:mysql:database=$dbname;host=$dbhost;port=$dbport",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or croak $DBI::errstr;

#
# Enable client-side UTF8 [temporary]
#
$dbh1->{mysql_enable_utf8} = 1;

#-------------------------------------------------------------------------------
# Connect to the SQLite database
#-------------------------------------------------------------------------------
$dbh2 = DBI->connect( "dbi:SQLite:dbname=$database2", "", "" );

#
# Log the starting conditions
#
$log->info("Starting $PROG version $VERSION");
if (@episodes) {
    $log->info('Episodes:', join( ", ", @episodes ) );
}
else {
    $log->info('Looking for new episodes');
}
$log->info("Mode: $mode");
if ($dry_run) {
    $log->info('Dry run: Yes');
}


#-------------------------------------------------------------------------------
# Set up for connecting to the server with SSH
#-------------------------------------------------------------------------------
my $remhost = 'hackerpublicradio.org';
#my $remport = 22074;
my $remport = 22;
my $remuser = 'hpr';
my $rempath = '/home/hpr/www/eps/';

my $ssh = Net::OpenSSH->new( $remhost, user => $remuser, port => $remport );
$ssh->error and die "Can't ssh to $remhost: " . $ssh->error;

$log->info("Connected to ${remuser}\@${remhost}, port $remport");

#-------------------------------------------------------------------------------
# Perform the action requested by -mode=X
#-------------------------------------------------------------------------------
if ( $mode eq 'initialise' ) {
    print "Initialise mode\n";
    initialise( $dbh1, $dbh2 );
}
elsif ( $mode eq 'update' ) {
    print "Update mode\n";

    #
    # Range updating is achieved by running the single-shot mode repeatedly
    #
    if ( @episodes ) {
        foreach my $episode (@episodes) {
            update_episode( $dbh1, $dbh2, $episode, $dry_run, $verbose );
        }
    }
    else {
        update_db( $dbh1, $dbh2, $dry_run, $verbose );
    }
}
elsif ( $mode eq 'json' ) {
    print "JSON mode\n";
    load_metadata( $dbh2, $json, $dry_run, $verbose );
    #load_upload_dates( $dbh2, $json );
}
elsif ( $mode eq 'scandir' ) {
    print "Scan directory mode\n";
    scan_dir( $dbh2, $dirfile );
}
else {
    die "Invalid mode: $mode\n";
}

exit;

#===  FUNCTION  ================================================================
#         NAME: initialise
#      PURPOSE: Initialise an empty database
#   PARAMETERS: $dbh1           Handle for the MySQL database
#               $dbh2           Handle for the SQLite database
#      RETURNS: Nothing
#  DESCRIPTION: Runs the various initialisation functions, but only if the
#               database tables 'episodes' and/or 'assets' are empty
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub initialise {
    my ( $dbh1, $dbh2 ) = @_;

    my ($count);

    #
    # Initialise the 'episodes' table if it's empty
    #
    $count = count_rows( $dbh2, 'episodes' );
    if ( $count == 0 ) {
        print "Initialising the 'episodes' table\n";
        initialise_episodes( $dbh1, $dbh2 );
    }
    else {
        print "Did not initialise the 'episodes' table ($count rows)\n";
        $log->info("Did not initialise the 'episodes' table ($count rows)");
    }

    #
    # Initialise the 'assets' table if it's empty
    #
    $count = count_rows( $dbh2, 'assets' );
    if ( $count == 0 ) {
        print "Initialising the 'assets' table\n";
        initialise_assets( $dbh1, $dbh2 );
    }
    else {
        print "Did not initialise the 'assets' table ($count rows)\n";
        $log->info("Did not initialise the 'assets' table ($count rows)");
    }

}

#===  FUNCTION  ================================================================
#         NAME: initialise_episodes
#      PURPOSE: Initialise an empty 'episodes' table
#   PARAMETERS: $dbh1           Handle for the MySQL database
#               $dbh2           Handle for the SQLite database
#      RETURNS: Nothing
#  DESCRIPTION: Assuming the episodes table is empty loads all episodes from
#               the MariaDB database
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub initialise_episodes {
    my ( $dbh1, $dbh2 ) = @_;

    my ( $sql1, $sth1, $h1, $count );

    $sql1 = q{
        SELECT
            id,
            date,
            title,
            summary
        FROM eps
    };

    $sth1 = $dbh1->prepare($sql1);

    $sth1->execute();
    if ( $dbh1->err ) {
        croak $dbh1->errstr;
    }

    $count = 0;
    while ( $h1 = $sth1->fetchrow_hashref ) {
        #
        # Fields are:
        # id                    integer,
        # rdate                 date NOT NULL,
        # title                 varchar ( 100 ) NOT NULL,
        # summary               varchar ( 100 ) NOT NULL,
        # uploaded              integer DEFAULT 0,
        # has_files             INTEGER DEFAULT 0,
        # with_files            INTEGER DEFAULT 0,
        # with_derived          INTEGER DEFAULT 0,
        # with_source           INTEGER DEFAULT 0,
        # archive_date          date,
        # item_last_updated     integer
        # IA_URL                text,
        # notes                 text,
        #
        #>>>
        $dbh2->do(
            q{INSERT INTO episodes (id, rdate, title, summary) VALUES(?,?,?,?)},
            undef,
            $h1->{id},
            $h1->{date},
            $h1->{title},
            $h1->{summary},
        ) or die $dbh2->errstr;
        #<<<

        $count++;
    }

    print "Records added: $count\n";
    $log->info("Records added to 'episodes' table: $count");

}

#===  FUNCTION  ================================================================
#         NAME: initialise_assets
#      PURPOSE: Initialise an empty 'assets' table (during database
#               initialisation)
#   PARAMETERS: $dbh1           Handle for the MariaDB database
#               $dbh2           Handle for the SQLite database
#      RETURNS: Nothing
#  DESCRIPTION: Assuming the assets table is empty loads all assets (links to
#               files on the HPR server) into the table
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub initialise_assets {
    my ( $dbh1, $dbh2 ) = @_;

    my ( $sql1, $sth1, $h1, $count, $links, $id, $episode, $notes );
    my %assets;

    #
    # Collect the notes for shows that have links to other HPR files
    #
    $sql1 = q{
        SELECT
            id,
            notes
        FROM eps
        WHERE notes REGEXP 'https?://(www.)?(hacker|hobby)publicradio.org/eps/'
    };
#        LIMIT 20 OFFSET 30

    $sth1 = $dbh1->prepare($sql1);

    $sth1->execute();
    if ( $dbh1->err ) {
        croak $dbh1->errstr;
    }

    #
    # Process notes one show at a time, recursing into HTML where necessary
    # and building the %assets hash
    #
    $count = 0;
    while ( $h1 = $sth1->fetchrow_hashref ) {
        $links = 0;

        $id = $h1->{id};
        $episode = sprintf("%04d", $id);
        $notes = $h1->{notes};

        print "Episode: $id\n";
        #$links = find_links($id,$notes,\%assets);
        $links = find_assets($ssh,$episode,$rempath,\%assets);
        print "Links found: $links\n";
    }

    #print Dumper(\%assets),"\n";

    #
    # Use the %assets hash to populate the 'assets' table
    #
    foreach my $key (sort(keys(%assets))) {
        foreach my $obj (@{$assets{$key}}) {
            #>>>
            $dbh2->do(
                q{INSERT INTO assets (episode_id, URL, filename) VALUES(?,?,?)},
                undef,
                $key,
                $obj->{URL},
                $obj->{filename},
            ) or die $dbh2->errstr;
            #<<<

            $count++;
        }
    }

    print "Records added: $count\n";
    $log->info("Records added to 'assets' table: $count");

}

#===  FUNCTION  ================================================================
#         NAME: update_db
#      PURPOSE: Loads any additional rows from the MySQL database to the
#               SQLite one.
#   PARAMETERS: $dbh1           Handle for the MySQL database
#               $dbh2           Handle for the SQLite database
#               $dry_run        Boolean dry run setting
#               $verbose        Boolean verbose setting
#      RETURNS: Nothing
#  DESCRIPTION: Finds the first gap in the 'id' field of the 'episodes' table
#               in the SQLite database. This is the starting show number it
#               uses when querying the MySQL database. The function loops over
#               the 'eps' table in the MySQL database returning details for
#               storage in the SQLite database. Before storing it double
#               checks that the row in question does not exist.
#               TODO
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub update_db {
    my ( $dbh1, $dbh2, $dry_run, $verbose ) = @_;

    my ( $sql1, $sql2, $sth1, $h1, $sth2, $h2 );
    my ( $ep_count, $ffree);
    my ( %assets, $asset_count, $assets_added, $assets_deleted, $assets_updated );
    my ( $id, $episode, $notes, $links );

    print "Updating SQLite database from MySQL\n" if $verbose;

    #
    # Find the first episode id gap in the SQLite database
    #
    $sql1 = q{
        SELECT MIN (id) + 1 AS first_free
        FROM episodes t1
        WHERE NOT EXISTS (
            SELECT *
            FROM episodes t2
            WHERE t1.id + 1 = t2.id)
    };

    $sth1 = $dbh2->prepare($sql1);

    $sth1->execute();
    if ( $dbh2->err ) {
        croak $dbh2->errstr;
    }

    $h1    = $sth1->fetchrow_hashref;
    $ffree = $h1->{first_free};
    print "First free id: $ffree\n" if $verbose;

    #
    # SQL to find new rows in the MySQL database
    #
    $sql1 = q{
        SELECT
            id,
            date,
            title,
            summary,
            notes
        FROM eps
        WHERE id >= ?
    };

    #
    # SQL to look for pre-existing episodes in the SQLite database
    #
    $sql2 = q{
        SELECT * FROM episodes WHERE id = ?
    };

    #
    # Prepare the SQL statements
    #
    $sth1 = $dbh1->prepare($sql1);
    $sth2 = $dbh2->prepare($sql2);

    #
    # Query MySQL, from $ffree onwards
    #
    $sth1->execute($ffree);
    if ( $dbh1->err ) {
        croak $dbh1->errstr;
    }

    #
    # Loop through the MySQL rows
    #
    $ep_count = $asset_count = 0;
    while ( $h1 = $sth1->fetchrow_hashref ) {
        #
        # Query the SQLite database for the episode number we just got
        #
        $sth2->execute( $h1->{id} );
        if ( $dbh2->err ) {
            croak $dbh2->errstr;
        }
        $h2 = $sth2->fetchrow_hashref;

        #
        # If the row isn't already there create it
        #
        unless ($h2) {
            unless ($dry_run) {
                #>>>
                $dbh2->do(
                    q{INSERT INTO episodes (id, rdate, title, summary) VALUES(?,?,?,?)},
                    undef,
                    $h1->{id},
                    $h1->{date},
                    $h1->{title},
                    $h1->{summary},
                ) or die $dbh2->errstr;
                #<<<

                printf "Added episode: %d\n", $h1->{id} if $verbose;

                $ep_count++;
            }
            else {
                printf
                    "Would have inserted a row into the 'episodes' table (%s)\n",
                    $h1->{id};
            }
        }

        #
        # Check this show for "assets" (whether we have the episode already or
        # not). Even in dry-run mode this gets executed
        #
        $links = 0;

        $id = $h1->{id};
        $episode = sprintf("%04d", $id);
        # $notes = $h1->{notes};
        # $links = find_links($id,$notes,\%assets);
        $links = find_assets($ssh,$episode,$rempath,\%assets);

        $asset_count += $links;

    }

    #
    # In debug mode show what assets were collected as we iterated through the
    # new shows.
    #
    _debug( $DEBUG >= 2, '%assets: ' . Dumper(\%assets) );

    #
    # Add any assets collected from the episodes just processed
    #
    $assets_added = 0;
    if ($asset_count > 0) {
        ( $assets_added, $assets_deleted, $assets_updated ) =
            process_assets( $dbh2, \%assets, 1, $dry_run, $verbose );
    }

    printf "%-16s %d\n", "Episodes added:", $ep_count;
    printf "%-16s %d\n", "Assets found:",   $asset_count;
    printf "%-16s %d\n", "Assets added:",   coalesce( $assets_added,   0 );
    printf "%-16s %d\n", "Assets deleted:", coalesce( $assets_deleted, 0 );
    printf "%-16s %d\n", "Assets updated:", coalesce( $assets_updated, 0 );

    $log->info("Episodes added: $ep_count");
}

#===  FUNCTION  ================================================================
#         NAME: process_assets
#      PURPOSE: Given a collection of asset details obtained by searching
#               notes for a given range of episodes determine whether they
#               need to be added to the SQLite database.
#   PARAMETERS: $dbh            SQLite database handle
#               $assets         hashref pointing to the collected assets which
#                               have been found by parsing the notes of all
#                               relevant shows prior to calling this routine.
#               $new            Boolean flag to indicate that we're dealing with a new
#                               show or updating an old one
#               $dry_run        Boolean dry run setting
#               $verbose        Boolean verbose setting
#      RETURNS: Numbers of assets added, deleted and updated
#  DESCRIPTION: The %assets hash has been accumulated by scanning all the
#               notes belonging to new episodes found in the MySQL database.
#               The hash is keyed by the episode number and it is scanned in
#               sorted order of the key. The SQLite database is queried for
#               assets with this key and the result held in a hash keyed by
#               the filename. The value of each key in the %assets hash is an
#               array of hashes holding the asset details, so we loop through
#               these. If the filename of a particular asset hash is not found
#               in the hash from the database then we create a new row in the
#               'assets' table (assuming that we're not in dry run mode where
#               we simply report what we would have done). If any assets were
#               added for an episode then we set the flag 'has_files' (unless
#               in dry run mode). The numbers of assets added, deleted and
#               updated are returned by the function.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub process_assets {
    my ( $dbh, $assets, $new, $dry_run, $verbose ) = @_;

    my ( $sql, $sth, $db_assets, $key_field, $db_count );
    my ( $asset_count, $assets_added, $assets_deleted, $assets_updated );

    $key_field = 'id';

    #
    # We should only be called if there are assets, but just in case...
    #
    return ( 0, 0, 0 ) unless ($assets);

    #
    # SQL to look for pre-existing assets in the SQLite database
    #
    $sql = q{
        SELECT * FROM assets WHERE episode_id = ?
    };
    $sth = $dbh->prepare($sql);

    say 'Processing assets for ', ( $new ? 'new' : 'existing' ), ' show'
        if $verbose;

    #
    # Use the %assets hash to populate the 'assets' table. The hash is keyed
    # by episode numbers, and each entry consists of an array of hashes each
    # containing the filename and the URL of the asset.
    #
    $assets_added = $assets_deleted = $assets_updated = 0;
    foreach my $key ( sort( keys(%$assets) ) ) {
        print "Checking episode $key\n" if $verbose;

        #
        # Count the assets for the current show
        #
        $asset_count = scalar( @{ $assets->{$key} } );

        _debug( $DEBUG >= 2, "Episode: $key, Asset_count: $asset_count" );

       #
       # Query the SQLite database for the episode number we just got, looking
       # for entries in the 'assets' table
       #
        $sth->execute($key);
        if ( $dbh->err ) {
            croak $dbh->errstr;
        }

        #
        # Grab everything as a hash keyed on the 'filename' field so we can
        # check it easily. Count what we got (i.e. count the number of assets
        # in SQLite for this episode).
        # NOTE: trying the idea of indexing $db_assets by the 'id' field (in
        # $key_field)
        #
        #        $db_assets = $sth->fetchall_hashref('filename');
        $db_assets = $sth->fetchall_hashref($key_field);
        $db_count  = scalar( keys(%$db_assets) );

        _debug( $DEBUG >= 2, "Assets in ia.db:\n" . Dumper($db_assets) );
        _debug( $DEBUG >= 2, "DB count: $db_count" );

        #
        # Compare the number of "real" assets with what SQLite holds.
        #
        if ( $asset_count > $db_count ) {
            #
            # We need to add assets. If it's a newly added show which has
            # assets this will always be necessary. If it's an existing show
            # then we're repairing an anomaly and want to say so.
            #
            if ($new) {
                say "Adding assets" if $verbose;
            }
            else {
                say "** Found more real assets than in DB -> adding";
            }
            $assets_added += insert_assets( $dbh, $key, $assets, $db_assets,
                $dry_run, $verbose );
        }
        elsif ( $asset_count < $db_count ) {
            #
            # We need to delete assets. This can't happen for an existing
            # show, so it's always a noteworthy event.
            #
            say "** Found fewer real assets than in DB -> deleting";
            $assets_deleted += delete_assets( $dbh, $key, $assets, $db_assets,
                $dry_run, $verbose );
        }

        #
        # Look for field differences in what we have after the inserts and
        # deletes
        #
        if ( $assets_added > 0 || $assets_deleted > 0 ) {
            $sth->execute($key);
            if ( $dbh->err ) {
                croak $dbh->errstr;
            }
            $db_assets = $sth->fetchall_hashref($key_field);
        }
        my @results
            = synchronise_found_assets( $dbh, $key, $assets, $db_assets,
            $dry_run, $verbose );

        $assets_added   += $results[0];
        $assets_deleted += $results[1];
        $assets_updated += $results[2];

        _debug( $DEBUG >= 4, '@results ' . Dumper( \@results ) );

        #
        # If there are assets then we need to ensure the 'has_files' setting
        # reflects it (unless in dry run mode)
        #
        if ( $asset_count > 0 ) {
            unless ($dry_run) {
                $dbh->do(
                    q{
                        UPDATE episodes
                        SET has_files = 1
                        WHERE id = ?
                        AND has_files = 0
                    },
                    undef,
                    $key,
                ) or die $dbh->errstr;
            }
            else {
                say "Would have updated the 'episodes.has_files' ",
                    "field for $key (if needed)";
            }
        }

        #        $assets_added += $asset_count;

    }

    return ( $assets_added, $assets_deleted, $assets_updated );

}

#===  FUNCTION  ================================================================
#         NAME: synchronise_found_assets
#      PURPOSE: Compares the assets found by searching the shownotes (and
#               other files) with those in the SQLite database and updates
#               what differs.
#   PARAMETERS: $dbh            SQLite database handle
#               $episode        Numeric episode
#               $assets         Hashref pointing to the collected assets which
#                               have been found by parsing the show's notes;
#                               keyed by episode number. Each keyed object is
#                               an arrayref of hashes containing the URL and
#                               filename of the asset.
#               $db_assets      Hash of assets from SQLite, keyed by the 'id'
#                               field. Each keyed object is a hash of columns
#                               from the database: id, episode_id, URL,
#                               filename, uploaded.
#               $dry_run        Boolean dry run setting
#               $verbose        Boolean verbose setting
#      RETURNS: Number of additions, deletions and updates
#  DESCRIPTION: The assets for this episode have been found by searching the
#               HTML of the show notes and any linked HTML files for files on
#               the HPR server.  The equivalent assets have also been
#               collected from the SQLite database. A comparison is made
#               between these items and if they differ the definitive details
#               collected from the MySQL database are used. There are three
#               actions possible: (1) if the URL or filename fields differ
#               then we can work out the update and apply it, (2) if the
#               incoming assets don't match any in those from the database
#               then we can delete the database ones, and (3) we can add the
#               incoming assets we also couldn't match because they'll be the
#               replacements!  For all actions changes are made to the
#               versions in the SQLite database using SQL::Abstract to
#               construct the queries to do it.
#       THROWS: No exceptions
#     COMMENTS: TODO: There may be multiple asset changes for the episode.
#               Does the SQL::Abstract call perform the multiple changes?
#               Needs testing.
#     SEE ALSO: N/A
#===============================================================================
sub synchronise_found_assets {
    my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;

    my ( $sth, $additions, $deletions, $updates, $update_count, $index);
    my ( @fieldvals, @where, @db_rows, @asset_rows, %matches, %changes );

    $additions = $deletions = $updates = 0;
    $update_count = 0;

    #
    # Temporary?
    #
    if ( $DEBUG >= 4 ) {
        _debug( 1, 'In subroutine synchronise_found_assets' );
        _debug( 1, "\$episode: $episode" );
        _debug( 1, '%$assets: ' . Dumper($assets) );
        _debug( 1, '%$db_assets: ' . Dumper($db_assets) );
    }

    _debug( $DEBUG >= 4, 'Looking for assets that match - to exclude them' );

    @db_rows = ( keys(%$db_assets) );
    @asset_rows = ( 0 .. scalar( @{ $assets->{$episode} } ) - 1 );
    _debug( $DEBUG >= 4, 'Initial @db_rows ' . Dumper( \@db_rows ) );
    _debug( $DEBUG >= 4, 'Initial @asset_rows ' . Dumper( \@asset_rows ) );

    #
    # Find any asset pair matches
    #
    $index = 0;
    foreach my $db_key ( keys(%$db_assets) ) {
        foreach my $a_obj ( @{ $assets->{$episode} } ) {
            if ( assets_match( $a_obj, $db_assets->{$db_key} ) == 2 ) {
                $matches{$db_key} = [ $a_obj, $db_assets->{$db_key} ];
#                splice( @asset_rows, $index, 1 );
                @asset_rows = grep { $_ != $index } @asset_rows;
                $index++;
            }
        }
    }
    _debug( $DEBUG >= 4, 'Assets to ignore: %matches ' . Dumper( \%matches ) );
    _debug( $DEBUG >= 4, 'Current @asset_rows ' . Dumper( \@asset_rows ) );

    #
    # If everything matched then there's nothing to do, so return zero and
    # leave.
    #
#    if (scalar(@db_rows) == scalar( keys(%matches) )) {
    if ( scalar(@asset_rows) == 0 ) {
        _debug( $DEBUG >= 4, 'All rows match, so nothing to do' );
        return ( 0, 0, 0 );
    }
    _debug( $DEBUG >= 4, 'Some rows do not match' );

    #
    # Remove the row ids for any matches
    #
    @db_rows = grep { !exists( $matches{$_} ) } @db_rows;
    _debug( $DEBUG >= 4, '@db_rows after matches ' . Dumper( \@db_rows ) );
    _debug( $DEBUG >= 4, '@asset_rows after matches ' . Dumper( \@asset_rows ) );

    _debug( $DEBUG >= 4, 'Looking for assets that match in only one attribute' );

    #
    # Find any cases where only one element of the asset object matches. This
    # means we have a simple change.
    #
    $index = 0;
    foreach my $db_key (@db_rows) {
        foreach my $a_obj ( @{ $assets->{$episode} } ) {
            if ( assets_match( $a_obj, $db_assets->{$db_key} ) == 1 ) {
                $changes{$db_key} = [ $a_obj, $db_assets->{$db_key} ];
                @asset_rows = grep { $_ != $index } @asset_rows;
                $index++;
            }
        }
    }
    _debug( $DEBUG >= 4, 'Assets to change: %changes ' . Dumper( \%changes ) );
    _debug( $DEBUG >= 4, 'Current @asset_rows ' . Dumper( \@asset_rows ) );

    #
    # Remove the row ids for any changes
    #
    @db_rows = grep { !exists( $changes{$_} ) } @db_rows;
    _debug( $DEBUG >= 4, '@db_rows after changes ' . Dumper( \@db_rows ) );
    _debug( $DEBUG >= 4, '@asset_rows after changes ' . Dumper( \@asset_rows ) );

    #
    # If there are changes, do them now
    #
    if (%changes) {
        _debug( $DEBUG >= 4, 'Collecting changes' );

        #
        # We may have multiple database rows to process
        #
        $index        = 0;

        foreach my $key ( keys(%changes) ) {
            $fieldvals[$index] = {};
            $where[$index]     = { episode_id => $episode };

            #
            # Compare the fields we're interested in, accumulating differences
            # (expecting only one per key).
            #
            foreach my $fld ( 'filename', 'URL' ) {
                if (!equal(
                        $changes{$key}->[0]->{$fld},
                        $changes{$key}->[1]->{$fld}
                    )
                    )
                {
                    $update_count++;
                    $fieldvals[$index]->{$fld} = $changes{$key}->[0]->{$fld};
                }
            }

            $where[$index]->{id} = $key;

            $index++;
        }
    }

    _debug( $DEBUG >= 2, '@fieldvals: ' . Dumper( \@fieldvals ) );
    _debug( $DEBUG >= 2, '@where ' . Dumper( \@where ) );

    if ( $update_count > 0 ) {
        _debug( $DEBUG >= 4, 'Performing changes' );
        #
        # Prepare to use SQL::Abstract
        #
        my $sql = SQL::Abstract->new();

        foreach my $fv (@fieldvals) {
            my ( $stmt, @bindvals )
                = $sql->update( 'assets', $fv, shift(@where) );

            #
            # Perform the updates (unless in dry-run mode)
            #
            unless ($dry_run) {
                $sth = $dbh->prepare($stmt);
                $sth->execute(@bindvals);
                if ( $dbh->err ) {
                    warn "Processing $episode\n", $dbh->errstr;
                }
                else {
                    print "Updated ", join( ',', keys(%$fv) ), "\n"
                        if $verbose;
                    $updates++;
                }
            }
            else {
                print "Dry run mode:\n";
                print "Statement: $stmt\n";
                print "Bindvals: ", join( ", ", @bindvals ), "\n";
            }
        }
    }

    #
    # If anything is left in @db_rows then they will be replacements
    #
    if ( scalar(@db_rows) > 0 ) {
        say "Replacements found: ", scalar(@db_rows);

        #
        # Prepare to use SQL::Abstract
        #
        my $sql = SQL::Abstract->new();

        #
        # Every row which came from the database is to be deleted
        #
        $index = 0;
        foreach my $db_key (@db_rows) {
            $where[$index] = { episode_id => $episode };
            $where[$index]->{id} = $db_key;
            $index++;
        }
        _debug( $DEBUG >= 2, '@where ' . Dumper( \@where ) );

        #
        # Do deletions
        #
        foreach my $db_key (@db_rows) {
            my ( $stmt, @bindvals )
                = $sql->delete( 'assets', shift(@where) );

            #
            # Perform the deletions (unless in dry-run mode)
            #
            unless ($dry_run) {
                $sth = $dbh->prepare($stmt);
                $sth->execute(@bindvals);
                if ( $dbh->err ) {
                    warn "Processing $episode\n", $dbh->errstr;
                }
                else {
                    print "Deleted ", join( ',', @db_rows ), "\n"
                        if $verbose;
                    $deletions++;
                }
            }
            else {
                print "Dry run mode:\n";
                print "Statement: $stmt\n";
                print "Bindvals: ", join( ", ", @bindvals ), "\n";
            }
        }

        #
        # Every object in the found assets array for this episode is to be
        # added.
        #
        foreach my $i (@asset_rows) {
            my $obj = $assets->{$episode}->[$i];
            $obj->{episode_id} = $episode;

            my ( $stmt, @bindvals )
                = $sql->insert( 'assets', $obj );

            #
            # Perform the additions (unless in dry-run mode)
            #
            unless ($dry_run) {
                $sth = $dbh->prepare($stmt);
                $sth->execute(@bindvals);
                if ( $dbh->err ) {
                    warn "Processing $episode\n", $dbh->errstr;
                }
                else {
                    print "Inserted array elements ", join( ',', @asset_rows ), "\n"
                        if $verbose;
                    $additions++;
                }
            }
            else {
                print "Dry run mode:\n";
                print "Statement: $stmt\n";
                print "Bindvals: ", join( ", ", @bindvals ), "\n";
            }
        }

    }

    return ( $additions, $deletions, $updates );
}

################################################################################
    #
    # Look for database assets to be ignored. The outer loop iterates through
    # the primary key values of $db_assets where the rows all match the
    # $episode value. For each of these assets the next loop iterates through
    # the array of assets in $assets->{$episode} comparing the fields we want
    # to check in the next inner loop. If the checked fields are all equal
    # then there will be nothing to do.
    #
#    foreach my $db_key ( keys(%$db_assets) ) {
#        foreach my $a_obj ( @{ $assets->{$episode} } ) {
#            $ignore = $delete = $update = 0;
#            foreach my $fld ( 'filename', 'URL' ) {
#                $ignore
#                    += equal( $db_assets->{$db_key}->{$fld}, $a_obj->{$fld} );
#                $delete
#                    += !equal( $db_assets->{$db_key}->{$fld}, $a_obj->{$fld} );
#            }
#            push( @{ $row_actions{'ignore'} }, $db_assets->{$db_key}->{id} )
#                if $ignore == 2;
#            push( @{ $row_actions{'delete'} }, $db_assets->{$db_key}->{id} )
#                if $delete == 2;
#        }
#    }
#
#    _debug( $DEBUG >= 4, '%row_actions ' . Dumper( \%row_actions ) );
#
    #
    # Scan the array of hashes obtained from the shownotes for $episode and
    # record things that need changing.
    #
#    $index = 0;
#    $updates = 0;
#    foreach my $obj ( @{ $assets->{$episode} } ) {
#        $fieldvals[$index] = {};
#
#        #
#        # Compare the fields we're interested in
#        #
#        foreach my $fld ( 'filename', 'URL' ) {
#            if ( !equal( $obj->{$fld}, $db_assets->{$obj->{filename}}->{$fld} ) ) {
#                $updates++;
#                $fieldvals[$index]->{$fld} = $obj->{$fld};
#            }
#        }
#
#        # TODO
#        $fieldvals[$index]->{id} = $db_assets->{$obj->{filename}}->{id};
#
#        $index++;
#    }
#
#    _debug( $DEBUG >= 2, '@fieldvals: ' . Dumper( \@fieldvals ) );
#
#    if ($updates > 0 ) {
#        #
#        # Prepare to use SQL::Abstract
#        #
#        my $sql = SQL::Abstract->new();
#
#        $where{episode_id} = $episode;
#
#        foreach my $fv (@fieldvals) {
#            $where{id} = $fv->{id};
#            delete($fv->{id});
#
#            my ( $stmt, @bindvals )
#                = $sql->update( 'assets', $fv, \%where );
#
#            #
#            # Perform the updates (unless in dry-run mode)
#            #
#            unless ($dry_run) {
#                $sth = $dbh->prepare($stmt);
#                $sth->execute(@bindvals);
#                if ( $dbh->err ) {
#                    warn "Processing $episode\n", $dbh->errstr;
#                }
#                else {
#                    print "Updated ", join( ',', keys(%$fv) ), "\n"
#                        if $verbose;
#                }
#            }
#            else {
#                print "Dry run mode:\n";
#                print "Statement: $stmt\n";
#                print "Bindvals: ", join( ", ", @bindvals ), "\n";
#            }
#        }
#    }
#
################################################################################

#===  FUNCTION  ================================================================
#         NAME: insert_assets
#      PURPOSE: Find missing assets and add them to SQLite
#   PARAMETERS: $dbh            SQLite database handle
#               $episode        Numeric episode
#               $assets         Hashref pointing to the collected assets which
#                               have been found by parsing the show's notes;
#                               keyed by episode number.
#               $db_assets      Hash of assets from SQLite, keyed by the 'id' field
#               $dry_run        Boolean dry run setting
#               $verbose        Boolean verbose setting
#      RETURNS: Number of insertions
#  DESCRIPTION: The assets for this episode have been found by searching the
#               HTML of the show notes and any linked HTML files for files on
#               the HPR server.  The equivalent assets have also been
#               collected from the SQLite database. The MySQL assets are
#               reorganised by filename. For each file which does not already
#               exist in the SQLite list details are prepared and passed to
#               SQL::Abstract using the multi insert feature. The missing
#               details are inserted into the assets table unless dry-run mode
#               is on.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub insert_assets {
    my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;

    my ($sth, %db_by_file, %assets_by_file, $insertions, @multi_insert, %where);

    #
    # Reorder the %$db_assets hash in filename order
    #
    foreach my $key ( keys( %{$db_assets} ) ) {
        $db_by_file{$db_assets->{$key}->{filename}} = $db_assets->{$key};
    }

    #
    # Reorder the %assets hash in filename order
    #
    foreach my $obj ( @{ $assets->{$episode} } ) {
        $assets_by_file{$obj->{filename}} = $obj;
    }

    _debug( $DEBUG >= 2, "%db_by_file: " . Dumper( \%db_by_file ) );
    _debug( $DEBUG >= 2, "%assets_by_file: " . Dumper( \%assets_by_file ) );

    #
    # Find stuff for insertion. We need to build @multi_insert such that it
    # contains a list of hashrefs, each containing the fields we want to
    # insert.
    # Note: We make a new %fv each time, fill it as a slice, and push it into
    # the array @multi_insert. This may be rather obscure and therefore
    # error-prone.
    #
    $insertions = 0;
    foreach my $key ( keys(%assets_by_file) ) {
#        if ( !exists( $db_assets->{$key} ) ) {
        if ( !exists( $db_by_file{$key} ) ) {
            my %fv;
            $insertions++;
            @fv{ 'episode_id', 'URL', 'filename' } = (
                $episode, $assets_by_file{$key}->{URL},
                $assets_by_file{$key}->{filename}
            );
            push( @multi_insert, \%fv );
        }
    }

    _debug( $DEBUG >= 2, '@multi_insert: ' . Dumper( \@multi_insert ) );

    if ($insertions > 0) {
        #
        # Prepare to use SQL::Abstract
        #
        my $sql = SQL::Abstract->new();

        my ( $stmt, @bindvals )
            = $sql->insert_multi( 'assets', \@multi_insert );

        #
        # Perform the insertions (unless in dry-run mode)
        #
        unless ($dry_run) {
            $sth = $dbh->prepare($stmt);
            $sth->execute(@bindvals);
            if ( $dbh->err ) {
                warn "Processing $episode\n", $dbh->errstr;
            }
            else {
#                print "Inserted ",join(',',keys(%where)),"\n" if $verbose;
                print "Inserted $insertions assets\n" if $verbose;
            }
        }
        else {
            print "Dry run mode:\n";
            print "Statement: $stmt\n";
            print "Bindvals: ", join( ", ", @bindvals ), "\n";
        }
    }

    return $insertions;
}

| #===  FUNCTION  ================================================================
 | |
| #         NAME: delete_assets
 | |
| #      PURPOSE: Find superfluous assets for an episode in the 'assets' table
 | |
| #               and delete them.
 | |
| #   PARAMETERS: $dbh            SQLite database handle
 | |
| #               $episode        Numeric episode
 | |
| #               $assets         Hashref pointing to the collected assets which
 | |
| #                               have been found by parsing the show's notes;
 | |
| #                               keyed by episode number.
 | |
| #               $db_assets      Hash of assets from SQLite, keyed by filename
 | |
| #               $dry_run        Boolean dry run setting
 | |
| #               $verbose        Boolean verbose setting
 | |
| #      RETURNS: Number of deletions
 | |
| #  DESCRIPTION: The assets for this episode have been found by searching the
 | |
| #               HTML of the show notes and any linked HTML files for files on
 | |
| #               the HPR server.  The equivalent assets have also been
 | |
| #               collected from the SQLite database. Both sets are reorganised
 | |
| #               by filename. For each file that exists in the SQLite list but
 | |
| #               not among the scanned assets, details are prepared and passed
 | |
| #               to SQL::Abstract, and the unwanted rows are deleted from the
 | |
| #               'assets' table unless dry-run mode is on.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub delete_assets {
 | |
|     my ( $dbh, $episode, $assets, $db_assets, $dry_run, $verbose ) = @_;
 | |
| 
 | |
|     my ($sth, %db_by_file, %assets_by_file, $deletions, %where);
 | |
| 
 | |
|     #
 | |
|     # Reorder the %$db_assets hash in filename order
 | |
|     #
 | |
|     foreach my $key ( keys( %{$db_assets} ) ) {
 | |
|         $db_by_file{$db_assets->{$key}->{filename}} = $db_assets->{$key};
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Reorder the %assets hash in filename order
 | |
|     #
 | |
|     foreach my $obj ( @{ $assets->{$episode} } ) {
 | |
|         $assets_by_file{$obj->{filename}} = $obj;
 | |
|     }
 | |
| 
 | |
|     _debug( $DEBUG >= 2, "%db_by_file: " . Dumper( \%db_by_file ) );
 | |
|     _debug( $DEBUG >= 2, "%assets_by_file: " . Dumper( \%assets_by_file ) );
 | |
| 
 | |
|     #
 | |
|     # Only delete assets with this value in the 'episode_id' field
 | |
|     #
 | |
|     $where{episode_id} = $episode;
 | |
| 
 | |
|     #
 | |
|     # Find stuff for deletion. We need to build %where such that it generates
 | |
|     # 'WHERE filename = <value1> OR filename = <value2> ...', so we add these
 | |
|     # values to an (anonymous) array.
 | |
|     #
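 | |
|     # Illustrative sketch (filenames invented): with two stale files the hash
 | |
|     # ends up as
 | |
|     #   %where = ( episode_id => 3775,
 | |
|     #              filename   => [ 'hpr3775/old_note.html', 'old_image.png' ] )
 | |
|     # which SQL::Abstract should render roughly as
 | |
|     #   DELETE FROM assets
 | |
|     #   WHERE ( episode_id = ? AND ( filename = ? OR filename = ? ) )
 | |
|     # with the three values placed in @bindvals.
 | |
|     #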
 | |
|     $deletions = 0;
 | |
|     foreach my $key ( keys(%db_by_file) ) {
 | |
|         # Delete database rows whose filenames were not found among the
 | |
|         # scanned assets for this episode
 | |
|         if ( !exists( $assets_by_file{$key} ) ) {
 | |
|             $deletions++;
 | |
|             if ( !exists( $where{filename} ) ) {
 | |
|                 $where{filename} = [$key];
 | |
|             }
 | |
|             else {
 | |
|                 push( @{ $where{filename} }, $key );
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     _debug( $DEBUG >= 2, "%where: " . Dumper( \%where ) );
 | |
| 
 | |
|     if ($deletions > 0) {
 | |
|         #
 | |
|         # Prepare to use SQL::Abstract
 | |
|         #
 | |
|         my $sql = SQL::Abstract->new();
 | |
| 
 | |
|         my ( $stmt, @bindvals )
 | |
|             = $sql->delete( 'assets', \%where );
 | |
| 
 | |
|         #
 | |
|         # Perform the deletions (unless in dry-run mode)
 | |
|         #
 | |
|         unless ($dry_run) {
 | |
|             $sth = $dbh->prepare($stmt);
 | |
|             $sth->execute(@bindvals);
 | |
|             if ( $dbh->err ) {
 | |
|                 warn "Processing $episode\n", $dbh->errstr;
 | |
|             }
 | |
|             else {
 | |
|                 print "Deleted ",join(',',keys(%where)),"\n" if $verbose;
 | |
|             }
 | |
|         }
 | |
|         else {
 | |
|             print "Dry run mode:\n";
 | |
|             print "Statement: $stmt\n";
 | |
|             print "Bindvals: ", join( ", ", @bindvals ), "\n";
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return $deletions;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: update_episode
 | |
| #      PURPOSE: Perform a database update for a specific episode
 | |
| #   PARAMETERS: $dbh1           Handle for the MariaDB database
 | |
| #               $dbh2           Handle for the SQLite database
 | |
| #               $episode        Target episode to update
 | |
| #               $dry_run        Boolean dry run setting
 | |
| #               $verbose        Boolean verbose setting
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Performs an update for a single episode in the SQLite
 | |
| #               database. The usual scenario is after the Community News show
 | |
| #               has been posted, or a spelling mistake has been corrected for
 | |
| #               an episode. The episode is selected from the MySQL database
 | |
| #               and the SQLite one. The 'title' and 'summary' fields are
 | |
| #               compared across the two copies, and if they differ the
 | |
| #               necessary changes are set up to be executed or reported in
 | |
| #               'dry run' mode.
 | |
| #               NOTE: No longer done this way:
 | |
| #               |A search for assets is made by reading the show notes and any
 | |
| #               |notes that are linked from these, and if there are
 | |
| #               |supplementary files they are accumulated and the database
 | |
| #               |updated as necessary (under control of the 'dry run' option).
 | |
| #               |This is currently the only way that the row in the SQLite
 | |
| #               |database can be updated after it has been loaded from the
 | |
| #               |MySQL database, other than when the metadata is collected and
 | |
| #               |applied. The metadata update process does not fix errors in
 | |
| #               |the 'title' or 'summary' fields.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub update_episode {
 | |
|     my ( $dbh1, $dbh2, $episode, $dry_run, $verbose ) = @_;
 | |
| 
 | |
|     my ( $sql1, $sql2, $sth1, $h1, $sth2, $h2, $sth3 );
 | |
|     my ( %assets, $asset_count, $assets_added, $assets_deleted, $assets_updated );
 | |
|     my ( $id, $notes, $links );
 | |
|     my ( $updates, %fieldvals, %where );
 | |
| 
 | |
|     #
 | |
|     # SQL to fetch the target episode from the MySQL database
 | |
|     #
 | |
|     $sql1 = q{
 | |
|         SELECT
 | |
|             id,
 | |
|             date,
 | |
|             title,
 | |
|             summary,
 | |
|             notes
 | |
|         FROM eps
 | |
|         WHERE id = ?
 | |
|     };
 | |
| 
 | |
|     #
 | |
|     # SQL to look for pre-existing episodes in the SQLite database
 | |
|     #
 | |
|     $sql2 = q{
 | |
|         SELECT * FROM episodes WHERE id = ?
 | |
|     };
 | |
| 
 | |
|     #
 | |
|     # Prepare the SQL statements
 | |
|     #
 | |
|     $sth1 = $dbh1->prepare($sql1);
 | |
|     $sth2 = $dbh2->prepare($sql2);
 | |
| 
 | |
|     #
 | |
|     # Query MySQL, for the $episode
 | |
|     #
 | |
|     $sth1->execute($episode);
 | |
|     if ( $dbh1->err ) {
 | |
|         croak $dbh1->errstr;
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Loop through the MySQL row (should only be one)
 | |
|     #
 | |
|     $asset_count = $updates = 0;
 | |
|     while ( $h1 = $sth1->fetchrow_hashref ) {
 | |
|         #
 | |
|         # Query the SQLite database for the episode number we just got
 | |
|         #
 | |
|         $sth2->execute( $h1->{id} );
 | |
|         if ( $dbh2->err ) {
 | |
|             croak $dbh2->errstr;
 | |
|         }
 | |
|         $h2 = $sth2->fetchrow_hashref;
 | |
| 
 | |
|         #
 | |
|         # If the row doesn't exist we have a problem.
 | |
|         # TODO: Could we just add the episode at this point?
 | |
|         #
 | |
|         unless ($h2) {
 | |
|             warn "Row $episode is not in the SQLite database\n";
 | |
|             return;
 | |
|         }
 | |
| 
 | |
|         #
 | |
|         # Look for title and summary changes and build a hash holding the
 | |
|         # changes
 | |
|         #
 | |
|         foreach my $fld ( 'title', 'summary' ) {
 | |
|             if ( !equal( $h1->{$fld}, $h2->{$fld} ) ) {
 | |
|                 $updates++;
 | |
|                 $fieldvals{$fld} = $h1->{$fld};
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         if ($updates > 0 ) {
 | |
|             #
 | |
|             # Prepare to use SQL::Abstract
 | |
|             #
 | |
|             my $sql = SQL::Abstract->new();
 | |
| 
 | |
|             $where{id} = $episode;
 | |
|             my ( $stmt, @bindvals )
 | |
|                 = $sql->update( 'episodes', \%fieldvals, \%where );
 | |
| 
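 | |
|             # Illustrative example (not from a real run): if only the summary
 | |
|             # changed, the generated statement is roughly
 | |
|             #   UPDATE episodes SET summary = ? WHERE ( id = ? )
 | |
|             # with the new summary and the episode number in @bindvals.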
 | |
|             #
 | |
|             # Perform the updates (unless in dry-run mode)
 | |
|             #
 | |
|             unless ($dry_run) {
 | |
|                 $sth3 = $dbh2->prepare($stmt);
 | |
|                 $sth3->execute(@bindvals);
 | |
|                 if ( $dbh2->err ) {
 | |
|                     warn "Processing $episode\n", $dbh2->errstr;
 | |
|                 }
 | |
|                 else {
 | |
|                     print "Updated ",join(',',keys(%fieldvals)),"\n" if $verbose;
 | |
|                 }
 | |
|             }
 | |
|             else {
 | |
|                 print "Dry run mode:\n";
 | |
|                 print "Statement: $stmt\n";
 | |
|                 print "Bindvals: ", join( ", ", @bindvals ), "\n";
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         #
 | |
|         # Check this show for "assets". Even in dry-run mode this gets
 | |
|         # executed
 | |
|         #
 | |
|         $links = 0;
 | |
| 
 | |
|         $id = $h1->{id};
 | |
|         $notes = $h1->{notes};
 | |
|         # $links = find_links($id,$notes,\%assets);
 | |
|         $links = find_assets($ssh,$episode,$rempath,\%assets);
 | |
| 
 | |
|         $asset_count += $links;
 | |
| 
 | |
|     }
 | |
| 
 | |
|     if ( $asset_count > 0 ) {
 | |
|         _debug( $DEBUG >= 2, "Scanned assets:\n" . Dumper( \%assets ) );
 | |
|     }
 | |
|     else {
 | |
|         _debug( $DEBUG >= 2, "No assets found" );
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Add any assets collected from the episode just processed
 | |
|     #
 | |
|     $assets_added = 0;
 | |
|     if ($asset_count > 0) {
 | |
|         ( $assets_added, $assets_deleted, $assets_updated ) =
 | |
|             process_assets( $dbh2, \%assets, 0, $dry_run, $verbose );
 | |
|     }
 | |
| 
 | |
|     printf "%-16s %d\n", "Episode checked:", $episode;
 | |
|     printf "%-16s %d\n", "Episode updates:", $updates;
 | |
|     printf "%-16s %d\n", "Assets found:",    $asset_count;
 | |
|     printf "%-16s %d\n", "Assets added:",    coalesce( $assets_added,   0 );
 | |
|     printf "%-16s %d\n", "Assets deleted:",  coalesce( $assets_deleted, 0 );
 | |
|     printf "%-16s %d\n", "Assets updated:",  coalesce( $assets_updated, 0 );
 | |
| 
 | |
|     $log->info("Episode number: $episode");
 | |
|     $log->info("Episode updates: $updates");
 | |
|     $log->info( 'Assets added,deleted,updated:',
 | |
|         coalesce( $assets_added, 0 ), ',',
 | |
|         coalesce( $assets_deleted, 0 ), ',',
 | |
|         coalesce( $assets_updated, 0 )
 | |
|     );
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: load_metadata
 | |
| #      PURPOSE: Loads values from a JSON file which has been generated by the
 | |
| #               'ia metadata' command and stores them in the 'episodes' table.
 | |
| #               Also uses the JSON data to determine if the derived files and
 | |
| #               the source file have been uploaded and which assets have been
 | |
| #               uploaded.
 | |
| #   PARAMETERS: $dbh            Handle for the SQLite database
 | |
| #               $file           Name of file of JSON data
 | |
| #               $dry_run        Boolean dry run setting
 | |
| #               $verbose        Boolean verbose setting
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: The input file is expected to consist of many JSON complex
 | |
| #               objects, generated by the command 'ia metadata <identifier>'
 | |
| #               for a list of HPR episodes (see script 'collect_show_data').
 | |
| #               A count is kept of successful updates, and this is reported
 | |
| #               once finished.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub load_metadata {
 | |
|     my ( $dbh, $file, $dry_run, $verbose ) = @_;
 | |
| 
 | |
|     my ( $filebuffer, @jsonbuffer, %cols, %fieldvals, %where );
 | |
|     my ( @deletions, $index );
 | |
|     my ( @stash, @assets, $orig_count, $source, $epid, $count, $updates );
 | |
|     my ( $sth1, $sth2, $sth3, $rv, $h );
 | |
|     my $template = "https://archive.org/details/hpr%04d";
 | |
|     my @flds = (
 | |
|         'uploaded',          'has_files',
 | |
|         'with_files',        'with_derived',
 | |
|         'with_source',       'archive_date',
 | |
|         'item_last_updated', 'IA_URL',
 | |
|     );
 | |
|     my $re = qr{^hpr\d{4}\.(flac|mp3|ogg|opus|spx|wav)$};
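 | |
| 
 | |
|     #
 | |
|     # Sketch of the JSON structure this routine relies on (values invented for
 | |
|     # illustration; only the keys used below are shown):
 | |
|     #   {
 | |
|     #     "metadata": { "identifier": "hpr3775",
 | |
|     #                   "publicdate": "2023-01-31 00:00:00" },
 | |
|     #     "files": [ { "name": "hpr3775.flac", "source": "original",
 | |
|     #                  "format": "Flac" }, ... ],
 | |
|     #     "item_last_updated": 1675123456
 | |
|     #   }
 | |
|     #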
 | |
| 
 | |
|     #
 | |
|     # Read the entire JSON file into a buffer
 | |
|     #
 | |
|     try {
 | |
|         $filebuffer = read_text($file);
 | |
|     }
 | |
| 
 | |
|     catch {
 | |
|         die "Failed to read JSON file $file\n";
 | |
|     };
 | |
| 
 | |
|     #
 | |
|     # Parse the JSON from the buffer
 | |
|     #
 | |
|     my $json = JSON->new;
 | |
|     @jsonbuffer = $json->incr_parse($filebuffer);
 | |
| 
 | |
|     #print Dumper(\@jsonbuffer),"\n";
 | |
| 
 | |
|     #
 | |
|     # Check that this is the type of JSON we need
 | |
|     #
 | |
|     unless (@jsonbuffer) {
 | |
|         warn "Empty JSON?\n";
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Look at each top-level JSON object and check for missing components.
 | |
|     # It's possible for there to be missing objects, but only in very rare
 | |
|     # cases when a range has been requested yet there's a gap in that range
 | |
|     # (reserved slot perhaps). We accumulate the indices of invalid objects
 | |
|     # in @deletions and then delete those objects from @jsonbuffer.
 | |
|     #
 | |
|     $index = 0;
 | |
|     foreach my $obj (@jsonbuffer) {
 | |
|         unless ( exists( $obj->{metadata} )
 | |
|             && exists( $obj->{metadata}->{identifier} )
 | |
|             && exists( $obj->{files} ))
 | |
|         {
 | |
|             warn "Invalid JSON? Skipped object #$index\n";
 | |
|             push( @deletions, $index );
 | |
|             # Note: no 'next' here, so that $index is advanced for every
 | |
|             # object and the recorded indices stay correct
 | |
|         }
 | |
|         $index++;
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Delete any bad objects we found in the previous pass
 | |
|     # (splicing from the highest index downwards, so that the positions of
 | |
|     # entries yet to be removed are not shifted by earlier deletions)
 | |
|     #
 | |
|     for my $ind ( reverse(@deletions) ) {
 | |
|         splice( @jsonbuffer, $ind, 1 );
 | |
|         print STDERR "Deleted invalid item with index $ind\n" if $verbose;
 | |
|     }
 | |
|     print "\n" if $verbose;
 | |
| 
 | |
|     #
 | |
|     # Prepare to get the database row relating to an episode
 | |
|     #
 | |
|     $sth1 = $dbh->prepare("SELECT * FROM episodes WHERE id = ?");
 | |
| 
 | |
|     #
 | |
|     # Prepare to get the rows relating to assets for an episode
 | |
|     #
 | |
|     $sth2 = $dbh->prepare("SELECT * FROM assets WHERE episode_id = ?");
 | |
| 
 | |
|     $count = 0;
 | |
|     foreach my $obj (@jsonbuffer) {
 | |
|         if ( $obj->{metadata}->{identifier} =~ /hpr(\d{4})/ ) {
 | |
|             $epid = $1 + 0;
 | |
| 
 | |
|             #
 | |
|             # Skip the episodes in the range 1-620
 | |
|             #
 | |
|             #next if ( $epid <= 620 );
 | |
|             #
 | |
|             # 2021-06-27: We don't skip any more. Shows 1-620 have been
 | |
|             # uploaded in blocks, but we want them on the IA in the same
 | |
|             # format as all the others. Today we hit 620, working backwards
 | |
|             # from the earliest IA upload as a project to upload everything
 | |
|             # with show notes and assets. The existing database entries were
 | |
|             # changed to make the 'notes' field hold the text 'Block upload'
 | |
|             # followed by the block archive date and the URL. Then the rows
 | |
|             # between 1-620 are to have the new date and URL and these
 | |
|             # columns have been made NULL in preparation. See the Journal for
 | |
|             # more information.
 | |
|             #
 | |
| 
 | |
|             if ($verbose) {
 | |
|                 print "Processing ",  $obj->{metadata}->{identifier}, "\n";
 | |
|                 print "Publicdate: ", $obj->{metadata}->{publicdate}, "\n";
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # Process the files in the metadata
 | |
|             #
 | |
|             $orig_count = 0;
 | |
|             $source     = 0;
 | |
|             undef(@stash);
 | |
|             foreach my $f ( @{ $obj->{files} } ) {
 | |
|                 #
 | |
|                 # Skip anything with a 'Metadata' format or which has a source
 | |
|                 # of 'derivative' or 'metadata' or is the newly added
 | |
|                 # '__ia_thumb.jpg'
 | |
|                 #
 | |
|                 next
 | |
|                     if ( $f->{format} eq 'Metadata'
 | |
|                     || $f->{source} =~ /^(derivative|metadata)$/
 | |
|                     || $f->{name} eq '__ia_thumb.jpg'
 | |
|                     || $f->{name} =~ /^history\/files\// );
 | |
| 
 | |
|                 #
 | |
|                 # Count audio files which have been "derived" (by us) prior to upload,
 | |
|                 # and those that look like 'source' files (e.g.
 | |
|                 # hpr2472_source.flac). Anything else store in @stash because
 | |
|                 # it's probably an asset file.
 | |
|                 #
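 | |
|                 # For example (filenames invented for illustration):
 | |
|                 #   hpr2472.flac          -> counted in $orig_count
 | |
|                 #   hpr2472_source.flac   -> counted in $source
 | |
|                 #   hpr2472_image.png     -> pushed onto @stash as an asset
 | |
|                 #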
 | |
|                 if ( $f->{name} =~ $re ) {
 | |
|                     $orig_count++;
 | |
|                 }
 | |
|                 elsif ( $f->{name} =~ /^hpr\d{4}_source\..+$/ ) {
 | |
|                     $source++;
 | |
|                 }
 | |
|                 else {
 | |
|                     push(@stash,$f->{name});
 | |
|                 }
 | |
| 
 | |
|                 printf "%s %s\n", $f->{name}, $f->{source} if $verbose;
 | |
| 
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # Check the assets before updating the episode
 | |
|             #
 | |
|             @assets = collect_assets($dbh, $sth2, $epid);
 | |
|             _debug( $DEBUG >= 2, '@assets: ' . Dumper(\@assets));
 | |
|             _debug( $DEBUG >= 2, '@stash: ' . Dumper(\@stash));
 | |
| 
 | |
|             if (@assets || @stash) {
 | |
|                 @assets = sort(@assets);
 | |
|                 @stash = sort(@stash);
 | |
| 
 | |
|                 #
 | |
|                 # Smart match the @assets and @stash arrays to look for
 | |
|                 # differences. In @assets we have what the database holds, and
 | |
|                 # in @stash we have stored what we found in the JSON.
 | |
|                 # NOTE: Now that Smartmatch ('~~') is deprecated we use the
 | |
|                 # rather clever 'match::smart' with its strange |M| operator.
 | |
|                 # It needs arrays being matched to be offered as references
 | |
|                 # however.
 | |
|                 #
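 | |
|                 # A minimal sketch of the semantics relied on here:
 | |
|                 #   [ 'a.png', 'b.txt' ] |M| [ 'a.png', 'b.txt' ]   # true
 | |
|                 #   [ 'a.png', 'b.txt' ] |M| [ 'a.png', 'c.txt' ]   # false
 | |
|                 # i.e. the two arrays must match element by element.
 | |
|                 #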
 | |
|                 # unless ( @assets ~~ @stash ) {
 | |
|                 unless ( \@assets |M| \@stash ) {
 | |
|                     print "Difference between DB assets and IA\n";
 | |
|                     print "DB assets:\n    ",join("\n    ",@assets),"\n";
 | |
|                     print "IA assets:\n    ",join("\n    ",@stash),"\n";
 | |
|                 }
 | |
| 
 | |
|                 #
 | |
|                 # In live mode mark all assets belonging to this episode as
 | |
|                 # 'uploaded'.
 | |
|                 # TODO: Is this dangerous?
 | |
|                 #
 | |
|                 unless ($dry_run) {
 | |
|                     mark_assets_uploaded($dbh,$epid);
 | |
|                 }
 | |
|                 else {
 | |
|                     print "Update of 'assets' table skipped; dry run\n";
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # Prepare to use SQL::Abstract
 | |
|             #
 | |
|             my $sql = SQL::Abstract->new();
 | |
| 
 | |
|             #
 | |
|             # Get the current row for this episode for comparison to find what
 | |
|             # has been changed.
 | |
|             #
 | |
|             $rv = $sth1->execute($epid);
 | |
|             if ( $dbh->err ) {
 | |
|                 warn $dbh->errstr;
 | |
|             }
 | |
| 
 | |
|             $h = $sth1->fetchrow_hashref;
 | |
| 
 | |
|             #
 | |
|             # Store the changes
 | |
|             #
 | |
|             $cols{'uploaded'}          = 1;
 | |
|             $cols{'has_files'}         = ( scalar(@stash) > 0 ? 1 : 0 );
 | |
|             $cols{'with_files'}        = ( scalar(@stash) > 0 ? 1 : 0 );
 | |
|             $cols{'with_derived'}      = ( $orig_count >= 2   ? 1 : 0 );
 | |
|             $cols{'with_source'}       = ( $source > 0        ? 1 : 0 );
 | |
|             $cols{'archive_date'}      = $obj->{metadata}->{publicdate};
 | |
|             $cols{'item_last_updated'} = coalesce($obj->{item_last_updated},0);
 | |
|             $cols{'IA_URL'}            = sprintf( $template, $epid );
 | |
| 
 | |
|             #
 | |
|             # Look for differences, storing the updated values in %fieldvals
 | |
|             # and counting the changes.
 | |
|             #
 | |
|             $updates = 0;
 | |
|             undef(%fieldvals);
 | |
|             foreach my $fld (@flds) {
 | |
|                 if ( !equal( $h->{$fld}, $cols{$fld} ) ) {
 | |
|                     $updates++;
 | |
|                     $fieldvals{$fld} = $cols{$fld};
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             if ( $updates > 0 ) {
 | |
|                 #
 | |
|                 # Build the update statement containing changes. It will be:
 | |
|                 # "UPDATE episodes SET a = b, c = d, ... WHERE id = $epid"
 | |
|                 #
 | |
|                 $where{id} = $epid;
 | |
|                 my ( $stmt, @bindvals )
 | |
|                     = $sql->update( 'episodes', \%fieldvals, \%where );
 | |
| 
 | |
|                 #
 | |
|                 # Perform the updates (unless in dry-run mode)
 | |
|                 #
 | |
|                 unless ($dry_run) {
 | |
|                     $sth3 = $dbh->prepare($stmt);
 | |
|                     $sth3->execute(@bindvals);
 | |
|                     if ( $dbh->err ) {
 | |
|                         warn "Processing $epid\n", $dbh->errstr;
 | |
|                     }
 | |
|                     else {
 | |
|                         print "Updated\n" if $verbose;
 | |
|                         $count++;
 | |
|                     }
 | |
|                 }
 | |
|                 else {
 | |
|                     print "Dry run mode:\n";
 | |
|                     print "Statement: $stmt\n";
 | |
|                     print "Bindvals: ", join( ", ", @bindvals ), "\n";
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|         }
 | |
|         else {
 | |
|             printf "Unexpected identifier: %s\n", $obj->{metadata}->{identifier};
 | |
|         }
 | |
| 
 | |
|         print '-' x 80, "\n" if $verbose;
 | |
|     }
 | |
| 
 | |
|     print "Number of updates: $count\n";
 | |
|     $log->info("Number of updates from JSON: $count");
 | |
| 
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: collect_assets
 | |
| #      PURPOSE: Collects asset names for a given episode
 | |
| #   PARAMETERS: $dbh            Handle for the SQLite database
 | |
| #               $sth            Statement handle for the prepared query
 | |
| #               $epid           Episode id
 | |
| #      RETURNS: A list of the asset names
 | |
| #  DESCRIPTION: Just prepares a list of asset filenames by querying the SQLite
 | |
| #               'assets' table and returns it to the caller
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub collect_assets {
 | |
|     my ( $dbh, $sth, $epid ) = @_;
 | |
| 
 | |
|     my ( $rv, $h, @result );
 | |
| 
 | |
|     #
 | |
|     # Query the database
 | |
|     #
 | |
|     $rv = $sth->execute($epid);
 | |
|     if ( $dbh->err ) {
 | |
|         warn $dbh->errstr;
 | |
|     }
 | |
| 
 | |
|     while ($h = $sth->fetchrow_hashref) {
 | |
|         push(@result,$h->{filename});
 | |
|     }
 | |
| 
 | |
|     return @result;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: mark_assets_uploaded
 | |
| #      PURPOSE: Marks the assets relating to an episode as 'uploaded'
 | |
| #   PARAMETERS: $dbh            Handle for the SQLite database
 | |
| #               $epid           Episode id
 | |
| #      RETURNS: Number of updates
 | |
| #  DESCRIPTION: Mark any assets for a given episode as uploaded, if they are
 | |
| #               not so marked. Expected to be used during an update from JSON
 | |
| #               data collected from the IA.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub mark_assets_uploaded {
 | |
|     my ( $dbh, $epid ) = @_;
 | |
| 
 | |
|     my ($rows);
 | |
| 
 | |
|     $rows = $dbh->do(
 | |
|         q{UPDATE assets SET uploaded = 1
 | |
|           WHERE episode_id = ? AND uploaded = 0},
 | |
|         undef,
 | |
|         $epid,
 | |
|     ) or die $dbh->errstr;
 | |
| 
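 | |
|     # DBI's do() returns the string '0E0' (true but numerically zero) when no
 | |
|     # rows are changed; adding 0 normalises the return value to a plain number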
 | |
|     $rows += 0;
 | |
| 
 | |
|     return $rows;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: scan_dir
 | |
| #      PURPOSE: Scans a file from the HPR server generated with a simple
 | |
| #               'find' command which lists all the files and directories under
 | |
| #               the ~/www/eps/ directory.
 | |
| #   PARAMETERS: $dbh            Handle for the SQLite database
 | |
| #               $file           Name of file of directory data
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Under development
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub scan_dir {
 | |
|     my ( $dbh, $file ) = @_;
 | |
| 
 | |
|     my ( %dirtree, $ptr, %shows, $key, $id );
 | |
| 
 | |
|     #
 | |
|     # Open the file of paths
 | |
|     #
 | |
|     open( my $fh, "<:encoding(UTF-8)", $file )
 | |
|         or die "Unable to open '$file'\n";
 | |
| 
 | |
|     #
 | |
|     # Loop through all paths. If it's one of the audio files then skip it
 | |
|     #
 | |
|     while ( my $line = <$fh> ) {
 | |
|         chomp($line);
 | |
| 
 | |
|         next if ( $line =~ /(flac|mp3|ogg|opus|spx|wav)$/ );
 | |
| 
 | |
|         #
 | |
|         # Point into the tree hash and add the necessary nodes from the split
 | |
|         # path. The result is a hierarchical hash structure where the 'leaf'
 | |
|         # node has an empty anonymous hash as its value.
 | |
|         #
 | |
|         # %dirtree = {
 | |
|         #     'www' => {
 | |
|         #        'eps' => {
 | |
|         #            'hpr2356' => {
 | |
|         #                 'index.html' => {},
 | |
|         #                 'fix-ssh-on-pi.bash.txt' => {}
 | |
|         #             },
 | |
|         #         },
 | |
|         #     },
 | |
|         # }
 | |
|         #
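 | |
|         # For a line such as 'www/eps/hpr2356/fix-ssh-on-pi.bash.txt' (as in
|         # the example above) the statement below autovivifies each level in
 | |
|         # turn, leaving $dirtree{www}{eps}{hpr2356}{'fix-ssh-on-pi.bash.txt'}
 | |
|         # set to an empty hashref.
 | |
|         #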
 | |
|         $ptr = \%dirtree;
 | |
|         $ptr = $ptr->{$_} //= {} for split( /\//, $line );
 | |
| 
 | |
|     }
 | |
| 
 | |
|     close($fh);
 | |
| 
 | |
|     #print Dumper(\%dirtree),"\n";
 | |
| 
 | |
|     #
 | |
|     # Build a hash indexed by IA identifier containing an array per value with
 | |
|     # the files relating to the show therein
 | |
|     #
 | |
|     $ptr = \%dirtree;
 | |
|     # TODO if there are multiple items at a level which one to test
 | |
|     $key = ( keys(%$ptr) )[0];
 | |
|     #$key = first { defined($_) } keys(%$ptr);
 | |
|     until ( $key =~ /^hpr[0-9]{4}/ ) {
 | |
|         print "$key\n";
 | |
|         $ptr = $ptr->{$key};
 | |
|         $key = ( keys(%$ptr) )[0];
 | |
|     }
 | |
|     #print join(", ",keys(%$ptr));
 | |
| 
 | |
|     # TODO Doesn't deal with sub-directories
 | |
|     foreach my $k ( keys(%$ptr) ) {
 | |
|         if ( ($id) = ( $k =~ /^(hpr[0-9]{4})/ ) ) {
 | |
|             unless ( defined( $shows{$id} ) ) {
 | |
|                 $shows{$id} = [];
 | |
|             }
 | |
|             push( @{ $shows{$id} }, $k );
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     print Dumper( \%shows ), "\n";
 | |
| 
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: find_assets
 | |
| #      PURPOSE: An alternative way to look for assets; connect to the server
 | |
| #               with SSH and use a 'find' command for matching files and
 | |
| #               directories.
 | |
| #   PARAMETERS: $ssh            Net::OpenSSH object previously set up to
 | |
| #                               connect to the server
 | |
| #               $episode        episode number we're dealing with
 | |
| #               $rempath        path to search on the server
 | |
| #               $rfiles         hashref to receive the files (assets) found
 | |
| #      RETURNS: Number of files found
 | |
| #  DESCRIPTION: Runs a remote 'find' on the server via SSH, looking for files
 | |
| #               and directories whose names contain the episode number.
 | |
| #               Regular files other than the main audio (.mp3/.ogg/.spx) are
 | |
| #               recorded in the hash referenced by $rfiles, each with its
 | |
| #               filename (relative to $rempath) and its URL on the HPR server.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub find_assets {
 | |
|     my ($ssh, $episode, $rempath, $rfiles) = @_;
 | |
| 
 | |
|     my $urlbase = 'https://hackerpublicradio.org/eps/';
 | |
|     my (@result, @row, $URL, $file, $assetcount);
 | |
| 
 | |
|     $rempath .= '/' unless ($rempath =~ /\/$/);
 | |
| 
 | |
|     @result = $ssh->capture({ timeout => 10 },
 | |
|         "find $rempath -regextype egrep -regex '.*/hpr${episode}.*' -printf '%y,%p,%s\n'");
 | |
|     $ssh->error and warn "remote 'find' command failed: " . $ssh->error;
 | |
| 
 | |
| #       d,/home/hpr/www/eps/hpr3775,85
 | |
| #       f,/home/hpr/www/eps/hpr3775/hpr3775_source.wav,75006546
 | |
| #       f,/home/hpr/www/eps/hpr3775/hpr3775.vtt,13127
 | |
| #       f,/home/hpr/www/eps/hpr3775/hpr3775.srt,13830
 | |
| #       f,/home/hpr/www/eps/hpr3775/hpr3775.txt,11119
 | |
| #       f,/home/hpr/www/eps/hpr3775.mp3,7449559
 | |
| #       f,/home/hpr/www/eps/hpr3775.ogg,10631998
 | |
| #       f,/home/hpr/www/eps/hpr3775.spx,3516736
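 | |
| #
 | |
| #       Given output like the sample above (with $rempath set to
 | |
| #       '/home/hpr/www/eps/'), the loop below keeps only the extra files,
 | |
| #       for example recording
 | |
| #           filename => 'hpr3775/hpr3775.txt',
 | |
| #           URL      => 'https://hackerpublicradio.org/eps/hpr3775/hpr3775.txt'
 | |
| #       and skips the directory entry and the mp3/ogg/spx audio files.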
 | |
| 
 | |
|     $assetcount = 0;
 | |
|     $rfiles->{$episode} = [];
 | |
|     for my $file (@result) {
 | |
|         @row = split( ',', $file );
 | |
|         if ( $row[0] eq 'f' ) {
 | |
|             unless ( $row[1] =~ /$episode\.(mp3|ogg|spx)$/ ) {
 | |
|                 ( $URL  = $row[1] ) =~ s/$rempath/$urlbase/;
 | |
|                 ( $file = $row[1] ) =~ s/$rempath//;
 | |
| 
 | |
|                 push(
 | |
|                     @{ $rfiles->{$episode} },
 | |
|                         { filename => $file, URL => $URL }
 | |
|                 );
 | |
| 
 | |
|                 $assetcount++;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     _debug( $DEBUG >= 2, "Results from 'find':\n" . Dumper( \@result ) );
 | |
| 
 | |
|     #
 | |
|     # Return the asset count
 | |
|     #
 | |
|     return $assetcount;
 | |
| 
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: find_links
 | |
| #      PURPOSE: Parses the HTML in a string for links so that a list of assets
 | |
| #               relating to this show can be made
 | |
| #   PARAMETERS: $episode        episode number we're dealing with
 | |
| #               $html           string containing HTML
 | |
| #               $rlinks         hashref to receive the links found
 | |
| #      RETURNS: Number of links found
 | |
| #  DESCRIPTION: Given HTML from the main notes or a subsidiary file the
 | |
| #               function parses this looking for links in 'a' or 'img' tags.
 | |
| #               Links are standardised, making them absolute if relative and
 | |
| #               removing any 'fragment'. The links need to be to files on the
 | |
| #               HPR website to be of interest. If so the filename part is
 | |
| #               extracted. If it follows the format 'hpr9999' then it's
 | |
| #               checked to see if it's for the current show. If not it's
 | |
| #               ignored. If the filename ends with a '/' then it's assumed
 | |
| #               it's shorthand for 'index.html' so this name is appended. Then
 | |
| #               the link and filename are stashed in the hash referenced by
 | |
| #               $rlinks. If the filename ends with '.html' then we need to
 | |
| #               parse it in turn, so we get the contents of the link and
 | |
| #               recurse to parse it. We return the number of HPR links found
 | |
| #               in the pass through the HTML.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub find_links {
 | |
|     my ( $episode, $html, $rlinks ) = @_;
 | |
| 
 | |
|     my ($tree, $epstr, $linkre,    $re2, $filepath,
 | |
|         $uri,  $slink, $linkcount, $content
 | |
|     );
 | |
| 
 | |
|     _debug( $DEBUG >= 3, "find_links enter\n" );
 | |
| 
 | |
|     #
 | |
|     # Create a tree object
 | |
|     #
 | |
|     $tree = HTML::TreeBuilder->new;
 | |
|     $tree->ignore_unknown(0);
 | |
|     $tree->no_expand_entities(1);
 | |
|     $tree->p_strict(1);
 | |
|     $tree->store_comments(1);
 | |
|     $tree->warn(1);
 | |
| 
 | |
|     $tree->parse_content($html)
 | |
|         or die "HTML::TreeBuilder failed to parse notes: $!\n";
 | |
| 
 | |
|     my $baseURL = "https://hackerpublicradio.org";
 | |
| 
 | |
|     $epstr = sprintf( "hpr%04d", $episode );
 | |
|     $linkre = qr{
 | |
|         ^https?://
 | |
|         (?:www.)?
 | |
|         (?:hacker|hobby)publicradio.org/eps/
 | |
|         (.+)$
 | |
|     }x;
 | |
|     #(?:(hpr$epstr/.+)|(hpr$epstr.+)|(.+))$
 | |
| 
 | |
|     #
 | |
|     # Counting new links found and stashed
 | |
|     #
 | |
|     $linkcount = 0;
 | |
| 
 | |
|     #
 | |
|     # Scan for links
 | |
|     #
 | |
|     for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
 | |
|         my ( $link, $element, $attr, $tag ) = @$_;
 | |
| 
 | |
|         #
 | |
|         # Standardise the link (expands relative URLs, removes any fragment).
 | |
|         # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
 | |
|         # URIs are removed.
 | |
|         #
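 | |
|         # For instance (illustrative), a link written in the notes as
 | |
|         #   ../eps/hpr2356/index.html#part2
 | |
|         # resolves against $baseURL to
 | |
|         #   https://hackerpublicradio.org/eps/hpr2356/index.html
 | |
|         # with the '#part2' fragment dropped, since $uri->opaque excludes the
 | |
|         # fragment.
 | |
|         #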
 | |
|         local $URI::ABS_REMOTE_LEADING_DOTS = 1;
 | |
|         $uri = URI->new_abs( $link, $baseURL );
 | |
|         $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );
 | |
| 
 | |
|         #
 | |
|         # Is it an HPR link?
 | |
|         #
 | |
|         if ( $slink =~ $linkre ) {
 | |
|             #
 | |
|             # The URL we found might be a link into an HTML file with an
 | |
|             # '#anchor' component ("fragment"). Save the last bracketed match,
 | |
|             # without any 'fragment' if there is one to get a clean filename
 | |
|             # or path.
 | |
|             #
 | |
|             ( $filepath = "$+" ) =~ s/#.*$//;
 | |
| 
 | |
|             _debug( $DEBUG >= 3, "Link:      $slink\n" );
 | |
|             _debug( $DEBUG >= 3, "File path: $filepath\n" );
 | |
| 
 | |
|             #
 | |
|             # Does this file path begin with an 'hpr' prefix? If so is it the
 | |
|             # show id? If not we don't want to process it.
 | |
|             #
 | |
|             if ( $filepath =~ /^(hpr[0-9]{1,4})/ ) {
 | |
|                 if ( $1 ne $epstr ) {
 | |
|                     _debug( $DEBUG >= 3, "Ignored $slink\n" );
 | |
|                     next;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # The path and URL might end with a slash which means the URL is
 | |
|             # relying on the Web server to fill in the filename as
 | |
|             # 'index.html'. We have to make this explicit.
 | |
|             #
 | |
|             if ( $slink =~ /\/$/ ) {
 | |
|                 $slink    .= 'index.html';
 | |
|                 $filepath .= 'index.html';
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # Initialise this hash element if needed
 | |
|             #
 | |
|             unless ( exists( $rlinks->{$episode} ) ) {
 | |
|                 $rlinks->{$episode} = [];
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # Stash this filename if it's not already stashed, and if it's
 | |
|             # HTML get the link and recurse
 | |
|             #
 | |
|             unless (
 | |
|                 any { $_->{filename} eq $filepath } @{ $rlinks->{$episode} }
 | |
|                 )
 | |
|             {
 | |
|                 _debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" );
 | |
| 
 | |
|                 push(
 | |
|                     @{ $rlinks->{$episode} },
 | |
|                         { filename => $filepath, URL => $slink }
 | |
|                 );
 | |
|                 $linkcount++;
 | |
| 
 | |
|                 #
 | |
|                 # An HTML file has to be investigated
 | |
|                 #
 | |
|                 if ( $filepath =~ /\.html$/ ) {
 | |
|                     $content = get($slink);
 | |
|                     unless ( defined($content) ) {
 | |
|                         carp "Link $slink returned nothing\n";
 | |
|                     }
 | |
|                     else {
 | |
|                         $linkcount
 | |
|                             += find_links( $episode, $content, $rlinks );
 | |
|                     }
 | |
|                 }
 | |
| 
 | |
|             }
 | |
| 
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     _debug( $DEBUG >= 3, "find_links exiting with $linkcount links\n" );
 | |
| 
 | |
|     #
 | |
|     # Return the link count
 | |
|     #
 | |
|     return $linkcount;
 | |
| 
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: count_rows
 | |
| #      PURPOSE: Counts the rows in a given SQLite table
 | |
| #   PARAMETERS: $dbh            Handle for the SQLite database
 | |
| #               $table          Name of table
 | |
| #      RETURNS: Number of rows
 | |
| #  DESCRIPTION: Builds SQL from a template including the selected $table.
 | |
| #               Returns the number of rows once the query has been run.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub count_rows {
 | |
|     my ( $dbh, $table ) = @_;
 | |
| 
 | |
|     my ( $sql, $sth, $h, $count );
 | |
| 
 | |
|     #
 | |
|     # Count rows in the selected SQLite table
 | |
|     #
 | |
|     $sql = "SELECT count(*) AS rowcount FROM $table";
 | |
|     $sth = $dbh->prepare($sql);
 | |
|     $sth->execute();
 | |
|     if ( $dbh->err ) {
 | |
|         croak $dbh->errstr;
 | |
|     }
 | |
| 
 | |
|     $count = 0;
 | |
|     if ($h = $sth->fetchrow_hashref) {
 | |
|         $count = $h->{rowcount};
 | |
|     }
 | |
| 
 | |
|     return $count;
 | |
| 
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: coalesce
 | |
| #      PURPOSE: To find the first defined argument and return it
 | |
| #   PARAMETERS: Arbitrary number of arguments
 | |
| #      RETURNS: The first defined argument or undef if there are none
 | |
| #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
 | |
| #               returned when doing so might be a problem.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub coalesce {
 | |
|     foreach (@_) {
 | |
|         return $_ if defined($_);
 | |
|     }
 | |
|     return undef;    ## no critic
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: assets_match
 | |
| #      PURPOSE: To compare found assets and those in the database
 | |
| #   PARAMETERS: $found_obj      an asset object (hashref) found by parsing show
 | |
| #                               notes
 | |
| #               $db_obj         an asset object (hashref) from the database
 | |
| #      RETURNS: Value 1 or 2 if they match, otherwise zero
 | |
| #  DESCRIPTION: Compares the 'filename' and 'URL' fields of the two asset
 | |
| #               hashrefs: returns 2 if both fields match, 1 if only one of
 | |
| #               them matches, and 0 otherwise (or if either argument is
 | |
| #               undefined).
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub assets_match {
 | |
|     my ( $found_obj, $db_obj ) = @_;
 | |
| 
 | |
|     return 0 if ( !defined($found_obj) || !defined($db_obj) );
 | |
| 
 | |
|     my ( $t1, $t2 ) = (
 | |
|         ( $found_obj->{filename} eq $db_obj->{filename} ),
 | |
|         ( $found_obj->{URL} eq $db_obj->{URL} )
 | |
|     );
 | |
| 
 | |
|     return 2 if ( $t1 && $t2 );
 | |
|     return 1 if ( $t1 || $t2 );
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: equal
 | |
| #      PURPOSE: Compare two strings even if undefined
 | |
| #   PARAMETERS: $s1             The first string
 | |
| #               $s2             The second string
 | |
| #      RETURNS: True if both strings are undefined, false if only one of them
 | |
| #               is defined, otherwise the result of comparing them with 'eq'.
 | |
| #  DESCRIPTION: Works on the principle that two undefined strings are equal,
 | |
| #               a defined and an undefined string are not, and otherwise they
 | |
| #               are equal if they are equal!
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub equal {
 | |
|     my ( $s1, $s2 ) = @_;
 | |
| 
 | |
|     return 1 if ( !defined($s1) && !defined($s2) );
 | |
|     return 0 if ( !defined($s1) || !defined($s2) );
 | |
| 
 | |
|     return ( $s1 eq $s2 );
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: _debug
 | |
| #      PURPOSE: Prints debug reports
 | |
| #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print
 | |
| #               $message        Message to print
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing
 | |
| #               newline and then adds one in the 'print' so the caller doesn't
 | |
| #               have to bother. Prepends the message with 'D> ' to show it's
 | |
| #               a debug message.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub _debug {
 | |
|     my ( $active, $message ) = @_;
 | |
| 
 | |
|     chomp($message);
 | |
|     print "D> $message\n" if $active;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Usage
 | |
| #      PURPOSE: Displays a usage message and exit
 | |
| #   PARAMETERS: None
 | |
| #      RETURNS: To command line level with exit value 1
 | |
| #  DESCRIPTION: Builds the usage message using global values
 | |
| #       THROWS: no exceptions
 | |
| #     COMMENTS: none
 | |
| #     SEE ALSO: n/a
 | |
| #===============================================================================
 | |
| #sub Usage {
 | |
| #    print STDERR <<EOD;
 | |
| #
 | |
| #Usage: $PROG [options] directory
 | |
| #
 | |
| #$PROG v$VERSION
 | |
| #
 | |
| #    -help               Display this information
 | |
| #    -[no]dry-run        Enable/disable dry run mode (default off)
 | |
| #    -[no]verbose        Enable/disable verbose mode (default off)
 | |
| #    -debug=N            Set the debug level
 | |
| #    -mode=MODE          Selects the mode from the following list:
 | |
| #                        - initialise: rebuild the SQLite database from MySQL
 | |
| #                          (deletes any data already uploaded)
 | |
| #                        - update: updates the SQLite database from updates
 | |
| #                          found in the MySQL database
 | |
| #                        - json: load the 'archive_date' field from a JSON
 | |
| #                          file. Also sets the 'uploaded' flag and populates
 | |
| #                          the 'IA_URL' field
 | |
| #                          ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
 | |
| #                        - scandir: scans a file captured from the HPR server
 | |
| #                          by running a 'find' command on the 'eps' directory
 | |
| #                          where all the show files reside. This file is
 | |
| #                          scanned to determine which shows have files other
 | |
| #                          than the audio so we can keep a tally of which shows
 | |
| #                          have had these uploaded to the IA
 | |
| #                          ** Not fully implemented **
 | |
| #                          ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
 | |
| #    -episode=N1 [-episode=N2 ...]
 | |
| #                        In -mode=update defines one or more single episodes to
 | |
| #                        process otherwise, if this option is not given, all
 | |
| #                        new episodes in the database are processed. The
 | |
| #                        episodes are expected to have been already added if
 | |
| #                        the option is used, and the 'title' and 'summary'
 | |
| #                        fields and the assets (if any) are to be processed.
 | |
| #    -range=N1 [-range=N2]
 | |
| #                        An alternative way of defining existing shows to be
 | |
| #                        processed in -mode=update. The option can only be
 | |
| #                        repeated twice. The smaller number defines the start
 | |
| #                        of the range, and the larger one the end of the range.
 | |
| #                        So -range=834 -range=840 defines the range 834-840. If
 | |
| #                        one number is given (or two the same) this means the
 | |
| #                        same as defining one episode. The episodes are
 | |
| #                        expected to have been already added and the 'title'
 | |
| #                        and 'summary' fields and the assets (if any) are to be
 | |
| #                        processed.
 | |
| #    -json=FILE          Defines the file to use for -mode=json
 | |
| #    -scandir=FILE       Defines a file to use for -mode=scandir
 | |
| #    -dbconfig=FILE      Defines an alternative MySQL configuration file
 | |
| #
 | |
| #EOD
 | |
| #    exit(1);
 | |
| #}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Options
 | |
| #      PURPOSE: Processes command-line options
 | |
| #   PARAMETERS: $optref     Hash reference to hold the options
 | |
| #      RETURNS: Undef
 | |
| #  DESCRIPTION: Process the options we want to offer. See the documentation
 | |
| #               for details
 | |
| #       THROWS: no exceptions
 | |
| #     COMMENTS: none
 | |
| #     SEE ALSO: n/a
 | |
| #===============================================================================
 | |
| sub Options {
 | |
|     my ($optref) = @_;
 | |
| 
 | |
|     my @options = (
 | |
|         "help",       "documentation|man",
 | |
|         "debug=i",    "mode=s",
 | |
|         "json=s",     "scandir=s",
 | |
|         "episode=i@", "range=i@",
 | |
|         "dry-run!",   "verbose!",
 | |
|         "dbconfig=s",
 | |
|     );
 | |
| 
 | |
|     if ( !GetOptions( $optref, @options ) ) {
 | |
|         pod2usage(
 | |
|             -msg     => "$PROG version $VERSION\n",
 | |
|             -exitval => 1,
 | |
|             -verbose => 0
 | |
|         );
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| __END__
 | |
| 
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #  Application Documentation
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #{{{
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| upload_manager - manages a SQLite database of details of shows on archive.org
 | |
| 
 | |
| =head1 VERSION
 | |
| 
 | |
| This documentation refers to upload_manager version 0.2.19
 | |
| 
 | |
| =head1 USAGE
 | |
| 
 | |
|     ./upload_manager [-help] [-documentation] [-[no]dry-run] [-[no]verbose]
 | |
|         [-debug=N] -mode=MODE [-episode=N1 [-episode=N2 ...]] [-range=START
 | |
|         [-range=END]] [-json=FILE] [-scandir=FILE] [-dbconfig=FILE]
 | |
| 
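 | |
| Some illustrative invocations (the JSON file name is an example only):
 | |
| 
 | |
|     # Report what would change for show 3775 without touching the database
 | |
|     ./upload_manager -mode=update -episode=3775 -dry-run -verbose
 | |
| 
 | |
|     # Apply metadata previously collected from archive.org into a JSON file
 | |
|     ./upload_manager -mode=json -json=metadata.json -verbose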
 | |
| 
 | |
| =head1 OPTIONS
 | |
| 
 | |
| =over 8
 | |
| 
 | |
| =item B<-help>
 | |
| 
 | |
| Display a brief summary of how to use this script.
 | |
| 
 | |
| =item B<-documentation> or B<-man>
 | |
| 
 | |
| Display the entirety of the internal documentation in the form of a manual
 | |
| page.
 | |
| 
 | |
| =item B<-[no]dry-run>
 | |
| 
 | |
| Enable/disable dry run mode (default off)
 | |
| 
 | |
| =item B<-[no]verbose>
 | |
| 
 | |
| Enable/disable verbose mode (default off)
 | |
| 
 | |
| =item B<-debug=N>
 | |
| 
 | |
| Sets the debug level:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<0>
 | |
| 
 | |
| This is the default value, and causes no debug output to be generated.
 | |
| 
 | |
| =item B<1>
 | |
| 
 | |
| TBA
 | |
| 
 | |
| =item B<2>
 | |
| 
 | |
| Produces quite a lot of asset-related output. All of the output from the lower debug levels is
 | |
| displayed as well as:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item .
 | |
| 
 | |
| Contents of the %assets hash. This is generated when scanning a show in the
 | |
| MySQL database in order to find any associated assets.
 | |
| 
 | |
| =item .
 | |
| 
 | |
| The episode number being checked and the number of assets found while
 | |
| processing assets. A dump of the assets from the SQLite database is shown as
 | |
| well as the count of assets found there.
 | |
| 
 | |
| =item .
 | |
| 
 | |
| If there are assets related to a show and there are already some in the SQLite
 | |
| database, a comparison is made to see if any updates are needed. These
 | |
| differences are listed.
 | |
| 
 | |
| =item .
 | |
| 
 | |
| If new assets are being added, these are reported from the hash
 | |
| %assets_by_file.
 | |
| 
 | |
| =item .
 | |
| 
 | |
| Assets are added using a multi-row INSERT statement, and details of what is to
 | |
| be inserted are shown in the array @multi_insert.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =item B<3>
 | |
| 
 | |
| Produces the most output. All of the output from the lower debug levels is
 | |
| displayed as well as:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item .
 | |
| 
 | |
| Contents of the @episodes array.  This is generated by using the B<-episode=N>
 | |
| option once or many times. It is shown before and after the removal of
 | |
| duplicates.
 | |
| 
 | |
| =item .
 | |
| 
 | |
| Contents of the @range array.  This is generated by using the B<-range=N>
 | |
| option once or twice.
 | |
| 
 | |
| 
 | |
| =back
 | |
| 
 | |
| =back
 | |
| 
 | |
| =item B<-mode=MODE>
 | |
| 
 | |
| Selects the mode of operation of the script from the following list:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<initialise>
 | |
| 
 | |
| Rebuilds the SQLite database from the MySQL data. Use with caution, though
 | |
| the script will not initialise tables which contain data.
 | |
| 
 | |
| =item B<update>
 | |
| 
 | |
| Updates the SQLite database from updates found in the MySQL database. Exactly
 | |
| how this is done is controlled by other options. See below for details.
 | |
| 
 | |
| =item B<json>
 | |
| 
 | |
| Load the 'archive_date' field from a JSON file. Also sets the 'uploaded' flag
 | |
| and populates the 'IA_URL' and 'item_last_updated' fields.
 | |
| 
 | |
| =item B<scandir>
 | |
| 
 | |
| Scans a file captured from the HPR server by running a 'find' command on the
 | |
| 'eps' directory where all the show files reside. This file is scanned to
 | |
| determine which shows have files other than the audio so we can keep a tally
 | |
| of which shows have had these uploaded to the IA
 | |
| 
 | |
| ** Not fully implemented; may be phased out **
 | |
| 
 | |
| =back
 | |
| 
 | |
| =item B<-episode=N1 [-episode=N2 ...]>
 | |
| 
 | |
| In B<-mode=update> defines one or more single episodes to process otherwise, if
 | |
| this option is not given, all new episodes in the database are processed. The
 | |
| episodes are expected to have been already added if this option is used, and
 | |
| the 'title' and 'summary' fields and the assets (if any) are to be processed.
 | |
| 
 | |
| =item B<-range=N1 [-range=N2]>
 | |
| 
 | |
| An alternative way of defining existing shows to be processed in
 | |
| B<-mode=update>.  The option can only be repeated twice. The smaller number
 | |
| defines the start of the range, and the larger one the end of the range.  So
 | |
| B<-range=834 -range=840> defines the range 834-840. If one number is given (or
 | |
| two the same) this means the same as defining one episode. The episodes are
 | |
| expected to have been already added and the 'title' and 'summary' fields and
 | |
| the assets (if any) are to be processed.
 | |
| 
 | |
| =item B<-json=FILE>
 | |
| 
 | |
| Defines the file to use for B<-mode=json>. The contents of the file are
 | |
| expected to be those produced by running B<collect_show_data>.
 | |
| 
 | |
| =item B<-scandir=FILE>
 | |
| 
 | |
| Defines a file to use for B<-mode=scandir>. This feature is not fully
 | |
| implemented at the moment.
 | |
| 
 | |
| =item B<-dbconfig=FILE>
 | |
| 
 | |
| Defines an alternative MySQL configuration file
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| B<upload_manager> maintains a SQLite database recording which Hacker Public
 | |
| Radio shows, and which of their associated asset files, have been uploaded to
 | |
| archive.org. Depending on B<-mode> it initialises that database, updates
 | |
| selected episodes (comparing the 'title' and 'summary' fields and the assets
 | |
| against the MySQL database and the files on the HPR server), or loads upload
 | |
| state from JSON metadata collected from archive.org with the B<ia> tool.
 | |
| 
 | |
| 
 | |
| =head1 DIAGNOSTICS
 | |
| 
 | |
| A list of every error and warning message that the application can generate
 | |
| (even the ones that will "never happen"), with a full explanation of each
 | |
| problem, one or more likely causes, and any suggested remedies. If the
 | |
| application generates exit status codes (e.g. under Unix) then list the exit
 | |
| status associated with each error.
 | |
| 
 | |
| 
 | |
| =head1 CONFIGURATION AND ENVIRONMENT
 | |
| 
 | |
| A full explanation of any configuration system(s) used by the application,
 | |
| including the names and locations of any configuration files, and the
 | |
| meaning of any environment variables or properties that can be set. These
 | |
| descriptions must also include details of any configuration language used
 | |
| 
 | |
| 
 | |
| =head1 DEPENDENCIES
 | |
| 
 | |
| A list of all the other modules that this module relies upon, including any
 | |
| restrictions on versions, and an indication whether these required modules are
 | |
| part of the standard Perl distribution, part of the module's distribution,
 | |
| or must be installed separately.
 | |
| 
 | |
| 
 | |
| =head1 INCOMPATIBILITIES
 | |
| 
 | |
| A list of any modules that this module cannot be used in conjunction with.
 | |
| This may be due to name conflicts in the interface, or competition for
 | |
| system or program resources, or due to internal limitations of Perl
 | |
| (for example, many modules that use source code filters are mutually
 | |
| incompatible).
 | |
| 
 | |
| 
 | |
| =head1 BUGS AND LIMITATIONS
 | |
| 
 | |
| A list of known problems with the module, together with some indication
 | |
| whether they are likely to be fixed in an upcoming release.
 | |
| 
 | |
| Also a list of restrictions on the features the module does provide:
 | |
| data types that cannot be handled, performance issues and the circumstances
 | |
| in which they may arise, practical limitations on the size of data sets,
 | |
| special cases that are not (yet) handled, etc.
 | |
| 
 | |
| The initial template usually just has:
 | |
| 
 | |
| There are no known bugs in this module.
 | |
| Please report problems to <Maintainer name(s)>  (<contact address>)
 | |
| Patches are welcome.
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| <Author name(s)>  (<contact address>)
 | |
| 
 | |
| 
 | |
| =head1 LICENCE AND COPYRIGHT
 | |
| 
 | |
| Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
 | |
| 
 | |
| Followed by whatever licence you wish to release it under.
 | |
| For Perl code that is often just:
 | |
| 
 | |
| This module is free software; you can redistribute it and/or
 | |
| modify it under the same terms as Perl itself. See perldoc perlartistic.
 | |
| 
 | |
| This program is distributed in the hope that it will be useful,
 | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| #}}}
 | |
| 
 | |
| # [zo to open fold, zc to close]
 | |
| 
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 | |
| 
 |