forked from HPR/hpr-tools
50edeccc88
FAQ/FAQ.mkd, FAQ/Makefile: this version of the FAQ is now out of date and probably should be deleted. InternetArchive/repair_item: script to upload missing shows after tie out errors during the normal upload; still under development. InternetArchive/update_state: script to update show state in the 'reservations' table in the database. Uses the CMS interface. Link_Checker/scan_links: under development. Not currently usable. Miscellaneous/fix_tags: audio metadata manipulation script. Recently added to this repo for convenience. Updates for 'experimental::try', the official Perl try/catch. PostgreSQL_Database/add_hosts_to_show, PostgreSQL_Database/hpr_schema_2.pgsql, PostgreSQL_Database/nuke_n_pave.sh: an old experimental Pg database to take over from the previous MySQL version (from before 2023). Kept for reference; never implemented.
619 lines
19 KiB
Perl
Executable File
619 lines
19 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
#===============================================================================
|
|
#
|
|
# FILE: scan_links
|
|
#
|
|
# USAGE: ./scan_links [-help] [-[no]verbose] [-config=FILE]
|
|
#
|
|
# DESCRIPTION: Scan the notes in the database for links. Test each link to
|
|
# see if it's available. Keep a record of the date, show, link
|
|
# and result. If a link fails more than N tests take action.
|
|
# Possible actions are:
|
|
# - report the problem
|
|
# - look for the link on archive.org
|
|
# - modify the notes
|
|
#
|
|
# OPTIONS: ---
|
|
# REQUIREMENTS: ---
|
|
# BUGS: ---
|
|
# NOTES: ---
|
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
|
# VERSION: 0.0.2
|
|
# CREATED: 2017-04-02 14:09:02
|
|
# REVISION: 2022-06-02 23:18:13
|
|
#
|
|
#===============================================================================
|
|
|
|
use 5.010;
|
|
use strict;
|
|
use warnings;
|
|
use utf8;
|
|
use experimental 'smartmatch';
|
|
|
|
use Carp;
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
|
|
use Config::General;
|
|
use File::Slurper qw{ read_text read_lines };
|
|
use Try::Tiny;
|
|
|
|
use HTML::TreeBuilder 5 -weak;
|
|
use HTML::Entities;
|
|
use List::Util qw{ min max };
|
|
use List::MoreUtils qw{ any };
|
|
use LWP::Simple;
|
|
|
|
use DBI;
|
|
use SQL::Abstract;
|
|
use SQL::Abstract::Plugin::InsertMulti;
|
|
|
|
use Data::Dumper;
|
|
|
|
#
|
|
# Version number (manually incremented)
|
|
#
|
|
our $VERSION = '0.0.2';
|
|
|
|
#
|
|
# Script and directory names
|
|
#
|
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
|
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
|
$DIR = '.' unless $DIR;
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Declarations
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# Constants and other declarations
|
|
#
|
|
my $basedir = "$ENV{HOME}/HPR/Link_Checker";
|
|
my $configfile = "$basedir/.$PROG.cfg";
|
|
my $db1configfile = "$basedir/.hpr_db.cfg";
|
|
my $database2 = "$basedir/ia.db";
|
|
|
|
my ( $dbh1, $dbh2, $sql1, $sth1, $h1 );
|
|
|
|
#
|
|
# Enable Unicode mode
|
|
#
|
|
binmode STDOUT, ":encoding(UTF-8)";
|
|
binmode STDERR, ":encoding(UTF-8)";
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Options and arguments
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# Option defaults
|
|
#
|
|
my $DEFDEBUG = 0;
|
|
my $DEFFROM = 1;
|
|
my $DEFCOUNT = 10;
|
|
|
|
#
|
|
# Process options
|
|
#
|
|
my %options;
|
|
Options( \%options );
|
|
|
|
#
|
|
# Default help is minimal
|
|
#
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
|
|
if ( $options{'help'} );
|
|
|
|
#
|
|
# The -documentation or -man option shows the full POD documentation through
|
|
# a pager for convenience
|
|
#
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
|
|
if ( $options{'documentation'} );
|
|
|
|
#
|
|
# Collect options
|
|
#
|
|
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
|
|
|
|
my $db1cfgfile
|
|
= ( defined( $options{dbconfig} ) ? $options{dbconfig} : $db1configfile );
|
|
|
|
my $cfgfile
|
|
= ( defined( $options{config} ) ? $options{config} : $configfile );
|
|
|
|
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
|
|
my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
|
|
my $from = ( defined( $options{from} ) ? $options{from} : $DEFFROM );
|
|
my $count = ( defined( $options{count} ) ? $options{count} : $DEFCOUNT );
|
|
|
|
$from = $DEFFROM if $from < 1;
|
|
my @episodes = ( ( $from .. $from + $count ) );
|
|
|
|
#
|
|
# Report on the options in debug mode
|
|
#
|
|
if ($DEBUG > 1) {
|
|
_debug(1,'$DEBUG = ' . $DEBUG);
|
|
_debug(1,'$dry-run = ' . $dry_run);
|
|
_debug(1,'$verbose = ' . $verbose);
|
|
_debug(1,'$db1cfgfile = ' . $db1cfgfile);
|
|
_debug(1,'$cfgfile = ' . $cfgfile);
|
|
_debug(1,'$from = ' . $from);
|
|
_debug(1,'$count = ' . $count);
|
|
_debug(1,'$#episodes = ' . $#episodes);
|
|
}
|
|
|
|
#
|
|
# Sanity checks
|
|
#
|
|
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );
|
|
die "Unable to find $db1cfgfile\n" unless ( -e $db1cfgfile );
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Configuration file - load data
|
|
#-------------------------------------------------------------------------------
|
|
my $conf = Config::General->new(
|
|
-ConfigFile => $cfgfile,
|
|
-InterPolateVars => 1,
|
|
-ExtendedAccess => 1,
|
|
);
|
|
my %config = $conf->getall();
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Connect to the database
|
|
#-------------------------------------------------------------------------------
|
|
my $db1conf = Config::General->new(
|
|
-ConfigFile => $db1cfgfile,
|
|
-InterPolateVars => 1,
|
|
-ExtendedAccess => 1,
|
|
);
|
|
my %db1cfg = $db1conf->getall();
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Database configuration file - load data
|
|
#-------------------------------------------------------------------------------
|
|
my $dbhost = $db1cfg{database}->{host} // '127.0.0.1';
|
|
my $dbport = $db1cfg{database}->{port} // 3306;
|
|
my $dbname = $db1cfg{database}->{name};
|
|
my $dbuser = $db1cfg{database}->{user};
|
|
my $dbpwd = $db1cfg{database}->{password};
|
|
$dbh1 = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
|
|
$dbuser, $dbpwd, { AutoCommit => 1, RaiseError => 1 } )
|
|
or die $DBI::errstr;
|
|
|
|
#
|
|
# Enable client-side UTF8
|
|
#
|
|
$dbh1->{mysql_enable_utf8} = 1;
|
|
|
|
#
|
|
# Set the local timezone to UTC for this connection
|
|
#
|
|
$dbh1->do("set time_zone = '+00:00'") or carp $dbh1->errstr;
|
|
|
|
emit( $verbose >= 2, "Opened MySQL database\n" );
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Connect to the SQLite database
|
|
#-------------------------------------------------------------------------------
|
|
$dbh2 = DBI->connect( "dbi:SQLite:dbname=$database2", "", "" )
|
|
or die $DBI::errstr;
|
|
|
|
$dbh2->do("PRAGMA foreign_keys = ON") or die $DBI::errstr;
|
|
|
|
emit( $verbose >= 2, "Opened SQLite database\n" );
|
|
|
|
#
|
|
# The main MySQL query
|
|
#
|
|
$sql1 = q{
|
|
SELECT * FROM eps WHERE id BETWEEN ? AND ?
|
|
};
|
|
|
|
$sth1 = $dbh1->prepare($sql1);
|
|
|
|
$sth1->execute( $from, $from + $count );
|
|
if ( $dbh1->err ) {
|
|
die $dbh1->errstr;
|
|
}
|
|
|
|
while ( $h1 = $sth1->fetchrow_hashref ) {
|
|
printf "%04d %s %s\n", $h1->{id},$h1->{date},$h1->{title};
|
|
}
|
|
|
|
exit;
|
|
|
|
#emit( $verbose >= 2, "Configuration file: ", $cfgfile, "\n" );
|
|
#emit( $verbose >= 2, "DB configuration file: ", $db1cfgfile, "\n" );
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: find_external_links
|
|
# PURPOSE: Parses the HTML in a string for links so that attached assets
|
|
# on the HPR site can also be parsed and to collect external
|
|
# links for testing.
|
|
# PARAMETERS: $episode episode number we're dealing with
|
|
# $html string containing HTML
|
|
# $rlinks hashref to receive the links found
|
|
# RETURNS: Number of links found
|
|
# DESCRIPTION: Given HTML from the main notes or a subsidiary file the
|
|
# function parses this looking for links in 'a' or 'img' tags.
|
|
# Links are standardised, making them absolute if relative and
|
|
# removing any 'fragment'. The links need to be to HTML files on
|
|
# the HPR website to be of use in recursing to subsidiary
|
|
# levels. Otherwise they need to be external links that we will
|
|
# test.
|
|
# Having found a local link the filename part is extracted. If
|
|
# it follows the format 'hpr9999' then it's checked to see if
|
|
# it's for the current show. If not it's ignored. If the
|
|
# filename ends with a '/' then it's assumed it's shorthand for
|
|
# 'index.html' so this name is appended. If the local filename
|
|
# ends with '.html' then we need to parse it in turn, so we get
|
|
# the contents of the link and recurse to parse it.
|
|
# Then, if external, the link and filename are stashed in the
|
|
# hash referenced by $rlinks. We return the number of external
|
|
# links found in the pass through the HTML.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: Based on 'find_links' in 'upload_manager'.
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub find_external_links {
|
|
my ( $episode, $html, $rlinks ) = @_;
|
|
|
|
my ($tree, $epstr, $linkre, $re2, $filepath,
|
|
$uri, $slink, $linkcount, $content
|
|
);
|
|
|
|
_debug( $DEBUG >= 3, "find_external_links enter" );
|
|
|
|
#
|
|
# Create a tree object
|
|
#
|
|
$tree = HTML::TreeBuilder->new;
|
|
$tree->ignore_unknown(0);
|
|
$tree->no_expand_entities(1);
|
|
$tree->p_strict(1);
|
|
$tree->store_comments(1);
|
|
$tree->warn(1);
|
|
|
|
$tree->parse_content($html)
|
|
or die "HTML::TreeBuilder failed to parse notes: $!\n";
|
|
|
|
my $baseURL = "http://hackerpublicradio.org";
|
|
|
|
$epstr = sprintf( "hpr%04d", $episode );
|
|
$linkre = qr{
|
|
^https?://
|
|
(?:www.)?
|
|
(?:hacker|hobby)publicradio.org/
|
|
(.+)$
|
|
}x;
|
|
|
|
#
|
|
# Counting new links found and stashed
|
|
#
|
|
$linkcount = 0;
|
|
|
|
#
|
|
# Scan for links
|
|
#
|
|
for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
|
|
my ( $link, $element, $attr, $tag ) = @$_;
|
|
|
|
#
|
|
# Standardise the link (expands relative URLs, removes any fragment).
|
|
# Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
|
|
# URIs are removed.
|
|
#
|
|
local $URI::ABS_REMOTE_LEADING_DOTS = 1;
|
|
$uri = URI->new_abs( $link, $baseURL );
|
|
$slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );
|
|
|
|
#
|
|
# Is it an HPR link?
|
|
#
|
|
if ( $slink =~ $linkre ) {
|
|
#
|
|
# The URL we found might be a link into an HTML file with an
|
|
# '#anchor' component ("fragment"). Save the last bracketed match,
|
|
# without any 'fragment' if there is one to get a clean filename
|
|
# or path.
|
|
#
|
|
( $filepath = "$+" ) =~ s/#.*$//;
|
|
|
|
_debug( $DEBUG >= 3, "Link: $slink\n" );
|
|
_debug( $DEBUG >= 3, "File path: $filepath\n" );
|
|
|
|
#
|
|
# Does this file path begin with an 'hpr' prefix? If so is it the
|
|
# show id? If not we don't want to process it.
|
|
#
|
|
if ( $filepath =~ /^(hpr[0-9]{1,4})/ ) {
|
|
if ( $1 ne $epstr ) {
|
|
_debug( $DEBUG >= 3, "Ignored $slink\n" );
|
|
next;
|
|
}
|
|
}
|
|
|
|
#
|
|
# The path and URL might end with a slash which means the URL is
|
|
# relying on the Web server to fill in the filename as
|
|
# 'index.html'. We have to make this explicit.
|
|
#
|
|
if ( $slink =~ /\/$/ ) {
|
|
$slink .= 'index.html';
|
|
$filepath .= 'index.html';
|
|
}
|
|
|
|
#
|
|
# Initialise this hash element if needed
|
|
#
|
|
unless ( exists( $rlinks->{$episode} ) ) {
|
|
$rlinks->{$episode} = [];
|
|
}
|
|
|
|
#
|
|
# Stash this filename if it's not already stashed, and if it's
|
|
# HTML get the link and recurse
|
|
#
|
|
unless (
|
|
any { $_->{filename} eq $filepath }
|
|
@{ $rlinks->{$episode} }
|
|
)
|
|
{
|
|
_debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" );
|
|
|
|
push(
|
|
@{ $rlinks->{$episode} },
|
|
{ filename => $filepath, URL => $slink }
|
|
);
|
|
$linkcount++;
|
|
|
|
#
|
|
# An HTML file has to be investigated
|
|
#
|
|
if ( $filepath =~ /\.html$/ ) {
|
|
$content = get($slink);
|
|
unless ( defined($content) ) {
|
|
carp "Link $slink returned nothing\n";
|
|
}
|
|
else {
|
|
$linkcount
|
|
+= find_links( $episode, $content, $rlinks );
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
#
|
|
# It's not an HPR URL
|
|
#
|
|
else {
|
|
#
|
|
# Stash this filename if it's not already stashed
|
|
#
|
|
unless (
|
|
any { $_->{filename} eq $filepath }
|
|
@{ $rlinks->{$episode} }
|
|
)
|
|
{
|
|
_debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" );
|
|
|
|
push(
|
|
@{ $rlinks->{$episode} },
|
|
{ filename => $filepath, URL => $slink }
|
|
);
|
|
$linkcount++;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
_debug( $DEBUG >= 3, "find_links exiting with $linkcount links\n" );
|
|
|
|
#
|
|
# Return the link count
|
|
#
|
|
return $linkcount;
|
|
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: emit
|
|
# PURPOSE: Print text on STDERR unless silent mode has been selected
|
|
# PARAMETERS: - Boolean indicating whether to print or not
|
|
# - list of arguments to 'print'
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
|
|
# a message to STDERR depending on a boolean. We need this to be
|
|
# able to make the script silent when the -verbose option is
|
|
# not selected
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub emit {
|
|
if (shift) {
|
|
print STDERR @_;
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: _debug
|
|
# PURPOSE: Prints debug reports
|
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
|
# $message Message to print
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
|
|
# newline and then adds one in the 'print' to the caller doesn't
|
|
# have to bother. Prepends the message with 'D> ' to show it's
|
|
# a debug message.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub _debug {
|
|
my ( $active, $message ) = @_;
|
|
|
|
chomp($message);
|
|
print STDERR "D> $message\n" if $active;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: Options
|
|
# PURPOSE: Processes command-line options
|
|
# PARAMETERS: $optref Hash reference to hold the options
|
|
# RETURNS: Undef
|
|
# DESCRIPTION:
|
|
# THROWS: no exceptions
|
|
# COMMENTS: none
|
|
# SEE ALSO: n/a
|
|
#===============================================================================
|
|
sub Options {
|
|
my ($optref) = @_;
|
|
|
|
my @options = (
|
|
"help", "documentation|man", "debug=i", "dry-run!",
|
|
"verbose+", "dbconfig=s", "from=s", "count=i",
|
|
);
|
|
|
|
if ( !GetOptions( $optref, @options ) ) {
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1,
|
|
-verbose => 0 );
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
__END__
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
# Application Documentation
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
#{{{
|
|
|
|
=head1 NAME
|
|
|
|
<application name> - <One line description of application's purpose>
|
|
|
|
=head1 VERSION
|
|
|
|
The initial template usually just has:
|
|
|
|
This documentation refers to <application name> version 0.0.2
|
|
|
|
|
|
=head1 USAGE
|
|
|
|
# Brief working invocation example(s) here showing the most common usage(s)
|
|
|
|
# This section will be as far as many users ever read
|
|
# so make it as educational and exemplary as possible.
|
|
|
|
|
|
=head1 REQUIRED ARGUMENTS
|
|
|
|
A complete list of every argument that must appear on the command line.
|
|
when the application is invoked, explaining what each of them does, any
|
|
restrictions on where each one may appear (i.e. flags that must appear
|
|
before or after filenames), and how the various arguments and options
|
|
may interact (e.g. mutual exclusions, required combinations, etc.)
|
|
|
|
If all of the application's arguments are optional this section
|
|
may be omitted entirely.
|
|
|
|
|
|
=head1 OPTIONS
|
|
|
|
A complete list of every available option with which the application
|
|
can be invoked, explaining what each does, and listing any restrictions,
|
|
or interactions.
|
|
|
|
If the application has no options this section may be omitted entirely.
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
A full description of the application and its features.
|
|
May include numerous subsections (i.e. =head2, =head3, etc.)
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
|
|
|
A list of every error and warning message that the application can generate
|
|
(even the ones that will "never happen"), with a full explanation of each
|
|
problem, one or more likely causes, and any suggested remedies. If the
|
|
application generates exit status codes (e.g. under Unix) then list the exit
|
|
status associated with each error.
|
|
|
|
|
|
=head1 CONFIGURATION AND ENVIRONMENT
|
|
|
|
A full explanation of any configuration system(s) used by the application,
|
|
including the names and locations of any configuration files, and the
|
|
meaning of any environment variables or properties that can be set. These
|
|
descriptions must also include details of any configuration language used
|
|
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
A list of all the other modules that this module relies upon, including any
|
|
restrictions on versions, and an indication whether these required modules are
|
|
part of the standard Perl distribution, part of the module's distribution,
|
|
or must be installed separately.
|
|
|
|
|
|
=head1 INCOMPATIBILITIES
|
|
|
|
A list of any modules that this module cannot be used in conjunction with.
|
|
This may be due to name conflicts in the interface, or competition for
|
|
system or program resources, or due to internal limitations of Perl
|
|
(for example, many modules that use source code filters are mutually
|
|
incompatible).
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
|
|
|
A list of known problems with the module, together with some indication
|
|
whether they are likely to be fixed in an upcoming release.
|
|
|
|
Also a list of restrictions on the features the module does provide:
|
|
data types that cannot be handled, performance issues and the circumstances
|
|
in which they may arise, practical limitations on the size of data sets,
|
|
special cases that are not (yet) handled, etc.
|
|
|
|
The initial template usually just has:
|
|
|
|
There are no known bugs in this module.
|
|
Please report problems to <Maintainer name(s)> (<contact address>)
|
|
Patches are welcome.
|
|
|
|
=head1 AUTHOR
|
|
|
|
<Author name(s)> (<contact address>)
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT
|
|
|
|
Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
|
|
|
|
Followed by whatever licence you wish to release it under.
|
|
For Perl code that is often just:
|
|
|
|
This module is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
=cut
|
|
|
|
#}}}
|
|
|
|
# [zo to open fold, zc to close or za to toggle]
|
|
|
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
|
|