1
0
forked from HPR/hpr-tools
hpr-tools/Link_Checker/scan_links
Dave Morriss 50edeccc88 Updates from previous repo
FAQ/FAQ.mkd, FAQ/Makefile: this version of the FAQ is now out of date
    and probably should be deleted.

InternetArchive/repair_item: script to upload missing shows after tie
    out errors during the normal upload; still under development.

InternetArchive/update_state: script to update show state in the
    'reservations' table in the database. Uses the CMS interface.

Link_Checker/scan_links: under development. Not currently usable.

Miscellaneous/fix_tags: audio metadata manipulation script. Recently
    added to this repo for convenience. Updates for 'experimental::try',
    the official Perl try/catch.

PostgreSQL_Database/add_hosts_to_show, PostgreSQL_Database/hpr_schema_2.pgsql,
    PostgreSQL_Database/nuke_n_pave.sh: an old experimental Pg database
    to take over from the previous MySQL version (from before 2023).
    Kept for reference; never implemented.
2024-06-14 16:00:04 +01:00

619 lines
19 KiB
Perl
Executable File

#!/usr/bin/env perl
#===============================================================================
#
# FILE: scan_links
#
# USAGE: ./scan_links [-help] [-[no]verbose] [-config=FILE]
#
# DESCRIPTION: Scan the notes in the database for links. Test each link to
# see if it's available. Keep a record of the date, show, link
# and result. If a link fails more than N tests take action.
# Possible actions are:
# - report the problem
# - look for the link on archive.org
# - modify the notes
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.2
# CREATED: 2017-04-02 14:09:02
# REVISION: 2022-06-02 23:18:13
#
#===============================================================================
use 5.010;
use strict;
use warnings;
use utf8;
use experimental 'smartmatch';
use Carp;
use Getopt::Long;
use Pod::Usage;
use Config::General;
use File::Slurper qw{ read_text read_lines };
use Try::Tiny;
use HTML::TreeBuilder 5 -weak;
use HTML::Entities;
use List::Util qw{ min max };
use List::MoreUtils qw{ any };
use LWP::Simple;
use DBI;
use SQL::Abstract;
use SQL::Abstract::Plugin::InsertMulti;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.2';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Link_Checker";
my $configfile = "$basedir/.$PROG.cfg";
my $db1configfile = "$basedir/.hpr_db.cfg";
my $database2 = "$basedir/ia.db";
my ( $dbh1, $dbh2, $sql1, $sth1, $h1 );
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;
my $DEFFROM = 1;
my $DEFCOUNT = 10;
#
# Process options
#
my %options;
Options( \%options );
#
# Default help is minimal
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
if ( $options{'help'} );
#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
if ( $options{'documentation'} );
#
# Collect options
#
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
my $db1cfgfile
= ( defined( $options{dbconfig} ) ? $options{dbconfig} : $db1configfile );
my $cfgfile
= ( defined( $options{config} ) ? $options{config} : $configfile );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
my $from = ( defined( $options{from} ) ? $options{from} : $DEFFROM );
my $count = ( defined( $options{count} ) ? $options{count} : $DEFCOUNT );
$from = $DEFFROM if $from < 1;
my @episodes = ( ( $from .. $from + $count ) );
#
# Report on the options in debug mode
#
if ($DEBUG > 1) {
_debug(1,'$DEBUG = ' . $DEBUG);
_debug(1,'$dry-run = ' . $dry_run);
_debug(1,'$verbose = ' . $verbose);
_debug(1,'$db1cfgfile = ' . $db1cfgfile);
_debug(1,'$cfgfile = ' . $cfgfile);
_debug(1,'$from = ' . $from);
_debug(1,'$count = ' . $count);
_debug(1,'$#episodes = ' . $#episodes);
}
#
# Sanity checks
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );
die "Unable to find $db1cfgfile\n" unless ( -e $db1cfgfile );
#-------------------------------------------------------------------------------
# Configuration file - load data
#-------------------------------------------------------------------------------
my $conf = Config::General->new(
-ConfigFile => $cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
);
my %config = $conf->getall();
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $db1conf = Config::General->new(
-ConfigFile => $db1cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
);
my %db1cfg = $db1conf->getall();
#-------------------------------------------------------------------------------
# Database configuration file - load data
#-------------------------------------------------------------------------------
my $dbhost = $db1cfg{database}->{host} // '127.0.0.1';
my $dbport = $db1cfg{database}->{port} // 3306;
my $dbname = $db1cfg{database}->{name};
my $dbuser = $db1cfg{database}->{user};
my $dbpwd = $db1cfg{database}->{password};
$dbh1 = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
$dbuser, $dbpwd, { AutoCommit => 1, RaiseError => 1 } )
or die $DBI::errstr;
#
# Enable client-side UTF8
#
$dbh1->{mysql_enable_utf8} = 1;
#
# Set the local timezone to UTC for this connection
#
$dbh1->do("set time_zone = '+00:00'") or carp $dbh1->errstr;
emit( $verbose >= 2, "Opened MySQL database\n" );
#-------------------------------------------------------------------------------
# Connect to the SQLite database
#-------------------------------------------------------------------------------
$dbh2 = DBI->connect( "dbi:SQLite:dbname=$database2", "", "" )
or die $DBI::errstr;
$dbh2->do("PRAGMA foreign_keys = ON") or die $DBI::errstr;
emit( $verbose >= 2, "Opened SQLite database\n" );
#
# The main MySQL query
#
$sql1 = q{
SELECT * FROM eps WHERE id BETWEEN ? AND ?
};
$sth1 = $dbh1->prepare($sql1);
$sth1->execute( $from, $from + $count );
if ( $dbh1->err ) {
die $dbh1->errstr;
}
while ( $h1 = $sth1->fetchrow_hashref ) {
printf "%04d %s %s\n", $h1->{id},$h1->{date},$h1->{title};
}
exit;
#emit( $verbose >= 2, "Configuration file: ", $cfgfile, "\n" );
#emit( $verbose >= 2, "DB configuration file: ", $db1cfgfile, "\n" );
#=== FUNCTION ================================================================
# NAME: find_external_links
# PURPOSE: Parses the HTML in a string for links so that attached assets
# on the HPR site can also be parsed and to collect external
# links for testing.
# PARAMETERS: $episode episode number we're dealing with
# $html string containing HTML
# $rlinks hashref to receive the links found
# RETURNS: Number of links found
# DESCRIPTION: Given HTML from the main notes or a subsidiary file the
# function parses this looking for links in 'a' or 'img' tags.
# Links are standardised, making them absolute if relative and
# removing any 'fragment'. The links need to be to HTML files on
# the HPR website to be of use in recursing to subsidiary
# levels. Otherwise they need to be external links that we will
# test.
# Having found a local link the filename part is extracted. If
# it follows the format 'hpr9999' then it's checked to see if
# it's for the current show. If not it's ignored. If the
# filename ends with a '/' then it's assumed it's shorthand for
# 'index.html' so this name is appended. If the local filename
# ends with '.html' then we need to parse it in turn, so we get
# the contents of the link and recurse to parse it.
# Then, if external, the link and filename are stashed in the
# hash referenced by $rlinks. We return the number of external
# links found in the pass through the HTML.
# THROWS: No exceptions
# COMMENTS: Based on 'find_links' in 'upload_manager'.
# SEE ALSO: N/A
#===============================================================================
sub find_external_links {
my ( $episode, $html, $rlinks ) = @_;
my ($tree, $epstr, $linkre, $re2, $filepath,
$uri, $slink, $linkcount, $content
);
_debug( $DEBUG >= 3, "find_external_links enter" );
#
# Create a tree object
#
$tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
$tree->store_comments(1);
$tree->warn(1);
$tree->parse_content($html)
or die "HTML::TreeBuilder failed to parse notes: $!\n";
my $baseURL = "http://hackerpublicradio.org";
$epstr = sprintf( "hpr%04d", $episode );
$linkre = qr{
^https?://
(?:www.)?
(?:hacker|hobby)publicradio.org/
(.+)$
}x;
#
# Counting new links found and stashed
#
$linkcount = 0;
#
# Scan for links
#
for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
my ( $link, $element, $attr, $tag ) = @$_;
#
# Standardise the link (expands relative URLs, removes any fragment).
# Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative
# URIs are removed.
#
local $URI::ABS_REMOTE_LEADING_DOTS = 1;
$uri = URI->new_abs( $link, $baseURL );
$slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque );
#
# Is it an HPR link?
#
if ( $slink =~ $linkre ) {
#
# The URL we found might be a link into an HTML file with an
# '#anchor' component ("fragment"). Save the last bracketed match,
# without any 'fragment' if there is one to get a clean filename
# or path.
#
( $filepath = "$+" ) =~ s/#.*$//;
_debug( $DEBUG >= 3, "Link: $slink\n" );
_debug( $DEBUG >= 3, "File path: $filepath\n" );
#
# Does this file path begin with an 'hpr' prefix? If so is it the
# show id? If not we don't want to process it.
#
if ( $filepath =~ /^(hpr[0-9]{1,4})/ ) {
if ( $1 ne $epstr ) {
_debug( $DEBUG >= 3, "Ignored $slink\n" );
next;
}
}
#
# The path and URL might end with a slash which means the URL is
# relying on the Web server to fill in the filename as
# 'index.html'. We have to make this explicit.
#
if ( $slink =~ /\/$/ ) {
$slink .= 'index.html';
$filepath .= 'index.html';
}
#
# Initialise this hash element if needed
#
unless ( exists( $rlinks->{$episode} ) ) {
$rlinks->{$episode} = [];
}
#
# Stash this filename if it's not already stashed, and if it's
# HTML get the link and recurse
#
unless (
any { $_->{filename} eq $filepath }
@{ $rlinks->{$episode} }
)
{
_debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" );
push(
@{ $rlinks->{$episode} },
{ filename => $filepath, URL => $slink }
);
$linkcount++;
#
# An HTML file has to be investigated
#
if ( $filepath =~ /\.html$/ ) {
$content = get($slink);
unless ( defined($content) ) {
carp "Link $slink returned nothing\n";
}
else {
$linkcount
+= find_links( $episode, $content, $rlinks );
}
}
}
}
#
# It's not an HPR URL
#
else {
#
# Stash this filename if it's not already stashed
#
unless (
any { $_->{filename} eq $filepath }
@{ $rlinks->{$episode} }
)
{
_debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" );
push(
@{ $rlinks->{$episode} },
{ filename => $filepath, URL => $slink }
);
$linkcount++;
}
}
}
_debug( $DEBUG >= 3, "find_links exiting with $linkcount links\n" );
#
# Return the link count
#
return $linkcount;
}
#=== FUNCTION ================================================================
# NAME: emit
# PURPOSE: Print text on STDERR unless silent mode has been selected
# PARAMETERS: - Boolean indicating whether to print or not
# - list of arguments to 'print'
# RETURNS: Nothing
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
# a message to STDERR depending on a boolean. We need this to be
# able to make the script silent when the -verbose option is
# not selected
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub emit {
if (shift) {
print STDERR @_;
}
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $message Message to print
# RETURNS: Nothing
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
# newline and then adds one in the 'print' to the caller doesn't
# have to bother. Prepends the message with 'D> ' to show it's
# a debug message.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my ( $active, $message ) = @_;
chomp($message);
print STDERR "D> $message\n" if $active;
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "documentation|man", "debug=i", "dry-run!",
"verbose+", "dbconfig=s", "from=s", "count=i",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1,
-verbose => 0 );
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
<application name> - <One line description of application's purpose>
=head1 VERSION
The initial template usually just has:
This documentation refers to <application name> version 0.0.2
=head1 USAGE
# Brief working invocation example(s) here showing the most common usage(s)
# This section will be as far as many users ever read
# so make it as educational and exemplary as possible.
=head1 REQUIRED ARGUMENTS
A complete list of every argument that must appear on the command line.
when the application is invoked, explaining what each of them does, any
restrictions on where each one may appear (i.e. flags that must appear
before or after filenames), and how the various arguments and options
may interact (e.g. mutual exclusions, required combinations, etc.)
If all of the application's arguments are optional this section
may be omitted entirely.
=head1 OPTIONS
A complete list of every available option with which the application
can be invoked, explaining what each does, and listing any restrictions,
or interactions.
If the application has no options this section may be omitted entirely.
=head1 DESCRIPTION
A full description of the application and its features.
May include numerous subsections (i.e. =head2, =head3, etc.)
=head1 DIAGNOSTICS
A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.
=head1 CONFIGURATION AND ENVIRONMENT
A full explanation of any configuration system(s) used by the application,
including the names and locations of any configuration files, and the
meaning of any environment variables or properties that can be set. These
descriptions must also include details of any configuration language used
=head1 DEPENDENCIES
A list of all the other modules that this module relies upon, including any
restrictions on versions, and an indication whether these required modules are
part of the standard Perl distribution, part of the module's distribution,
or must be installed separately.
=head1 INCOMPATIBILITIES
A list of any modules that this module cannot be used in conjunction with.
This may be due to name conflicts in the interface, or competition for
system or program resources, or due to internal limitations of Perl
(for example, many modules that use source code filters are mutually
incompatible).
=head1 BUGS AND LIMITATIONS
A list of known problems with the module, together with some indication
whether they are likely to be fixed in an upcoming release.
Also a list of restrictions on the features the module does provide:
data types that cannot be handled, performance issues and the circumstances
in which they may arise, practical limitations on the size of data sets,
special cases that are not (yet) handled, etc.
The initial template usually just has:
There are no known bugs in this module.
Please report problems to <Maintainer name(s)> (<contact address>)
Patches are welcome.
=head1 AUTHOR
<Author name(s)> (<contact address>)
=head1 LICENCE AND COPYRIGHT
Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved.
Followed by whatever licence you wish to release it under.
For Perl code that is often just:
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
#}}}
# [zo to open fold, zc to close or za to toggle]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker