| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | #!/usr/bin/env perl | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #         FILE: scan_links | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | #        USAGE: ./scan_links [-help] [-[no]verbose] [-config=FILE] | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | # | 
					
						
							|  |  |  | #  DESCRIPTION: Scan the notes in the database for links. Test each link to | 
					
						
							|  |  |  | #               see if it's available. Keep a record of the date, show, link | 
					
						
							|  |  |  | #               and result. If a link fails more than N tests take action. | 
					
						
							|  |  |  | #               Possible actions are: | 
					
						
							|  |  |  | #               - report the problem | 
					
						
							|  |  |  | #               - look for the link on archive.org | 
					
						
							|  |  |  | #               - modify the notes | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      OPTIONS: --- | 
					
						
							|  |  |  | # REQUIREMENTS: --- | 
					
						
							|  |  |  | #         BUGS: --- | 
					
						
							|  |  |  | #        NOTES: --- | 
					
						
							|  |  |  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | #      VERSION: 0.0.2 | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | #      CREATED: 2017-04-02 14:09:02 | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | #     REVISION: 2022-06-02 23:18:13 | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | # | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use 5.010; | 
					
						
							|  |  |  | use strict; | 
					
						
							|  |  |  | use warnings; | 
					
						
							|  |  |  | use utf8; | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | use experimental 'smartmatch'; | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | use Carp; | 
					
						
							|  |  |  | use Getopt::Long; | 
					
						
							|  |  |  | use Pod::Usage; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Config::General; | 
					
						
							|  |  |  | use File::Slurper qw{ read_text read_lines }; | 
					
						
							|  |  |  | use Try::Tiny; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use HTML::TreeBuilder 5 -weak; | 
					
						
							|  |  |  | use HTML::Entities; | 
					
						
							|  |  |  | use List::Util qw{ min max }; | 
					
						
							|  |  |  | use List::MoreUtils qw{ any }; | 
					
						
							|  |  |  | use LWP::Simple; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use DBI; | 
					
						
							|  |  |  | use SQL::Abstract; | 
					
						
							|  |  |  | use SQL::Abstract::Plugin::InsertMulti; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Data::Dumper; | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Version number (manually incremented) | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | our $VERSION = '0.0.2'; | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Script and directory names | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ( my $PROG = $0 ) =~ s|.*/||mx; | 
					
						
							|  |  |  | ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx; | 
					
						
							|  |  |  | $DIR = '.' unless $DIR; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Declarations | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Constants and other declarations | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | my $basedir       = "$ENV{HOME}/HPR/Link_Checker"; | 
					
						
							|  |  |  | my $configfile    = "$basedir/.$PROG.cfg"; | 
					
						
							|  |  |  | my $db1configfile = "$basedir/.hpr_db.cfg"; | 
					
						
							|  |  |  | my $database2     = "$basedir/ia.db"; | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | my ( $dbh1, $dbh2, $sql1, $sth1, $h1 ); | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Enable Unicode mode | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | binmode STDOUT, ":encoding(UTF-8)"; | 
					
						
							|  |  |  | binmode STDERR, ":encoding(UTF-8)"; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Options and arguments | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Option defaults | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $DEFDEBUG = 0; | 
					
						
							|  |  |  | my $DEFFROM = 1; | 
					
						
							|  |  |  | my $DEFCOUNT = 10; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Process options | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my %options; | 
					
						
							|  |  |  | Options( \%options ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Default help is minimal | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 ) | 
					
						
							|  |  |  |     if ( $options{'help'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The -documentation or -man option shows the full POD documentation through | 
					
						
							|  |  |  | # a pager for convenience | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) | 
					
						
							|  |  |  |     if ( $options{'documentation'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Collect options | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my $db1cfgfile | 
					
						
							|  |  |  |     = ( defined( $options{dbconfig} ) ? $options{dbconfig} : $db1configfile ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my $cfgfile | 
					
						
							|  |  |  |     = ( defined( $options{config} ) ? $options{config} : $configfile ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 ); | 
					
						
							|  |  |  | my $verbose = ( defined( $options{verbose} )   ? $options{verbose}   : 0 ); | 
					
						
							|  |  |  | my $from    = ( defined( $options{from} )  ? $options{from}  : $DEFFROM ); | 
					
						
							|  |  |  | my $count   = ( defined( $options{count} ) ? $options{count} : $DEFCOUNT ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $from = $DEFFROM if $from < 1; | 
					
						
							|  |  |  | my @episodes = ( ( $from .. $from + $count ) ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Report on the options in debug mode | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ($DEBUG > 1) { | 
					
						
							|  |  |  |     _debug(1,'$DEBUG      = ' . $DEBUG); | 
					
						
							|  |  |  |     _debug(1,'$dry-run    = ' . $dry_run); | 
					
						
							|  |  |  |     _debug(1,'$verbose    = ' . $verbose); | 
					
						
							|  |  |  |     _debug(1,'$db1cfgfile = ' . $db1cfgfile); | 
					
						
							|  |  |  |     _debug(1,'$cfgfile    = ' . $cfgfile); | 
					
						
							|  |  |  |     _debug(1,'$from       = ' . $from); | 
					
						
							|  |  |  |     _debug(1,'$count      = ' . $count); | 
					
						
							|  |  |  |     _debug(1,'$#episodes  = ' . $#episodes); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Sanity checks | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | die "Unable to find $cfgfile\n" unless ( -e $cfgfile ); | 
					
						
							|  |  |  | die "Unable to find $db1cfgfile\n" unless ( -e $db1cfgfile ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Configuration file - load data | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | my $conf = Config::General->new( | 
					
						
							|  |  |  |     -ConfigFile      => $cfgfile, | 
					
						
							|  |  |  |     -InterPolateVars => 1, | 
					
						
							|  |  |  |     -ExtendedAccess  => 1, | 
					
						
							|  |  |  | ); | 
					
						
							|  |  |  | my %config = $conf->getall(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Connect to the database | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | my $db1conf = Config::General->new( | 
					
						
							|  |  |  |     -ConfigFile      => $db1cfgfile, | 
					
						
							|  |  |  |     -InterPolateVars => 1, | 
					
						
							|  |  |  |     -ExtendedAccess  => 1, | 
					
						
							|  |  |  | ); | 
					
						
							|  |  |  | my %db1cfg = $db1conf->getall(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Database configuration file - load data | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | my $dbhost = $db1cfg{database}->{host} // '127.0.0.1'; | 
					
						
							|  |  |  | my $dbport = $db1cfg{database}->{port} // 3306; | 
					
						
							|  |  |  | my $dbname = $db1cfg{database}->{name}; | 
					
						
							|  |  |  | my $dbuser = $db1cfg{database}->{user}; | 
					
						
							|  |  |  | my $dbpwd  = $db1cfg{database}->{password}; | 
					
						
							|  |  |  | $dbh1 = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname", | 
					
						
							|  |  |  |     $dbuser, $dbpwd, { AutoCommit => 1, RaiseError => 1 } ) | 
					
						
							|  |  |  |     or die $DBI::errstr; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Enable client-side UTF8 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $dbh1->{mysql_enable_utf8} = 1; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Set the local timezone to UTC for this connection | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $dbh1->do("set time_zone = '+00:00'") or carp $dbh1->errstr; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | emit( $verbose >= 2, "Opened MySQL database\n" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Connect to the SQLite database | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | $dbh2 = DBI->connect( "dbi:SQLite:dbname=$database2", "", "" ) | 
					
						
							|  |  |  |     or die $DBI::errstr; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $dbh2->do("PRAGMA foreign_keys = ON") or die $DBI::errstr; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | emit( $verbose >= 2, "Opened SQLite database\n" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The main MySQL query | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $sql1 = q{ | 
					
						
							|  |  |  |     SELECT * FROM eps WHERE id BETWEEN ? AND ? | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $sth1 = $dbh1->prepare($sql1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $sth1->execute( $from, $from + $count ); | 
					
						
							|  |  |  | if ( $dbh1->err ) { | 
					
						
							|  |  |  |     die $dbh1->errstr; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | while ( $h1 = $sth1->fetchrow_hashref ) { | 
					
						
							|  |  |  |     printf "%04d %s %s\n", $h1->{id},$h1->{date},$h1->{title}; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | exit; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 16:00:04 +01:00
										 |  |  | #emit( $verbose >= 2, "Configuration file: ", $cfgfile, "\n" ); | 
					
						
							|  |  |  | #emit( $verbose >= 2, "DB configuration file: ", $db1cfgfile, "\n" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: find_external_links | 
					
						
							|  |  |  | #      PURPOSE: Parses the HTML in a string for links so that attached assets | 
					
						
							|  |  |  | #               on the HPR site can also be parsed and to collect external | 
					
						
							|  |  |  | #               links for testing. | 
					
						
							|  |  |  | #   PARAMETERS: $episode        episode number we're dealing with | 
					
						
							|  |  |  | #               $html           string containing HTML | 
					
						
							|  |  |  | #               $rlinks         hashref to receive the links found | 
					
						
							|  |  |  | #      RETURNS: Number of links found | 
					
						
							|  |  |  | #  DESCRIPTION: Given HTML from the main notes or a subsidiary file the | 
					
						
							|  |  |  | #               function parses this looking for links in 'a' or 'img' tags. | 
					
						
							|  |  |  | #               Links are standardised, making them absolute if relative and | 
					
						
							|  |  |  | #               removing any 'fragment'. The links need to be to HTML files on | 
					
						
							|  |  |  | #               the HPR website to be of use in recursing to subsidiary | 
					
						
							|  |  |  | #               levels. Otherwise they need to be external links that we will | 
					
						
							|  |  |  | #               test. | 
					
						
							|  |  |  | #               Having found a local link the filename part is extracted. If | 
					
						
							|  |  |  | #               it follows the format 'hpr9999' then it's checked to see if | 
					
						
							|  |  |  | #               it's for the current show. If not it's ignored. If the | 
					
						
							|  |  |  | #               filename ends with a '/' then it's assumed it's shorthand for | 
					
						
							|  |  |  | #               'index.html' so this name is appended. If the local filename | 
					
						
							|  |  |  | #               ends with '.html' then we need to parse it in turn, so we get | 
					
						
							|  |  |  | #               the contents of the link and recurse to parse it. | 
					
						
							|  |  |  | #               Then, if external, the link and filename are stashed in the | 
					
						
							|  |  |  | #               hash referenced by $rlinks. We return the number of external | 
					
						
							|  |  |  | #               links found in the pass through the HTML. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: Based on 'find_links' in 'upload_manager'. | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub find_external_links { | 
					
						
							|  |  |  |     my ( $episode, $html, $rlinks ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my ($tree, $epstr, $linkre,    $re2, $filepath, | 
					
						
							|  |  |  |         $uri,  $slink, $linkcount, $content | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     _debug( $DEBUG >= 3, "find_external_links enter" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Create a tree object | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $tree = HTML::TreeBuilder->new; | 
					
						
							|  |  |  |     $tree->ignore_unknown(0); | 
					
						
							|  |  |  |     $tree->no_expand_entities(1); | 
					
						
							|  |  |  |     $tree->p_strict(1); | 
					
						
							|  |  |  |     $tree->store_comments(1); | 
					
						
							|  |  |  |     $tree->warn(1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     $tree->parse_content($html) | 
					
						
							|  |  |  |         or die "HTML::TreeBuilder failed to parse notes: $!\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my $baseURL = "http://hackerpublicradio.org"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     $epstr = sprintf( "hpr%04d", $episode ); | 
					
						
							|  |  |  |     $linkre = qr{ | 
					
						
							|  |  |  |         ^https?:// | 
					
						
							|  |  |  |         (?:www.)? | 
					
						
							|  |  |  |         (?:hacker|hobby)publicradio.org/ | 
					
						
							|  |  |  |         (.+)$ | 
					
						
							|  |  |  |     }x; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Counting new links found and stashed | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $linkcount = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Scan for links | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     for ( @{ $tree->extract_links( 'a', 'img' ) } ) { | 
					
						
							|  |  |  |         my ( $link, $element, $attr, $tag ) = @$_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Standardise the link (expands relative URLs, removes any fragment). | 
					
						
							|  |  |  |         # Set $URI::ABS_REMOTE_LEADING_DOTS to ensure leading dots in relative | 
					
						
							|  |  |  |         # URIs are removed. | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         local $URI::ABS_REMOTE_LEADING_DOTS = 1; | 
					
						
							|  |  |  |         $uri = URI->new_abs( $link, $baseURL ); | 
					
						
							|  |  |  |         $slink = sprintf( "%s:%s", $uri->scheme, $uri->opaque ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Is it an HPR link? | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         if ( $slink =~ $linkre ) { | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # The URL we found might be a link into an HTML file with an | 
					
						
							|  |  |  |             # '#anchor' component ("fragment"). Save the last bracketed match, | 
					
						
							|  |  |  |             # without any 'fragment' if there is one to get a clean filename | 
					
						
							|  |  |  |             # or path. | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             ( $filepath = "$+" ) =~ s/#.*$//; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             _debug( $DEBUG >= 3, "Link:      $slink\n" ); | 
					
						
							|  |  |  |             _debug( $DEBUG >= 3, "File path: $filepath\n" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Does this file path begin with an 'hpr' prefix? If so is it the | 
					
						
							|  |  |  |             # show id? If not we don't want to process it. | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             if ( $filepath =~ /^(hpr[0-9]{1,4})/ ) { | 
					
						
							|  |  |  |                 if ( $1 ne $epstr ) { | 
					
						
							|  |  |  |                     _debug( $DEBUG >= 3, "Ignored $slink\n" ); | 
					
						
							|  |  |  |                     next; | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # The path and URL might end with a slash which means the URL is | 
					
						
							|  |  |  |             # relying on the Web server to fill in the filename as | 
					
						
							|  |  |  |             # 'index.html'. We have to make this explicit. | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             if ( $slink =~ /\/$/ ) { | 
					
						
							|  |  |  |                 $slink    .= 'index.html'; | 
					
						
							|  |  |  |                 $filepath .= 'index.html'; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Initialise this hash element if needed | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             unless ( exists( $rlinks->{$episode} ) ) { | 
					
						
							|  |  |  |                 $rlinks->{$episode} = []; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Stash this filename if it's not already stashed, and if it's | 
					
						
							|  |  |  |             # HTML get the link and recurse | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             unless ( | 
					
						
							|  |  |  |                 any { $_->{filename} eq $filepath } | 
					
						
							|  |  |  |                 @{ $rlinks->{$episode} } | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 _debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 push( | 
					
						
							|  |  |  |                     @{ $rlinks->{$episode} }, | 
					
						
							|  |  |  |                     { filename => $filepath, URL => $slink } | 
					
						
							|  |  |  |                 ); | 
					
						
							|  |  |  |                 $linkcount++; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # | 
					
						
							|  |  |  |                 # An HTML file has to be investigated | 
					
						
							|  |  |  |                 # | 
					
						
							|  |  |  |                 if ( $filepath =~ /\.html$/ ) { | 
					
						
							|  |  |  |                     $content = get($slink); | 
					
						
							|  |  |  |                     unless ( defined($content) ) { | 
					
						
							|  |  |  |                         carp "Link $slink returned nothing\n"; | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                     else { | 
					
						
							|  |  |  |                         $linkcount | 
					
						
							|  |  |  |                             += find_links( $episode, $content, $rlinks ); | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # It's not an HPR URL | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         else { | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Stash this filename if it's not already stashed | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             unless ( | 
					
						
							|  |  |  |                 any { $_->{filename} eq $filepath } | 
					
						
							|  |  |  |                 @{ $rlinks->{$episode} } | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 _debug( $DEBUG >= 3, "Stashed $slink and $filepath\n" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 push( | 
					
						
							|  |  |  |                     @{ $rlinks->{$episode} }, | 
					
						
							|  |  |  |                     { filename => $filepath, URL => $slink } | 
					
						
							|  |  |  |                 ); | 
					
						
							|  |  |  |                 $linkcount++; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     _debug( $DEBUG >= 3, "find_links exiting with $linkcount links\n" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Return the link count | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     return $linkcount; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: emit | 
					
						
							|  |  |  | #      PURPOSE: Print text on STDERR unless silent mode has been selected | 
					
						
							|  |  |  | #   PARAMETERS: - Boolean indicating whether to print or not | 
					
						
							|  |  |  | #               - list of arguments to 'print' | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: This is a wrapper around 'print' to determine whether to send | 
					
						
							|  |  |  | #               a message to STDERR depending on a boolean. We need this to be | 
					
						
							|  |  |  | #               able to make the script silent when the -verbose option is | 
					
						
							|  |  |  | #               not selected | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub emit { | 
					
						
							|  |  |  |     if (shift) { | 
					
						
							|  |  |  |         print STDERR @_; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: _debug | 
					
						
							|  |  |  | #      PURPOSE: Prints debug reports | 
					
						
							|  |  |  | #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print | 
					
						
							|  |  |  | #               $message        Message to print | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing | 
					
						
							|  |  |  | #               newline and then adds one in the 'print' to the caller doesn't | 
					
						
							|  |  |  | #               have to bother. Prepends the message with 'D> ' to show it's | 
					
						
							|  |  |  | #               a debug message. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub _debug { | 
					
						
							|  |  |  |     my ( $active, $message ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     chomp($message); | 
					
						
							|  |  |  |     print STDERR "D> $message\n" if $active; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: Options | 
					
						
							|  |  |  | #      PURPOSE: Processes command-line options | 
					
						
							|  |  |  | #   PARAMETERS: $optref     Hash reference to hold the options | 
					
						
							|  |  |  | #      RETURNS: Undef | 
					
						
							|  |  |  | #  DESCRIPTION: | 
					
						
							|  |  |  | #       THROWS: no exceptions | 
					
						
							|  |  |  | #     COMMENTS: none | 
					
						
							|  |  |  | #     SEE ALSO: n/a | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub Options { | 
					
						
							|  |  |  |     my ($optref) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my @options = ( | 
					
						
							|  |  |  |         "help",     "documentation|man", "debug=i", "dry-run!", | 
					
						
							|  |  |  |         "verbose+", "dbconfig=s",        "from=s",  "count=i", | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( !GetOptions( $optref, @options ) ) { | 
					
						
							|  |  |  |         pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, | 
					
						
							|  |  |  |             -verbose => 0 ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | __END__ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #  Application Documentation | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #{{{ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 NAME | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | <application name> - <One line description of application's purpose> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 VERSION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The initial template usually just has: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This documentation refers to <application name> version 0.0.2 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 USAGE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Brief working invocation example(s) here showing the most common usage(s) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # This section will be as far as many users ever read | 
					
						
							|  |  |  |     # so make it as educational and exemplary as possible. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 REQUIRED ARGUMENTS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A complete list of every argument that must appear on the command line. | 
					
						
							|  |  |  | when the application  is invoked, explaining what each of them does, any | 
					
						
							|  |  |  | restrictions on where each one may appear (i.e. flags that must appear | 
					
						
							|  |  |  | before or after filenames), and how the various arguments and options | 
					
						
							|  |  |  | may interact (e.g. mutual exclusions, required combinations, etc.) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If all of the application's arguments are optional this section | 
					
						
							|  |  |  | may be omitted entirely. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 OPTIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A complete list of every available option with which the application | 
					
						
							|  |  |  | can be invoked, explaining what each does, and listing any restrictions, | 
					
						
							|  |  |  | or interactions. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If the application has no options this section may be omitted entirely. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DESCRIPTION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A full description of the application and its features. | 
					
						
							|  |  |  | May include numerous subsections (i.e. =head2, =head3, etc.) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DIAGNOSTICS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of every error and warning message that the application can generate | 
					
						
							|  |  |  | (even the ones that will "never happen"), with a full explanation of each | 
					
						
							|  |  |  | problem, one or more likely causes, and any suggested remedies. If the | 
					
						
							|  |  |  | application generates exit status codes (e.g. under Unix) then list the exit | 
					
						
							|  |  |  | status associated with each error. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 CONFIGURATION AND ENVIRONMENT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A full explanation of any configuration system(s) used by the application, | 
					
						
							|  |  |  | including the names and locations of any configuration files, and the | 
					
						
							|  |  |  | meaning of any environment variables or properties that can be set. These | 
					
						
							|  |  |  | descriptions must also include details of any configuration language used | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DEPENDENCIES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of all the other modules that this module relies upon, including any | 
					
						
							|  |  |  | restrictions on versions, and an indication whether these required modules are | 
					
						
							|  |  |  | part of the standard Perl distribution, part of the module's distribution, | 
					
						
							|  |  |  | or must be installed separately. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 INCOMPATIBILITIES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of any modules that this module cannot be used in conjunction with. | 
					
						
							|  |  |  | This may be due to name conflicts in the interface, or competition for | 
					
						
							|  |  |  | system or program resources, or due to internal limitations of Perl | 
					
						
							|  |  |  | (for example, many modules that use source code filters are mutually | 
					
						
							|  |  |  | incompatible). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 BUGS AND LIMITATIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A list of known problems with the module, together with some indication | 
					
						
							|  |  |  | whether they are likely to be fixed in an upcoming release. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Also a list of restrictions on the features the module does provide: | 
					
						
							|  |  |  | data types that cannot be handled, performance issues and the circumstances | 
					
						
							|  |  |  | in which they may arise, practical limitations on the size of data sets, | 
					
						
							|  |  |  | special cases that are not (yet) handled, etc. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The initial template usually just has: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There are no known bugs in this module. | 
					
						
							|  |  |  | Please report problems to <Maintainer name(s)>  (<contact address>) | 
					
						
							|  |  |  | Patches are welcome. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 AUTHOR | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | <Author name(s)>  (<contact address>) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 LICENCE AND COPYRIGHT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Copyright (c) <year> <copyright holder> (<contact address>). All rights reserved. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Followed by whatever licence you wish to release it under. | 
					
						
							|  |  |  | For Perl code that is often just: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This module is free software; you can redistribute it and/or | 
					
						
							|  |  |  | modify it under the same terms as Perl itself. See perldoc perlartistic. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =cut | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #}}} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # [zo to open fold, zc to close or za to toggle] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | 
					
						
							|  |  |  | 
 |