| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | #!/usr/bin/env perl | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #         FILE: fix_relative_links | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #        USAGE: ./fix_relative_links [options] -episode=N FILE | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  DESCRIPTION: Processes an HTML input file, looking for relative URLs. If | 
					
						
							|  |  |  | #               any are found these are made absolute using the -baseURL=URL | 
					
						
							|  |  |  | #               option or a default. The intention is to make them into | 
					
						
							|  |  |  | #               HPR-absolute URLs. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      OPTIONS: --- | 
					
						
							|  |  |  | # REQUIREMENTS: --- | 
					
						
							|  |  |  | #         BUGS: --- | 
					
						
							|  |  |  | #        NOTES: --- | 
					
						
							|  |  |  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | 
					
						
							|  |  |  | #      VERSION: 0.0.3 | 
					
						
							|  |  |  | #      CREATED: 2022-10-14 11:56:03 | 
					
						
							|  |  |  | #     REVISION: 2022-10-23 22:12:08 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use v5.16; | 
					
						
							|  |  |  | use strict; | 
					
						
							|  |  |  | use warnings; | 
					
						
							|  |  |  | use utf8; | 
					
						
							| 
									
										
										
										
											2024-12-01 20:45:20 +00:00
										 |  |  | use feature qw{ say state }; | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | use Carp; | 
					
						
							|  |  |  | use Getopt::Long; | 
					
						
							|  |  |  | use Pod::Usage; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use File::Basename; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use IO::HTML; | 
					
						
							|  |  |  | use HTML::TreeBuilder 5 -weak; | 
					
						
							|  |  |  | use URI; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Log::Handler; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Data::Dumper; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Version number (manually incremented) | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | our $VERSION = '0.0.3'; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Script and directory names | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ( my $PROG = $0 ) =~ s|.*/||mx; | 
					
						
							|  |  |  | ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx; | 
					
						
							|  |  |  | $DIR = '.' unless $DIR; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Declarations | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Constants and other declarations | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $basedir = "$ENV{HOME}/HPR/Show_Submission"; | 
					
						
							|  |  |  | my $logdir  = "$basedir/logs"; | 
					
						
							|  |  |  | my $logfile = "$logdir/${PROG}.log"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Variables, arrays and hashes | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my ( $DEBUG, $verbose, $silent, $showno, $base_URL, $fragment, $count_only ); | 
					
						
							|  |  |  | my ( $outfile, $filename, $showdir, $changes, $html ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Enable Unicode mode | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | binmode STDOUT, ":encoding(UTF-8)"; | 
					
						
							|  |  |  | binmode STDERR, ":encoding(UTF-8)"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Options and arguments | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Option defaults | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $DEFDEBUG = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my %options; | 
					
						
							|  |  |  | Options( \%options ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Default help shows minimal information | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) | 
					
						
							|  |  |  |     if ( $options{'help'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The -documentation or -man option shows the full POD documentation through | 
					
						
							|  |  |  | # a pager for convenience | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) | 
					
						
							|  |  |  |     if ( $options{'documentation'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Collect options | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $DEBUG      = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); | 
					
						
							|  |  |  | $showno     = $options{episode}; | 
					
						
							|  |  |  | $base_URL   = $options{baseURL}; | 
					
						
							|  |  |  | $fragment   = ( defined( $options{fragment} ) ? $options{fragment} : 0 ); | 
					
						
							|  |  |  | $count_only = ( defined( $options{count} )    ? $options{count}    : 0 ); | 
					
						
							|  |  |  | $outfile    = $options{output}; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Argument | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $filename = shift; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Sanity checks | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( | 
					
						
							|  |  |  |     -msg     => "$PROG version $VERSION\nShow number missing\n", | 
					
						
							|  |  |  |     -exitval => 1, | 
					
						
							|  |  |  |     -verbose => 0 | 
					
						
							|  |  |  | ) unless $showno; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | pod2usage( | 
					
						
							|  |  |  |     -msg     => "$PROG version $VERSION\nInput file name missing\n", | 
					
						
							|  |  |  |     -exitval => 1, | 
					
						
							|  |  |  |     -verbose => 0 | 
					
						
							|  |  |  | ) unless $filename; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Add leading zeroes to the show number if necessary | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $showno = sprintf( '%04d', $showno ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Directories and files specific to this show | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $showdir = "$basedir/shownotes/hpr$showno"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Allow the input filename to be a bare name | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ( !-e $filename ) { | 
					
						
							|  |  |  |     $filename = "$showdir/$filename"; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | die "Unable to find $filename" unless ( -e $filename ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Work on the output file, allowing defaults and substitution points for | 
					
						
							|  |  |  | # convenience. If there's no outfile we'll just process the HTML and nothing | 
					
						
							|  |  |  | # more. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ( defined($outfile) ) { | 
					
						
							|  |  |  |     $outfile = output_file_name( $outfile, $showno, 'hpr%d_new.html' ); | 
					
						
							|  |  |  |     $outfile = "$showdir/$outfile" if (dirname($outfile) eq '.'); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Default base URL | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | unless ($base_URL) { | 
					
						
							|  |  |  |     $base_URL = "https://hackerpublicradio.org/eps/hpr$showno/"; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Base URL must have a trailing '/' | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $base_URL .= '/' unless ( $base_URL =~ qr{/$} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Set up logging keeping the default log layout except for the date. The format | 
					
						
							|  |  |  | # is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is | 
					
						
							|  |  |  | # the message. | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | my $log = Log::Handler->new(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $log->add( | 
					
						
							|  |  |  |     file => { | 
					
						
							|  |  |  |         timeformat => "%Y/%m/%d %H:%M:%S", | 
					
						
							|  |  |  |         filename   => $logfile, | 
					
						
							|  |  |  |         minlevel   => 0, | 
					
						
							|  |  |  |         maxlevel   => 7, | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Log preamble | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $log->info("Show number: $showno"); | 
					
						
							|  |  |  | $log->info("Processing: $filename"); | 
					
						
							|  |  |  | $log->info("Base: $base_URL"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Find and change any relative URLs returning the number of changes and the | 
					
						
							|  |  |  | # altered HTML | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ( $changes, $html ) | 
					
						
							|  |  |  |     = find_links_in_file( $filename, $base_URL, $fragment, $count_only ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | $log->info("Number of changes: $changes"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Exit without writing if we're just counting | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ($count_only) { | 
					
						
							|  |  |  |     $log->info("Count only mode"); | 
					
						
							|  |  |  |     exit $changes; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Exit without writing if there were no changes | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ($changes == 0) { | 
					
						
							|  |  |  |     $log->info("No output written"); | 
					
						
							|  |  |  |     exit $changes; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Write output if an output file was specified | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | if ($outfile) { | 
					
						
							|  |  |  |     write_output( $outfile, $html ); | 
					
						
							|  |  |  |     $log->info("Changes applied; written to $outfile"); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | else { | 
					
						
							|  |  |  |     $log->info("No output written"); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | exit $changes; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: find_links_in_file | 
					
						
							|  |  |  | #      PURPOSE: Finds relative links in an HTML file | 
					
						
							|  |  |  | #   PARAMETERS: $filename       the name of the file we're parsing | 
					
						
							|  |  |  | #               $base_URL       the part of the full URL we'll replace | 
					
						
							|  |  |  | #               $fragment       Boolean signalling whether to treat the HTML | 
					
						
							|  |  |  | #                               as a fragment or an entire document | 
					
						
							|  |  |  | #               $count_only     Boolean signalling that all we want is the | 
					
						
							|  |  |  | #                               count of relative URLs, no action is to be taken | 
					
						
							|  |  |  | #      RETURNS: The number of URLs "repaired". | 
					
						
							|  |  |  | #  DESCRIPTION: | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub find_links_in_file { | 
					
						
							|  |  |  |     my ( $filename, $base_URL, $fragment, $count_only ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my ( $base_uri, $tree, $uri_orig, $uri_new ); | 
					
						
							|  |  |  |     my ( $newlink, $linkedits, $result ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Parse the base URL | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $base_uri = URI->new($base_URL); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Create a tree object | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $tree = HTML::TreeBuilder->new; | 
					
						
							|  |  |  |     $tree->ignore_unknown(0); | 
					
						
							|  |  |  |     $tree->no_expand_entities(1); | 
					
						
							|  |  |  |     $tree->p_strict(1); | 
					
						
							|  |  |  |     $tree->store_comments(1); | 
					
						
							|  |  |  |     $tree->warn(1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Parse the file using IO::HTML to grab it. Die if we fail because then we | 
					
						
							|  |  |  |     # know this stuff needs some urgent attention. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $tree->parse_file( html_file($filename) ) | 
					
						
							|  |  |  |         or die "HTML::TreeBuilder failed to process $filename: $!\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     $linkedits = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Scan for all anchors and images using the HTML::Element method | 
					
						
							|  |  |  |     # 'extract_links'. The technique used here is from the HTML::Element man | 
					
						
							|  |  |  |     # page. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     for ( @{ $tree->extract_links( 'a', 'img' ) } ) { | 
					
						
							|  |  |  |         my ( $link, $element, $attr, $tag ) = @$_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Parse the link | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         $uri_orig = URI->new($link); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # A relative link (presumably) doesn't have a scheme | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         unless ( $uri_orig->scheme ) { | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Original link | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             say "Relative link: $link"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Make the link absolute | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             $uri_new = make_absolute( $uri_orig, $base_uri ); | 
					
						
							|  |  |  |             #            $uri_new = URI->new_abs( $link, $base_URL ); | 
					
						
							|  |  |  |             $newlink = sprintf( "%s:%s", $uri_new->scheme, $uri_new->opaque ); | 
					
						
							|  |  |  |             say "Absolute link: $newlink"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Modify the HTML to make the relative absolute | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             if ( $uri_orig->fragment ) { | 
					
						
							|  |  |  |                 # Not sure if we need to cater for URI fragments, but we'll try it | 
					
						
							|  |  |  |                 $element->attr( $attr, $newlink . '#' . $uri_orig->fragment ); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             else { | 
					
						
							|  |  |  |                 $element->attr( $attr, $newlink ); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             $linkedits++; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Exit here if we were just asked to count | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     return ( $linkedits, undef ) if $count_only; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # In 'HTML fragment' mode generate the body part without the <body> tags. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if ($fragment) { | 
					
						
							|  |  |  |         my $body = $tree->look_down( _tag => 'body' ); | 
					
						
							|  |  |  |         ( $result = $body->as_HTML( undef, ' ', {} ) ) | 
					
						
							|  |  |  |             =~ s{(^<body[^>]*>|</body>$)}{}gi; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         $result = $tree->as_HTML( undef, ' ', {} ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return ( $linkedits, $result ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: write_output | 
					
						
							|  |  |  | #      PURPOSE: Write the "repaired" HTML | 
					
						
							|  |  |  | #   PARAMETERS: $outfile        name of the output file | 
					
						
							|  |  |  | #               $html           the HTML to write out | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub write_output { | 
					
						
							|  |  |  |     my ( $outfile, $html ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     open( my $out, '>:encoding(UTF-8)', $outfile ) | 
					
						
							|  |  |  |         or die "Unable to open $outfile for output: $!\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     print $out $html; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     close($out); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: make_absolute | 
					
						
							|  |  |  | #      PURPOSE: Take a relative URI and a base URI and return the absolute URI | 
					
						
							|  |  |  | #   PARAMETERS: $relative       relative URL as a URI object | 
					
						
							|  |  |  | #               $base           base URL as a URI object | 
					
						
							|  |  |  | #      RETURNS: Absolute URL as a URI object | 
					
						
							|  |  |  | #  DESCRIPTION: | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub make_absolute { | 
					
						
							|  |  |  |     my ( $relative, $base ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my ( %base_path, @relative_path, $absolute ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Chop up the path from the base and store in a hash | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     %base_path = map { $_ => 1 } split( '/', $base->path ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Chop up the relative path | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     @relative_path = split( '/', $relative->path ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Remove relative path elements if they are in the base | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     @relative_path = grep { !exists( $base_path{$_} ) } @relative_path; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # If the relative path is empty we assume it's referring to the | 
					
						
							|  |  |  |     # 'index.html' file. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     push( @relative_path, 'index.html' ) unless (@relative_path); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Rebuild the relative path | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $relative->path( join( '/', @relative_path ) ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Return the result of joining relative URL and base URL | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $absolute = URI->new_abs( $relative->as_string, $base->as_string ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return $absolute; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: output_file_name | 
					
						
							|  |  |  | #      PURPOSE: Generate an output file name with three choices | 
					
						
							|  |  |  | #   PARAMETERS: $optarg         the argument to the option choosing the filename | 
					
						
							|  |  |  | #               $showno         the show number to add to certain name types | 
					
						
							|  |  |  | #               $template       a default 'sprintf' template for the name | 
					
						
							|  |  |  | #      RETURNS: The name of the output file | 
					
						
							|  |  |  | #  DESCRIPTION: If there's a defined output filename then there are three | 
					
						
							|  |  |  | #               options: a null string, a plain filename and a substitution | 
					
						
							|  |  |  | #               string with '%d' sequences. The null string means the user used | 
					
						
							|  |  |  | #               '-option' without a value, so we want to generate a substitution | 
					
						
							|  |  |  | #               string. A string with '%d' requires a check to ensure there's | 
					
						
							|  |  |  | #               the right number, just one. The plain filename needs no more | 
					
						
							|  |  |  | #               work. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub output_file_name { | 
					
						
							|  |  |  |     my ( $optarg, $showno, $template ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my ( $filename, $count ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # We shouldn't be called with a null option argument | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     return unless defined($optarg); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Does the option have an argument? | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if ( $optarg =~ /^$/ ) { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # No argument; use the show number from the -episode=N option | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         $filename = sprintf( $template, $showno ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     elsif ( $optarg =~ /%d/ ) { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # There's an argument, does it have a '%d' in it? | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         $count = () = $optarg =~ /%d/g; | 
					
						
							|  |  |  |         die "Invalid - too many '%d' sequences in '$optarg'\n" | 
					
						
							|  |  |  |             if ( $count > 1 ); | 
					
						
							|  |  |  |         $filename = sprintf( $optarg, $showno ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # It's a plain filename, just return it | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         $filename = $optarg; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return $filename; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: coalesce | 
					
						
							|  |  |  | #      PURPOSE: To find the first defined argument and return it | 
					
						
							|  |  |  | #   PARAMETERS: Arbitrary number of arguments | 
					
						
							|  |  |  | #      RETURNS: The first defined argument or undef if there are none | 
					
						
							|  |  |  | #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never | 
					
						
							|  |  |  | #               returned when doing so might be a problem. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub coalesce { | 
					
						
							|  |  |  |     foreach (@_) { | 
					
						
							|  |  |  |         return $_ if defined($_); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return undef;    ## no critic | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: _debug | 
					
						
							|  |  |  | #      PURPOSE: Prints debug reports | 
					
						
							|  |  |  | #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print | 
					
						
							|  |  |  | #               $message        Message to print | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing | 
					
						
							|  |  |  | #               newline and then adds one in the 'print' to the caller doesn't | 
					
						
							|  |  |  | #               have to bother. Prepends the message with 'D> ' to show it's | 
					
						
							|  |  |  | #               a debug message. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub _debug { | 
					
						
							|  |  |  |     my ( $active, $message ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     chomp($message); | 
					
						
							|  |  |  |     print "D> $message\n" if $active; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: Options | 
					
						
							|  |  |  | #      PURPOSE: Processes command-line options | 
					
						
							|  |  |  | #   PARAMETERS: $optref     Hash reference to hold the options | 
					
						
							|  |  |  | #      RETURNS: Undef | 
					
						
							|  |  |  | #  DESCRIPTION: Process the options we want to offer. See the documentation | 
					
						
							|  |  |  | #               for details | 
					
						
							|  |  |  | #       THROWS: no exceptions | 
					
						
							|  |  |  | #     COMMENTS: none | 
					
						
							|  |  |  | #     SEE ALSO: n/a | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub Options { | 
					
						
							|  |  |  |     my ($optref) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my @options = ( | 
					
						
							|  |  |  |         "help",      "documentation|man", "debug=i", "episode=i", | 
					
						
							|  |  |  |         "baseURL=s", "fragment!",         "count!",  "output:s", | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( !GetOptions( $optref, @options ) ) { | 
					
						
							|  |  |  |         pod2usage( | 
					
						
							|  |  |  |             -msg     => "$PROG version $VERSION\n", | 
					
						
							|  |  |  |             -exitval => 1, | 
					
						
							|  |  |  |             -verbose => 0 | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | __END__ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #  Application Documentation | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #{{{ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 NAME | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | fix_relative_links - Repair relative URLs in HTML shownotes | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 VERSION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This documentation refers to fix_relative_links version 0.0.3 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 USAGE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ./fix_relative_links -ep=3705 shownotes/hpr3705/hpr3705.html -fragment | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 REQUIRED ARGUMENTS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<filename> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The name of the file containing the HTML to be repaired. If no path is given | 
					
						
							|  |  |  | this will be supplied by the script as: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ~/HPR/Show_Submission/shownotes/hpr${show}/ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | It is probably wiser to be explicit about the path to the HTML file to be | 
					
						
							|  |  |  | parsed. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 OPTIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-help> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Prints a brief help message describing the usage of the program, and then exits. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-documentation> or B<-man> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Prints the entire documentation for the script in the form of a manual page. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-debug=N> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Causes certain debugging information to be displayed. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     0   (the default) no debug output | 
					
						
							|  |  |  |     1    | 
					
						
							|  |  |  |     2    | 
					
						
							|  |  |  |     3    | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-episode=N> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option is mandatory and specifies the show number being processed. The | 
					
						
							|  |  |  | number is used to generate default file names and paths as well as the default | 
					
						
							|  |  |  | base URL described below. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-baseURL=URL> | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-01 20:45:20 +00:00
										 |  |  | This option will default to the following URL if not provided: | 
					
						
							| 
									
										
										
										
											2024-06-04 16:35:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     https://hackerpublicradio.org/eps/hpr${show}/ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | It can be used to define a non-standard URL, such as one at a lower level than | 
					
						
							|  |  |  | the example above which might contain thumbnail pictures for example. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]fragment> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This Boolean option defines the HTML being parsed and checked as a fragment or | 
					
						
							|  |  |  | a complete stand-alone document. By default B<-nofragment> is assumed. It is | 
					
						
							|  |  |  | necessary to use B<fragment> for the case where the HTML shownotes are being | 
					
						
							|  |  |  | parsed. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-[no]count> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This Boolean option defines whether to simply count the necessary changes or | 
					
						
							|  |  |  | to apply them to the given HTML file. By default B<-nocount> is assumed, and | 
					
						
							|  |  |  | changes will be applied. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<-output[=FILE]> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This option can be omitted or can be given without the B<FILE> name. If | 
					
						
							|  |  |  | omitted entirely no output will be written even though the HTML file has been | 
					
						
							|  |  |  | read and processed. If specified without the output file name the default name | 
					
						
							|  |  |  | will be B<hpr${show}_new.html>. If no path is specified with the file name | 
					
						
							|  |  |  | then a default will be generated as: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ~/HPR/Show_Submission/shownotes/hpr${show}/hpr${show}_new.html | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The output file name can be given in the form of a B<printf> template such as: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     hpr%d_new.html | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | and the B<%d> will be replaced by the show number given through the | 
					
						
							|  |  |  | B<-episode=N> option described above. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DESCRIPTION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script reads a file of HTML which has either been submitted by an HPR host | 
					
						
							|  |  |  | as it is or has been generated from one of the markup languages accepted in | 
					
						
							|  |  |  | the upload form. Most often this file will contain the main notes for a show | 
					
						
							|  |  |  | and will eventually be saved in the HPR database. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | It is also possible to use the script to process other HTML files submitted | 
					
						
							|  |  |  | with an HPR show. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The purpose of the script is to find relative URLs in the HTML and convert | 
					
						
							|  |  |  | them to absolute ones. The HPR website requests that absolute URLs be used | 
					
						
							|  |  |  | since then they can be used in the various RSS feeds which are available, but | 
					
						
							|  |  |  | many hosts forget to follow this request. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The HTML is parsed using B<HTML::TreeBuilder> and is searched for B<a> or | 
					
						
							|  |  |  | B<img> tags. These are checked to ensure they contain absolute links, and if | 
					
						
							|  |  |  | not are converted appropriately using a base URL for the HPR website. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A count of changes is returned by the script and the converted HTML is written | 
					
						
							|  |  |  | out to a file if required. The script can be used to see if any conversions | 
					
						
							|  |  |  | are necessary before making the changes. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script is also capable of treating full HTML documents differently from | 
					
						
							|  |  |  | the HTML fragments that are stored in the HPR database. An option is required | 
					
						
							|  |  |  | to specify which type of HTML is being processed. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DIAGNOSTICS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Error and warning messages generated by the script. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Unable to find ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Type: fatal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script was unable to find the specified input file. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<HTML::TreeBuilder failed to process ...: ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Type: fatal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script attempted to use B<HTML::TreeBuilder> to parse the input file | 
					
						
							|  |  |  | but failed. The message also contains details of the failure. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Unable to open ... for output: ...> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Type: fatal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script attempted to open the requested output file but failed. The reason | 
					
						
							|  |  |  | for the failure is included in the error message. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<Invalid - too many '%d' sequences in '...'> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Type: fatal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script attempted to generate a name for the requested output file using | 
					
						
							|  |  |  | the supplied template, but failed because there were too many B<%d> elements | 
					
						
							|  |  |  | in the template. Only one should be provided, which will be substituted with | 
					
						
							|  |  |  | the show number. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DEPENDENCIES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Carp | 
					
						
							|  |  |  |     Data::Dumper | 
					
						
							|  |  |  |     File::Basename | 
					
						
							|  |  |  |     Getopt::Long | 
					
						
							|  |  |  |     HTML::TreeBuilder | 
					
						
							|  |  |  |     IO::HTML | 
					
						
							|  |  |  |     Log::Handler | 
					
						
							|  |  |  |     Pod::Usage | 
					
						
							|  |  |  |     URI | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 BUGS AND LIMITATIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There are no known bugs in this module. | 
					
						
							|  |  |  | Please report problems to Dave Morriss (Dave.Morriss@gmail.com) | 
					
						
							|  |  |  | Patches are welcome. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 AUTHOR | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Dave Morriss (Dave.Morriss@gmail.com) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 LICENCE AND COPYRIGHT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Copyright (c) <year> Dave Morriss (Dave.Morriss@gmail.com). All rights reserved. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This module is free software; you can redistribute it and/or | 
					
						
							|  |  |  | modify it under the same terms as Perl itself. See perldoc perlartistic. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =cut | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #}}} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # [zo to open fold, zc to close] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | 
					
						
							|  |  |  | 
 |