#!/usr/bin/env perl #=============================================================================== # # FILE: fix_relative_links # # USAGE: ./fix_relative_links [options] -episode=N FILE # # DESCRIPTION: Processes an HTML input file, looking for relative URLs. If # any are found these are made absolute using the -baseURL=URL # option or a default. The intention is to make them into # HPR-absolute URLs. # # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com # VERSION: 0.0.3 # CREATED: 2022-10-14 11:56:03 # REVISION: 2022-10-23 22:12:08 # #=============================================================================== use v5.16; use strict; use warnings; use utf8; use feature qw{ say state }; use Carp; use Getopt::Long; use Pod::Usage; use File::Basename; use IO::HTML; use HTML::TreeBuilder 5 -weak; use URI; use Log::Handler; use Data::Dumper; # # Version number (manually incremented) # our $VERSION = '0.0.3'; # # Script and directory names # ( my $PROG = $0 ) =~ s|.*/||mx; ( my $DIR = $0 ) =~ s|/?[^/]*$||mx; $DIR = '.' unless $DIR; #------------------------------------------------------------------------------- # Declarations #------------------------------------------------------------------------------- # # Constants and other declarations # my $basedir = "$ENV{HOME}/HPR/Show_Submission"; my $logdir = "$basedir/logs"; my $logfile = "$logdir/${PROG}.log"; # # Variables, arrays and hashes # my ( $DEBUG, $verbose, $silent, $showno, $base_URL, $fragment, $count_only ); my ( $outfile, $filename, $showdir, $changes, $html ); # # Enable Unicode mode # binmode STDOUT, ":encoding(UTF-8)"; binmode STDERR, ":encoding(UTF-8)"; #------------------------------------------------------------------------------- # Options and arguments #------------------------------------------------------------------------------- # # Option defaults # my $DEFDEBUG = 0; my %options; Options( \%options ); # # Default help shows minimal information # pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) if ( $options{'help'} ); # # The -documentation or -man option shows the full POD documentation through # a pager for convenience # pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) if ( $options{'documentation'} ); # # Collect options # $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); $showno = $options{episode}; $base_URL = $options{baseURL}; $fragment = ( defined( $options{fragment} ) ? $options{fragment} : 0 ); $count_only = ( defined( $options{count} ) ? $options{count} : 0 ); $outfile = $options{output}; # # Argument # $filename = shift; # # Sanity checks # pod2usage( -msg => "$PROG version $VERSION\nShow number missing\n", -exitval => 1, -verbose => 0 ) unless $showno; pod2usage( -msg => "$PROG version $VERSION\nInput file name missing\n", -exitval => 1, -verbose => 0 ) unless $filename; # # Add leading zeroes to the show number if necessary # $showno = sprintf( '%04d', $showno ); # # Directories and files specific to this show # $showdir = "$basedir/shownotes/hpr$showno"; # # Allow the input filename to be a bare name # if ( !-e $filename ) { $filename = "$showdir/$filename"; } die "Unable to find $filename" unless ( -e $filename ); # # Work on the output file, allowing defaults and substitution points for # convenience. If there's no outfile we'll just process the HTML and nothing # more. # if ( defined($outfile) ) { $outfile = output_file_name( $outfile, $showno, 'hpr%d_new.html' ); $outfile = "$showdir/$outfile" if (dirname($outfile) eq '.'); } # # Default base URL # unless ($base_URL) { $base_URL = "https://hackerpublicradio.org/eps/hpr$showno/"; } # # Base URL must have a trailing '/' # $base_URL .= '/' unless ( $base_URL =~ qr{/$} ); #------------------------------------------------------------------------------- # Set up logging keeping the default log layout except for the date. The format # is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is # the message. #------------------------------------------------------------------------------- my $log = Log::Handler->new(); $log->add( file => { timeformat => "%Y/%m/%d %H:%M:%S", filename => $logfile, minlevel => 0, maxlevel => 7, } ); # # Log preamble # $log->info("Show number: $showno"); $log->info("Processing: $filename"); $log->info("Base: $base_URL"); # # Find and change any relative URLs returning the number of changes and the # altered HTML # ( $changes, $html ) = find_links_in_file( $filename, $base_URL, $fragment, $count_only ); $log->info("Number of changes: $changes"); # # Exit without writing if we're just counting # if ($count_only) { $log->info("Count only mode"); exit $changes; } # # Exit without writing if there were no changes # if ($changes == 0) { $log->info("No output written"); exit $changes; } # # Write output if an output file was specified # if ($outfile) { write_output( $outfile, $html ); $log->info("Changes applied; written to $outfile"); } else { $log->info("No output written"); } exit $changes; #=== FUNCTION ================================================================ # NAME: find_links_in_file # PURPOSE: Finds relative links in an HTML file # PARAMETERS: $filename the name of the file we're parsing # $base_URL the part of the full URL we'll replace # $fragment Boolean signalling whether to treat the HTML # as a fragment or an entire document # $count_only Boolean signalling that all we want is the # count of relative URLs, no action is to be taken # RETURNS: The number of URLs "repaired". # DESCRIPTION: # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub find_links_in_file { my ( $filename, $base_URL, $fragment, $count_only ) = @_; my ( $base_uri, $tree, $uri_orig, $uri_new ); my ( $newlink, $linkedits, $result ); # # Parse the base URL # $base_uri = URI->new($base_URL); # # Create a tree object # $tree = HTML::TreeBuilder->new; $tree->ignore_unknown(0); $tree->no_expand_entities(1); $tree->p_strict(1); $tree->store_comments(1); $tree->warn(1); # # Parse the file using IO::HTML to grab it. Die if we fail because then we # know this stuff needs some urgent attention. # $tree->parse_file( html_file($filename) ) or die "HTML::TreeBuilder failed to process $filename: $!\n"; $linkedits = 0; # # Scan for all anchors and images using the HTML::Element method # 'extract_links'. The technique used here is from the HTML::Element man # page. # for ( @{ $tree->extract_links( 'a', 'img' ) } ) { my ( $link, $element, $attr, $tag ) = @$_; # # Parse the link # $uri_orig = URI->new($link); # # A relative link (presumably) doesn't have a scheme # unless ( $uri_orig->scheme ) { # # Original link # say "Relative link: $link"; # # Make the link absolute # $uri_new = make_absolute( $uri_orig, $base_uri ); # $uri_new = URI->new_abs( $link, $base_URL ); $newlink = sprintf( "%s:%s", $uri_new->scheme, $uri_new->opaque ); say "Absolute link: $newlink"; # # Modify the HTML to make the relative absolute # if ( $uri_orig->fragment ) { # Not sure if we need to cater for URI fragments, but we'll try it $element->attr( $attr, $newlink . '#' . $uri_orig->fragment ); } else { $element->attr( $attr, $newlink ); } $linkedits++; } } # # Exit here if we were just asked to count # return ( $linkedits, undef ) if $count_only; # # In 'HTML fragment' mode generate the body part without the
tags. # if ($fragment) { my $body = $tree->look_down( _tag => 'body' ); ( $result = $body->as_HTML( undef, ' ', {} ) ) =~ s{(^]*>|$)}{}gi; } else { $result = $tree->as_HTML( undef, ' ', {} ); } return ( $linkedits, $result ); } #=== FUNCTION ================================================================ # NAME: write_output # PURPOSE: Write the "repaired" HTML # PARAMETERS: $outfile name of the output file # $html the HTML to write out # RETURNS: Nothing # DESCRIPTION: # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub write_output { my ( $outfile, $html ) = @_; open( my $out, '>:encoding(UTF-8)', $outfile ) or die "Unable to open $outfile for output: $!\n"; print $out $html; close($out); } #=== FUNCTION ================================================================ # NAME: make_absolute # PURPOSE: Take a relative URI and a base URI and return the absolute URI # PARAMETERS: $relative relative URL as a URI object # $base base URL as a URI object # RETURNS: Absolute URL as a URI object # DESCRIPTION: # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub make_absolute { my ( $relative, $base ) = @_; my ( %base_path, @relative_path, $absolute ); # # Chop up the path from the base and store in a hash # %base_path = map { $_ => 1 } split( '/', $base->path ); # # Chop up the relative path # @relative_path = split( '/', $relative->path ); # # Remove relative path elements if they are in the base # @relative_path = grep { !exists( $base_path{$_} ) } @relative_path; # # If the relative path is empty we assume it's referring to the # 'index.html' file. # push( @relative_path, 'index.html' ) unless (@relative_path); # # Rebuild the relative path # $relative->path( join( '/', @relative_path ) ); # # Return the result of joining relative URL and base URL # $absolute = URI->new_abs( $relative->as_string, $base->as_string ); return $absolute; } #=== FUNCTION ================================================================ # NAME: output_file_name # PURPOSE: Generate an output file name with three choices # PARAMETERS: $optarg the argument to the option choosing the filename # $showno the show number to add to certain name types # $template a default 'sprintf' template for the name # RETURNS: The name of the output file # DESCRIPTION: If there's a defined output filename then there are three # options: a null string, a plain filename and a substitution # string with '%d' sequences. The null string means the user used # '-option' without a value, so we want to generate a substitution # string. A string with '%d' requires a check to ensure there's # the right number, just one. The plain filename needs no more # work. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub output_file_name { my ( $optarg, $showno, $template ) = @_; my ( $filename, $count ); # # We shouldn't be called with a null option argument # return unless defined($optarg); # # Does the option have an argument? # if ( $optarg =~ /^$/ ) { # # No argument; use the show number from the -episode=N option # $filename = sprintf( $template, $showno ); } elsif ( $optarg =~ /%d/ ) { # # There's an argument, does it have a '%d' in it? # $count = () = $optarg =~ /%d/g; die "Invalid - too many '%d' sequences in '$optarg'\n" if ( $count > 1 ); $filename = sprintf( $optarg, $showno ); } else { # # It's a plain filename, just return it # $filename = $optarg; } return $filename; } #=== FUNCTION ================================================================ # NAME: coalesce # PURPOSE: To find the first defined argument and return it # PARAMETERS: Arbitrary number of arguments # RETURNS: The first defined argument or undef if there are none # DESCRIPTION: Just a simple way of ensuring an 'undef' value is never # returned when doing so might be a problem. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub coalesce { foreach (@_) { return $_ if defined($_); } return undef; ## no critic } #=== FUNCTION ================================================================ # NAME: _debug # PURPOSE: Prints debug reports # PARAMETERS: $active Boolean: 1 for print, 0 for no print # $message Message to print # RETURNS: Nothing # DESCRIPTION: Outputs a message if $active is true. It removes any trailing # newline and then adds one in the 'print' to the caller doesn't # have to bother. Prepends the message with 'D> ' to show it's # a debug message. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub _debug { my ( $active, $message ) = @_; chomp($message); print "D> $message\n" if $active; } #=== FUNCTION ================================================================ # NAME: Options # PURPOSE: Processes command-line options # PARAMETERS: $optref Hash reference to hold the options # RETURNS: Undef # DESCRIPTION: Process the options we want to offer. See the documentation # for details # THROWS: no exceptions # COMMENTS: none # SEE ALSO: n/a #=============================================================================== sub Options { my ($optref) = @_; my @options = ( "help", "documentation|man", "debug=i", "episode=i", "baseURL=s", "fragment!", "count!", "output:s", ); if ( !GetOptions( $optref, @options ) ) { pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ); } return; } __END__ #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Application Documentation #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #{{{ =head1 NAME fix_relative_links - Repair relative URLs in HTML shownotes =head1 VERSION This documentation refers to fix_relative_links version 0.0.3 =head1 USAGE ./fix_relative_links -ep=3705 shownotes/hpr3705/hpr3705.html -fragment =head1 REQUIRED ARGUMENTS =over 4 =item B