hpr-tools/Show_Submission/fix_relative_links

752 lines
21 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env perl
#===============================================================================
#
# FILE: fix_relative_links
#
# USAGE: ./fix_relative_links [options] -episode=N FILE
#
# DESCRIPTION: Processes an HTML input file, looking for relative URLs. If
# any are found these are made absolute using the -baseURL=URL
# option or a default. The intention is to make them into
# HPR-absolute URLs.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.3
# CREATED: 2022-10-14 11:56:03
# REVISION: 2022-10-23 22:12:08
#
#===============================================================================
use v5.16;
use strict;
use warnings;
use utf8;
use feature qw{ postderef say signatures state };
no warnings qw{ experimental::postderef experimental::signatures };
use Carp;
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use IO::HTML;
use HTML::TreeBuilder 5 -weak;
use URI;
use Log::Handler;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.3';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Show_Submission";
my $logdir = "$basedir/logs";
my $logfile = "$logdir/${PROG}.log";
#
# Variables, arrays and hashes
#
my ( $DEBUG, $verbose, $silent, $showno, $base_URL, $fragment, $count_only );
my ( $outfile, $filename, $showdir, $changes, $html );
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;
my %options;
Options( \%options );
#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
if ( $options{'help'} );
#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
if ( $options{'documentation'} );
#
# Collect options
#
$DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
$showno = $options{episode};
$base_URL = $options{baseURL};
$fragment = ( defined( $options{fragment} ) ? $options{fragment} : 0 );
$count_only = ( defined( $options{count} ) ? $options{count} : 0 );
$outfile = $options{output};
#
# Argument
#
$filename = shift;
#
# Sanity checks
#
pod2usage(
-msg => "$PROG version $VERSION\nShow number missing\n",
-exitval => 1,
-verbose => 0
) unless $showno;
pod2usage(
-msg => "$PROG version $VERSION\nInput file name missing\n",
-exitval => 1,
-verbose => 0
) unless $filename;
#
# Add leading zeroes to the show number if necessary
#
$showno = sprintf( '%04d', $showno );
#
# Directories and files specific to this show
#
$showdir = "$basedir/shownotes/hpr$showno";
#
# Allow the input filename to be a bare name
#
if ( !-e $filename ) {
$filename = "$showdir/$filename";
}
die "Unable to find $filename" unless ( -e $filename );
#
# Work on the output file, allowing defaults and substitution points for
# convenience. If there's no outfile we'll just process the HTML and nothing
# more.
#
if ( defined($outfile) ) {
$outfile = output_file_name( $outfile, $showno, 'hpr%d_new.html' );
$outfile = "$showdir/$outfile" if (dirname($outfile) eq '.');
}
#
# Default base URL
#
unless ($base_URL) {
$base_URL = "https://hackerpublicradio.org/eps/hpr$showno/";
}
#
# Base URL must have a trailing '/'
#
$base_URL .= '/' unless ( $base_URL =~ qr{/$} );
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date. The format
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is
# the message.
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();
$log->add(
file => {
timeformat => "%Y/%m/%d %H:%M:%S",
filename => $logfile,
minlevel => 0,
maxlevel => 7,
}
);
#
# Log preamble
#
$log->info("Show number: $showno");
$log->info("Processing: $filename");
$log->info("Base: $base_URL");
#
# Find and change any relative URLs returning the number of changes and the
# altered HTML
#
( $changes, $html )
= find_links_in_file( $filename, $base_URL, $fragment, $count_only );
$log->info("Number of changes: $changes");
#
# Exit without writing if we're just counting
#
if ($count_only) {
$log->info("Count only mode");
exit $changes;
}
#
# Exit without writing if there were no changes
#
if ($changes == 0) {
$log->info("No output written");
exit $changes;
}
#
# Write output if an output file was specified
#
if ($outfile) {
write_output( $outfile, $html );
$log->info("Changes applied; written to $outfile");
}
else {
$log->info("No output written");
}
exit $changes;
#=== FUNCTION ================================================================
# NAME: find_links_in_file
# PURPOSE: Finds relative links in an HTML file
# PARAMETERS: $filename the name of the file we're parsing
# $base_URL the part of the full URL we'll replace
# $fragment Boolean signalling whether to treat the HTML
# as a fragment or an entire document
# $count_only Boolean signalling that all we want is the
# count of relative URLs, no action is to be taken
# RETURNS: The number of URLs "repaired".
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub find_links_in_file {
my ( $filename, $base_URL, $fragment, $count_only ) = @_;
my ( $base_uri, $tree, $uri_orig, $uri_new );
my ( $newlink, $linkedits, $result );
#
# Parse the base URL
#
$base_uri = URI->new($base_URL);
#
# Create a tree object
#
$tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
$tree->store_comments(1);
$tree->warn(1);
#
# Parse the file using IO::HTML to grab it. Die if we fail because then we
# know this stuff needs some urgent attention.
#
$tree->parse_file( html_file($filename) )
or die "HTML::TreeBuilder failed to process $filename: $!\n";
$linkedits = 0;
#
# Scan for all anchors and images using the HTML::Element method
# 'extract_links'. The technique used here is from the HTML::Element man
# page.
#
for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
my ( $link, $element, $attr, $tag ) = @$_;
#
# Parse the link
#
$uri_orig = URI->new($link);
#
# A relative link (presumably) doesn't have a scheme
#
unless ( $uri_orig->scheme ) {
#
# Original link
#
say "Relative link: $link";
#
# Make the link absolute
#
$uri_new = make_absolute( $uri_orig, $base_uri );
# $uri_new = URI->new_abs( $link, $base_URL );
$newlink = sprintf( "%s:%s", $uri_new->scheme, $uri_new->opaque );
say "Absolute link: $newlink";
#
# Modify the HTML to make the relative absolute
#
if ( $uri_orig->fragment ) {
# Not sure if we need to cater for URI fragments, but we'll try it
$element->attr( $attr, $newlink . '#' . $uri_orig->fragment );
}
else {
$element->attr( $attr, $newlink );
}
$linkedits++;
}
}
#
# Exit here if we were just asked to count
#
return ( $linkedits, undef ) if $count_only;
#
# In 'HTML fragment' mode generate the body part without the <body> tags.
#
if ($fragment) {
my $body = $tree->look_down( _tag => 'body' );
( $result = $body->as_HTML( undef, ' ', {} ) )
=~ s{(^<body[^>]*>|</body>$)}{}gi;
}
else {
$result = $tree->as_HTML( undef, ' ', {} );
}
return ( $linkedits, $result );
}
#=== FUNCTION ================================================================
# NAME: write_output
# PURPOSE: Write the "repaired" HTML
# PARAMETERS: $outfile name of the output file
# $html the HTML to write out
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub write_output {
my ( $outfile, $html ) = @_;
open( my $out, '>:encoding(UTF-8)', $outfile )
or die "Unable to open $outfile for output: $!\n";
print $out $html;
close($out);
}
#=== FUNCTION ================================================================
# NAME: make_absolute
# PURPOSE: Take a relative URI and a base URI and return the absolute URI
# PARAMETERS: $relative relative URL as a URI object
# $base base URL as a URI object
# RETURNS: Absolute URL as a URI object
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub make_absolute {
my ( $relative, $base ) = @_;
my ( %base_path, @relative_path, $absolute );
#
# Chop up the path from the base and store in a hash
#
%base_path = map { $_ => 1 } split( '/', $base->path );
#
# Chop up the relative path
#
@relative_path = split( '/', $relative->path );
#
# Remove relative path elements if they are in the base
#
@relative_path = grep { !exists( $base_path{$_} ) } @relative_path;
#
# If the relative path is empty we assume it's referring to the
# 'index.html' file.
#
push( @relative_path, 'index.html' ) unless (@relative_path);
#
# Rebuild the relative path
#
$relative->path( join( '/', @relative_path ) );
#
# Return the result of joining relative URL and base URL
#
$absolute = URI->new_abs( $relative->as_string, $base->as_string );
return $absolute;
}
#=== FUNCTION ================================================================
# NAME: output_file_name
# PURPOSE: Generate an output file name with three choices
# PARAMETERS: $optarg the argument to the option choosing the filename
# $showno the show number to add to certain name types
# $template a default 'sprintf' template for the name
# RETURNS: The name of the output file
# DESCRIPTION: If there's a defined output filename then there are three
# options: a null string, a plain filename and a substitution
# string with '%d' sequences. The null string means the user used
# '-option' without a value, so we want to generate a substitution
# string. A string with '%d' requires a check to ensure there's
# the right number, just one. The plain filename needs no more
# work.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub output_file_name {
my ( $optarg, $showno, $template ) = @_;
my ( $filename, $count );
#
# We shouldn't be called with a null option argument
#
return unless defined($optarg);
#
# Does the option have an argument?
#
if ( $optarg =~ /^$/ ) {
#
# No argument; use the show number from the -episode=N option
#
$filename = sprintf( $template, $showno );
}
elsif ( $optarg =~ /%d/ ) {
#
# There's an argument, does it have a '%d' in it?
#
$count = () = $optarg =~ /%d/g;
die "Invalid - too many '%d' sequences in '$optarg'\n"
if ( $count > 1 );
$filename = sprintf( $optarg, $showno );
}
else {
#
# It's a plain filename, just return it
#
$filename = $optarg;
}
return $filename;
}
#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the first defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument or undef if there are none
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
# returned when doing so might be a problem.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub coalesce {
foreach (@_) {
return $_ if defined($_);
}
return undef; ## no critic
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $message Message to print
# RETURNS: Nothing
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
# newline and then adds one in the 'print' to the caller doesn't
# have to bother. Prepends the message with 'D> ' to show it's
# a debug message.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my ( $active, $message ) = @_;
chomp($message);
print "D> $message\n" if $active;
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION: Process the options we want to offer. See the documentation
# for details
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "documentation|man", "debug=i", "episode=i",
"baseURL=s", "fragment!", "count!", "output:s",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage(
-msg => "$PROG version $VERSION\n",
-exitval => 1,
-verbose => 0
);
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
fix_relative_links - Repair relative URLs in HTML shownotes
=head1 VERSION
This documentation refers to fix_relative_links version 0.0.3
=head1 USAGE
./fix_relative_links -ep=3705 shownotes/hpr3705/hpr3705.html -fragment
=head1 REQUIRED ARGUMENTS
=over 4
=item B<filename>
The name of the file containing the HTML to be repaired. If no path is given
this will be supplied by the script as:
~/HPR/Show_Submission/shownotes/hpr${show}/
It is probably wiser to be explicit about the path to the HTML file to be
parsed.
=back
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message describing the usage of the program, and then exits.
=item B<-documentation> or B<-man>
Prints the entire documentation for the script in the form of a manual page.
=item B<-debug=N>
Causes certain debugging information to be displayed.
0 (the default) no debug output
1
2
3
=item B<-episode=N>
This option is mandatory and specifies the show number being processed. The
number is used to generate default file names and paths as well as the default
base URL described below.
=item B<-baseURL=URL>
This option will default to the foillowing URL if not provided:
https://hackerpublicradio.org/eps/hpr${show}/
It can be used to define a non-standard URL, such as one at a lower level than
the example above which might contain thumbnail pictures for example.
=item B<-[no]fragment>
This Boolean option defines the HTML being parsed and checked as a fragment or
a complete stand-alone document. By default B<-nofragment> is assumed. It is
necessary to use B<fragment> for the case where the HTML shownotes are being
parsed.
=item B<-[no]count>
This Boolean option defines whether to simply count the necessary changes or
to apply them to the given HTML file. By default B<-nocount> is assumed, and
changes will be applied.
=item B<-output[=FILE]>
This option can be omitted or can be given without the B<FILE> name. If
omitted entirely no output will be written even though the HTML file has been
read and processed. If specified without the output file name the default name
will be B<hpr${show}_new.html>. If no path is specified with the file name
then a default will be generated as:
~/HPR/Show_Submission/shownotes/hpr${show}/hpr${show}_new.html
The output file name can be given in the form of a B<printf> template such as:
hpr%d_new.html
and the B<%d> will be replaced by the show number given through the
B<-episode=N> option described above.
=back
=head1 DESCRIPTION
The script reads a file of HTML which has either been submitted by an HPR host
as it is or has been generated from one of the markup languages accepted in
the upload form. Most often this file will contain the main notes for a show
and will eventually be saved in the HPR database.
It is also possible to use the script to process other HTML files submitted
with an HPR show.
The purpose of the script is to find relative URLs in the HTML and convert
them to absolute ones. The HPR website requests that absolute URLs be used
since then they can be used in the various RSS feeds which are available, but
many hosts forget to follow this request.
The HTML is parsed using B<HTML::TreeBuilder> and is searched for B<a> or
B<img> tags. These are checked to ensure they contain absolute links, and if
not are converted appropriately using a base URL for the HPR website.
A count of changes is returned by the script and the converted HTML is written
out to a file if required. The script can be used to see if any conversions
are necessary before making the changes.
The script is also capable of treating full HTML documents differently from
the HTML fragments that are stored in the HPR database. An option is required
to specify which type of HTML is being processed.
=head1 DIAGNOSTICS
Error and warning messages generated by the script.
=over 4
=item B<Unable to find ...>
Type: fatal
The script was unable to find the specified input file.
=item B<HTML::TreeBuilder failed to process ...: ...>
Type: fatal
The script attempted to use B<HTML::TreeBuilder> to parse the input file
but failed. The message also contains details of the failure.
=item B<Unable to open ... for output: ...>
Type: fatal
The script attempted to open the requested output file but failed. The reason
for the failure is included in the error message.
=item B<Invalid - too many '%d' sequences in '...'>
Type: fatal
The script attempted to generate a name for the requested output file using
the supplied template, but failed because there were too many B<%d> elements
in the template. Only one should be provided, which will be substituted with
the show number.
=back
=head1 DEPENDENCIES
Carp
Data::Dumper
File::Basename
Getopt::Long
HTML::TreeBuilder
IO::HTML
Log::Handler
Pod::Usage
URI
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com)
=head1 LICENCE AND COPYRIGHT
Copyright (c) <year> Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker