hpr-tools/Show_Submission/fix_relative_links

#!/usr/bin/env perl
#===============================================================================
#
#         FILE: fix_relative_links
#
#        USAGE: ./fix_relative_links [options] -episode=N FILE
#
#  DESCRIPTION: Processes an HTML input file, looking for relative URLs. If
#               any are found these are made absolute using the -baseURL=URL
#               option or a default. The intention is to make them into
#               HPR-absolute URLs.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.3
#      CREATED: 2022-10-14 11:56:03
#     REVISION: 2022-10-23 22:12:08
#
#===============================================================================

use v5.16;
use strict;
use warnings;
use utf8;
use feature qw{ say state };

use Carp;
use Getopt::Long;
use Pod::Usage;

use File::Basename;

use IO::HTML;
use HTML::TreeBuilder 5 -weak;
use URI;

use Log::Handler;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.0.3';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Show_Submission";
my $logdir  = "$basedir/logs";
my $logfile = "$logdir/${PROG}.log";

#
# Variables, arrays and hashes
#
my ( $DEBUG, $verbose, $silent, $showno, $base_URL, $fragment, $count_only );
my ( $outfile, $filename, $showdir, $changes, $html );

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

my %options;
Options( \%options );

#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );

#
# Collect options
#
$DEBUG      = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
$showno     = $options{episode};
$base_URL   = $options{baseURL};
$fragment   = ( defined( $options{fragment} ) ? $options{fragment} : 0 );
$count_only = ( defined( $options{count} )    ? $options{count}    : 0 );
$outfile    = $options{output};

#
# Argument
#
$filename = shift;

#
# Sanity checks
#
pod2usage(
    -msg     => "$PROG version $VERSION\nShow number missing\n",
    -exitval => 1,
    -verbose => 0
) unless $showno;

pod2usage(
    -msg     => "$PROG version $VERSION\nInput file name missing\n",
    -exitval => 1,
    -verbose => 0
) unless $filename;

#
# Add leading zeroes to the show number if necessary
#
$showno = sprintf( '%04d', $showno );

#
# Directories and files specific to this show
#
$showdir = "$basedir/shownotes/hpr$showno";

#
# Allow the input filename to be a bare name
#
if ( !-e $filename ) {
    $filename = "$showdir/$filename";
}
die "Unable to find $filename" unless ( -e $filename );

#
# Work on the output file, allowing defaults and substitution points for
# convenience. If there's no outfile we'll just process the HTML and nothing
# more.
#
if ( defined($outfile) ) {
    $outfile = output_file_name( $outfile, $showno, 'hpr%d_new.html' );
    $outfile = "$showdir/$outfile" if (dirname($outfile) eq '.');
}

#
# Default base URL
#
unless ($base_URL) {
    $base_URL = "https://hackerpublicradio.org/eps/hpr$showno/";
}

#
# Base URL must have a trailing '/'
#
$base_URL .= '/' unless ( $base_URL =~ qr{/$} );

#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date. The format
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is
# the message.
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

$log->add(
    file => {
        timeformat => "%Y/%m/%d %H:%M:%S",
        filename   => $logfile,
        minlevel   => 0,
        maxlevel   => 7,
    }
);

#
# Log preamble
#
$log->info("Show number: $showno");
$log->info("Processing: $filename");
$log->info("Base: $base_URL");

#
# Find and change any relative URLs returning the number of changes and the
# altered HTML
#
( $changes, $html )
    = find_links_in_file( $filename, $base_URL, $fragment, $count_only );

$log->info("Number of changes: $changes");

#
# Exit without writing if we're just counting
#
if ($count_only) {
    $log->info("Count only mode");
    exit $changes;
}

#
# Exit without writing if there were no changes
#
if ($changes == 0) {
    $log->info("No output written");
    exit $changes;
}

#
# Write output if an output file was specified
#
if ($outfile) {
    write_output( $outfile, $html );
    $log->info("Changes applied; written to $outfile");
}
else {
    $log->info("No output written");
}

exit $changes;

#===  FUNCTION  ================================================================
#         NAME: find_links_in_file
#      PURPOSE: Finds relative links in an HTML file
#   PARAMETERS: $filename       the name of the file we're parsing
#               $base_URL       the part of the full URL we'll replace
#               $fragment       Boolean signalling whether to treat the HTML
#                               as a fragment or an entire document
#               $count_only     Boolean signalling that all we want is the
#                               count of relative URLs, no action is to be taken
#      RETURNS: The number of URLs "repaired".
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub find_links_in_file {
    my ( $filename, $base_URL, $fragment, $count_only ) = @_;

    my ( $base_uri, $tree, $uri_orig, $uri_new );
    my ( $newlink, $linkedits, $result );

    #
    # Parse the base URL
    #
    $base_uri = URI->new($base_URL);

    #
    # Create a tree object
    #
    $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->no_expand_entities(1);
    $tree->p_strict(1);
    $tree->store_comments(1);
    $tree->warn(1);

    #
    # Parse the file using IO::HTML to grab it. Die if we fail because then we
    # know this stuff needs some urgent attention.
    #
    $tree->parse_file( html_file($filename) )
        or die "HTML::TreeBuilder failed to process $filename: $!\n";

    $linkedits = 0;

    #
    # Scan for all anchors and images using the HTML::Element method
    # 'extract_links'. The technique used here is from the HTML::Element man
    # page.
    #
    for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        #
        # Parse the link
        #
        $uri_orig = URI->new($link);

        #
        # A relative link (presumably) doesn't have a scheme
        #
        unless ( $uri_orig->scheme ) {
            #
            # Original link
            #
            say "Relative link: $link";

            #
            # Make the link absolute
            #
            $uri_new = make_absolute( $uri_orig, $base_uri );
            #            $uri_new = URI->new_abs( $link, $base_URL );
            $newlink = sprintf( "%s:%s", $uri_new->scheme, $uri_new->opaque );
            say "Absolute link: $newlink";

            #
            # Modify the HTML to make the relative absolute
            #
            if ( $uri_orig->fragment ) {
                # Not sure if we need to cater for URI fragments, but we'll try it
                $element->attr( $attr, $newlink . '#' . $uri_orig->fragment );
            }
            else {
                $element->attr( $attr, $newlink );
            }

            $linkedits++;
        }
    }

    #
    # Exit here if we were just asked to count
    #
    return ( $linkedits, undef ) if $count_only;

    #
    # In 'HTML fragment' mode generate the body part without the <body> tags.
    #
    if ($fragment) {
        my $body = $tree->look_down( _tag => 'body' );
        ( $result = $body->as_HTML( undef, ' ', {} ) )
            =~ s{(^<body[^>]*>|</body>$)}{}gi;
    }
    else {
        $result = $tree->as_HTML( undef, ' ', {} );
    }

    return ( $linkedits, $result );

}

#===  FUNCTION  ================================================================
#         NAME: write_output
#      PURPOSE: Write the "repaired" HTML
#   PARAMETERS: $outfile        name of the output file
#               $html           the HTML to write out
#      RETURNS: Nothing
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub write_output {
    my ( $outfile, $html ) = @_;

    open( my $out, '>:encoding(UTF-8)', $outfile )
        or die "Unable to open $outfile for output: $!\n";

    print $out $html;

    close($out);

}

#===  FUNCTION  ================================================================
#         NAME: make_absolute
#      PURPOSE: Take a relative URI and a base URI and return the absolute URI
#   PARAMETERS: $relative       relative URL as a URI object
#               $base           base URL as a URI object
#      RETURNS: Absolute URL as a URI object
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_absolute {
    my ( $relative, $base ) = @_;

    my ( %base_path, @relative_path, $absolute );

    #
    # Chop up the path from the base and store in a hash
    #
    %base_path = map { $_ => 1 } split( '/', $base->path );

    #
    # Chop up the relative path
    #
    @relative_path = split( '/', $relative->path );

    #
    # Remove relative path elements if they are in the base
    #
    @relative_path = grep { !exists( $base_path{$_} ) } @relative_path;

    #
    # If the relative path is empty we assume it's referring to the
    # 'index.html' file.
    #
    push( @relative_path, 'index.html' ) unless (@relative_path);

    #
    # Rebuild the relative path
    #
    $relative->path( join( '/', @relative_path ) );

    #
    # Return the result of joining relative URL and base URL
    #
    $absolute = URI->new_abs( $relative->as_string, $base->as_string );

    return $absolute;

}

#===  FUNCTION  ================================================================
#         NAME: output_file_name
#      PURPOSE: Generate an output file name with three choices
#   PARAMETERS: $optarg         the argument to the option choosing the filename
#               $showno         the show number to add to certain name types
#               $template       a default 'sprintf' template for the name
#      RETURNS: The name of the output file
#  DESCRIPTION: If there's a defined output filename then there are three
#               options: a null string, a plain filename and a substitution
#               string with '%d' sequences. The null string means the user used
#               '-option' without a value, so we want to generate a substitution
#               string. A string with '%d' requires a check to ensure there's
#               the right number, just one. The plain filename needs no more
#               work.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub output_file_name {
    my ( $optarg, $showno, $template ) = @_;

    my ( $filename, $count );

    #
    # We shouldn't be called with a null option argument
    #
    return unless defined($optarg);

    #
    # Does the option have an argument?
    #
    if ( $optarg =~ /^$/ ) {
        #
        # No argument; use the show number from the -episode=N option
        #
        $filename = sprintf( $template, $showno );
    }
    elsif ( $optarg =~ /%d/ ) {
        #
        # There's an argument, does it have a '%d' in it?
        #
        $count = () = $optarg =~ /%d/g;
        die "Invalid - too many '%d' sequences in '$optarg'\n"
            if ( $count > 1 );
        $filename = sprintf( $optarg, $showno );
    }
    else {
        #
        # It's a plain filename, just return it
        #
        $filename = $optarg;
    }

    return $filename;
}

#===  FUNCTION  ================================================================
#         NAME: coalesce
#      PURPOSE: To find the first defined argument and return it
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument or undef if there are none
#  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
#               returned when doing so might be a problem.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    foreach (@_) {
        return $_ if defined($_);
    }
    return undef;    ## no critic
}

#===  FUNCTION  ================================================================
#         NAME: _debug
#      PURPOSE: Prints debug reports
#   PARAMETERS: $active         Boolean: 1 for print, 0 for no print
#               $message        Message to print
#      RETURNS: Nothing
#  DESCRIPTION: Outputs a message if $active is true. It removes any trailing
#               newline and then adds one in the 'print' to the caller doesn't
#               have to bother. Prepends the message with 'D> ' to show it's
#               a debug message.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub _debug {
    my ( $active, $message ) = @_;

    chomp($message);
    print "D> $message\n" if $active;
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Process the options we want to offer. See the documentation
#               for details
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",      "documentation|man", "debug=i", "episode=i",
        "baseURL=s", "fragment!",         "count!",  "output:s",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage(
            -msg     => "$PROG version $VERSION\n",
            -exitval => 1,
            -verbose => 0
        );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

fix_relative_links - Repair relative URLs in HTML shownotes

=head1 VERSION

This documentation refers to fix_relative_links version 0.0.3

=head1 USAGE

    ./fix_relative_links -ep=3705 shownotes/hpr3705/hpr3705.html -fragment

=head1 REQUIRED ARGUMENTS

=over 4

=item B<filename>

The name of the file containing the HTML to be repaired. If no path is given
this will be supplied by the script as:

    ~/HPR/Show_Submission/shownotes/hpr${show}/

It is probably wiser to be explicit about the path to the HTML file to be
parsed.

=back

=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message describing the usage of the program, and then exits.

=item B<-documentation> or B<-man>

Prints the entire documentation for the script in the form of a manual page.

=item B<-debug=N>

Causes certain debugging information to be displayed.

    0   (the default) no debug output
    1
    2
    3

=item B<-episode=N>

This option is mandatory and specifies the show number being processed. The
number is used to generate default file names and paths as well as the default
base URL described below.

=item B<-baseURL=URL>

This option will default to the following URL if not provided:

    https://hackerpublicradio.org/eps/hpr${show}/

It can be used to define a non-standard URL, such as one at a lower level than
the example above which might contain thumbnail pictures for example.

=item B<-[no]fragment>

This Boolean option defines the HTML being parsed and checked as a fragment or
a complete stand-alone document. By default B<-nofragment> is assumed. It is
necessary to use B<fragment> for the case where the HTML shownotes are being
parsed.

=item B<-[no]count>

This Boolean option defines whether to simply count the necessary changes or
to apply them to the given HTML file. By default B<-nocount> is assumed, and
changes will be applied.

=item B<-output[=FILE]>

This option can be omitted or can be given without the B<FILE> name. If
omitted entirely no output will be written even though the HTML file has been
read and processed. If specified without the output file name the default name
will be B<hpr${show}_new.html>. If no path is specified with the file name
then a default will be generated as:

    ~/HPR/Show_Submission/shownotes/hpr${show}/hpr${show}_new.html

The output file name can be given in the form of a B<printf> template such as:

    hpr%d_new.html

and the B<%d> will be replaced by the show number given through the
B<-episode=N> option described above.

=back

=head1 DESCRIPTION

The script reads a file of HTML which has either been submitted by an HPR host
as it is or has been generated from one of the markup languages accepted in
the upload form. Most often this file will contain the main notes for a show
and will eventually be saved in the HPR database.

It is also possible to use the script to process other HTML files submitted
with an HPR show.

The purpose of the script is to find relative URLs in the HTML and convert
them to absolute ones. The HPR website requests that absolute URLs be used
since then they can be used in the various RSS feeds which are available, but
many hosts forget to follow this request.

The HTML is parsed using B<HTML::TreeBuilder> and is searched for B<a> or
B<img> tags. These are checked to ensure they contain absolute links, and if
not are converted appropriately using a base URL for the HPR website.

A count of changes is returned by the script and the converted HTML is written
out to a file if required. The script can be used to see if any conversions
are necessary before making the changes.

The script is also capable of treating full HTML documents differently from
the HTML fragments that are stored in the HPR database. An option is required
to specify which type of HTML is being processed.

=head1 DIAGNOSTICS

Error and warning messages generated by the script.

=over 4

=item B<Unable to find ...>

Type: fatal

The script was unable to find the specified input file.

=item B<HTML::TreeBuilder failed to process ...: ...>

Type: fatal

The script attempted to use B<HTML::TreeBuilder> to parse the input file
but failed. The message also contains details of the failure.

=item B<Unable to open ... for output: ...>

Type: fatal

The script attempted to open the requested output file but failed. The reason
for the failure is included in the error message.

=item B<Invalid - too many '%d' sequences in '...'>

Type: fatal

The script attempted to generate a name for the requested output file using
the supplied template, but failed because there were too many B<%d> elements
in the template. Only one should be provided, which will be substituted with
the show number.

=back

=head1 DEPENDENCIES

    Carp
    Data::Dumper
    File::Basename
    Getopt::Long
    HTML::TreeBuilder
    IO::HTML
    Log::Handler
    Pod::Usage
    URI

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.

=head1 AUTHOR

Dave Morriss (Dave.Morriss@gmail.com)

=head1 LICENCE AND COPYRIGHT

Copyright (c) <year> Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut

#}}}

# [zo to open fold, zc to close]

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker