forked from HPR/hpr-tools
752 lines
21 KiB
Plaintext
752 lines
21 KiB
Plaintext
|
#!/usr/bin/env perl
|
||
|
#===============================================================================
|
||
|
#
|
||
|
# FILE: fix_relative_links
|
||
|
#
|
||
|
# USAGE: ./fix_relative_links [options] -episode=N FILE
|
||
|
#
|
||
|
# DESCRIPTION: Processes an HTML input file, looking for relative URLs. If
|
||
|
# any are found these are made absolute using the -baseURL=URL
|
||
|
# option or a default. The intention is to make them into
|
||
|
# HPR-absolute URLs.
|
||
|
#
|
||
|
# OPTIONS: ---
|
||
|
# REQUIREMENTS: ---
|
||
|
# BUGS: ---
|
||
|
# NOTES: ---
|
||
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||
|
# VERSION: 0.0.3
|
||
|
# CREATED: 2022-10-14 11:56:03
|
||
|
# REVISION: 2022-10-23 22:12:08
|
||
|
#
|
||
|
#===============================================================================
|
||
|
|
||
|
use v5.16;
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use utf8;
|
||
|
use feature qw{ postderef say signatures state };
|
||
|
no warnings qw{ experimental::postderef experimental::signatures };
|
||
|
|
||
|
use Carp;
|
||
|
use Getopt::Long;
|
||
|
use Pod::Usage;
|
||
|
|
||
|
use File::Basename;
|
||
|
|
||
|
use IO::HTML;
|
||
|
use HTML::TreeBuilder 5 -weak;
|
||
|
use URI;
|
||
|
|
||
|
use Log::Handler;
|
||
|
|
||
|
use Data::Dumper;
|
||
|
|
||
|
#
|
||
|
# Version number (manually incremented)
|
||
|
#
|
||
|
our $VERSION = '0.0.3';
|
||
|
|
||
|
#
|
||
|
# Script and directory names
|
||
|
#
|
||
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
||
|
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
||
|
$DIR = '.' unless $DIR;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Declarations
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Constants and other declarations
|
||
|
#
|
||
|
my $basedir = "$ENV{HOME}/HPR/Show_Submission";
|
||
|
my $logdir = "$basedir/logs";
|
||
|
my $logfile = "$logdir/${PROG}.log";
|
||
|
|
||
|
#
|
||
|
# Variables, arrays and hashes
|
||
|
#
|
||
|
my ( $DEBUG, $verbose, $silent, $showno, $base_URL, $fragment, $count_only );
|
||
|
my ( $outfile, $filename, $showdir, $changes, $html );
|
||
|
|
||
|
#
|
||
|
# Enable Unicode mode
|
||
|
#
|
||
|
binmode STDOUT, ":encoding(UTF-8)";
|
||
|
binmode STDERR, ":encoding(UTF-8)";
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Options and arguments
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Option defaults
|
||
|
#
|
||
|
my $DEFDEBUG = 0;
|
||
|
|
||
|
my %options;
|
||
|
Options( \%options );
|
||
|
|
||
|
#
|
||
|
# Default help shows minimal information
|
||
|
#
|
||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
|
||
|
if ( $options{'help'} );
|
||
|
|
||
|
#
|
||
|
# The -documentation or -man option shows the full POD documentation through
|
||
|
# a pager for convenience
|
||
|
#
|
||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
|
||
|
if ( $options{'documentation'} );
|
||
|
|
||
|
#
|
||
|
# Collect options
|
||
|
#
|
||
|
$DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
|
||
|
$showno = $options{episode};
|
||
|
$base_URL = $options{baseURL};
|
||
|
$fragment = ( defined( $options{fragment} ) ? $options{fragment} : 0 );
|
||
|
$count_only = ( defined( $options{count} ) ? $options{count} : 0 );
|
||
|
$outfile = $options{output};
|
||
|
|
||
|
#
|
||
|
# Argument
|
||
|
#
|
||
|
$filename = shift;
|
||
|
|
||
|
#
|
||
|
# Sanity checks
|
||
|
#
|
||
|
pod2usage(
|
||
|
-msg => "$PROG version $VERSION\nShow number missing\n",
|
||
|
-exitval => 1,
|
||
|
-verbose => 0
|
||
|
) unless $showno;
|
||
|
|
||
|
pod2usage(
|
||
|
-msg => "$PROG version $VERSION\nInput file name missing\n",
|
||
|
-exitval => 1,
|
||
|
-verbose => 0
|
||
|
) unless $filename;
|
||
|
|
||
|
#
|
||
|
# Add leading zeroes to the show number if necessary
|
||
|
#
|
||
|
$showno = sprintf( '%04d', $showno );
|
||
|
|
||
|
#
|
||
|
# Directories and files specific to this show
|
||
|
#
|
||
|
$showdir = "$basedir/shownotes/hpr$showno";
|
||
|
|
||
|
#
|
||
|
# Allow the input filename to be a bare name
|
||
|
#
|
||
|
if ( !-e $filename ) {
|
||
|
$filename = "$showdir/$filename";
|
||
|
}
|
||
|
die "Unable to find $filename" unless ( -e $filename );
|
||
|
|
||
|
#
|
||
|
# Work on the output file, allowing defaults and substitution points for
|
||
|
# convenience. If there's no outfile we'll just process the HTML and nothing
|
||
|
# more.
|
||
|
#
|
||
|
if ( defined($outfile) ) {
|
||
|
$outfile = output_file_name( $outfile, $showno, 'hpr%d_new.html' );
|
||
|
$outfile = "$showdir/$outfile" if (dirname($outfile) eq '.');
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Default base URL
|
||
|
#
|
||
|
unless ($base_URL) {
|
||
|
$base_URL = "https://hackerpublicradio.org/eps/hpr$showno/";
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Base URL must have a trailing '/'
|
||
|
#
|
||
|
$base_URL .= '/' unless ( $base_URL =~ qr{/$} );
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Set up logging keeping the default log layout except for the date. The format
|
||
|
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is
|
||
|
# the message.
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $log = Log::Handler->new();
|
||
|
|
||
|
$log->add(
|
||
|
file => {
|
||
|
timeformat => "%Y/%m/%d %H:%M:%S",
|
||
|
filename => $logfile,
|
||
|
minlevel => 0,
|
||
|
maxlevel => 7,
|
||
|
}
|
||
|
);
|
||
|
|
||
|
#
|
||
|
# Log preamble
|
||
|
#
|
||
|
$log->info("Show number: $showno");
|
||
|
$log->info("Processing: $filename");
|
||
|
$log->info("Base: $base_URL");
|
||
|
|
||
|
#
|
||
|
# Find and change any relative URLs returning the number of changes and the
|
||
|
# altered HTML
|
||
|
#
|
||
|
( $changes, $html )
|
||
|
= find_links_in_file( $filename, $base_URL, $fragment, $count_only );
|
||
|
|
||
|
$log->info("Number of changes: $changes");
|
||
|
|
||
|
#
|
||
|
# Exit without writing if we're just counting
|
||
|
#
|
||
|
if ($count_only) {
|
||
|
$log->info("Count only mode");
|
||
|
exit $changes;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Exit without writing if there were no changes
|
||
|
#
|
||
|
if ($changes == 0) {
|
||
|
$log->info("No output written");
|
||
|
exit $changes;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Write output if an output file was specified
|
||
|
#
|
||
|
if ($outfile) {
|
||
|
write_output( $outfile, $html );
|
||
|
$log->info("Changes applied; written to $outfile");
|
||
|
}
|
||
|
else {
|
||
|
$log->info("No output written");
|
||
|
}
|
||
|
|
||
|
exit $changes;
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: find_links_in_file
|
||
|
# PURPOSE: Finds relative links in an HTML file
|
||
|
# PARAMETERS: $filename the name of the file we're parsing
|
||
|
# $base_URL the part of the full URL we'll replace
|
||
|
# $fragment Boolean signalling whether to treat the HTML
|
||
|
# as a fragment or an entire document
|
||
|
# $count_only Boolean signalling that all we want is the
|
||
|
# count of relative URLs, no action is to be taken
|
||
|
# RETURNS: The number of URLs "repaired".
|
||
|
# DESCRIPTION:
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub find_links_in_file {
|
||
|
my ( $filename, $base_URL, $fragment, $count_only ) = @_;
|
||
|
|
||
|
my ( $base_uri, $tree, $uri_orig, $uri_new );
|
||
|
my ( $newlink, $linkedits, $result );
|
||
|
|
||
|
#
|
||
|
# Parse the base URL
|
||
|
#
|
||
|
$base_uri = URI->new($base_URL);
|
||
|
|
||
|
#
|
||
|
# Create a tree object
|
||
|
#
|
||
|
$tree = HTML::TreeBuilder->new;
|
||
|
$tree->ignore_unknown(0);
|
||
|
$tree->no_expand_entities(1);
|
||
|
$tree->p_strict(1);
|
||
|
$tree->store_comments(1);
|
||
|
$tree->warn(1);
|
||
|
|
||
|
#
|
||
|
# Parse the file using IO::HTML to grab it. Die if we fail because then we
|
||
|
# know this stuff needs some urgent attention.
|
||
|
#
|
||
|
$tree->parse_file( html_file($filename) )
|
||
|
or die "HTML::TreeBuilder failed to process $filename: $!\n";
|
||
|
|
||
|
$linkedits = 0;
|
||
|
|
||
|
#
|
||
|
# Scan for all anchors and images using the HTML::Element method
|
||
|
# 'extract_links'. The technique used here is from the HTML::Element man
|
||
|
# page.
|
||
|
#
|
||
|
for ( @{ $tree->extract_links( 'a', 'img' ) } ) {
|
||
|
my ( $link, $element, $attr, $tag ) = @$_;
|
||
|
|
||
|
#
|
||
|
# Parse the link
|
||
|
#
|
||
|
$uri_orig = URI->new($link);
|
||
|
|
||
|
#
|
||
|
# A relative link (presumably) doesn't have a scheme
|
||
|
#
|
||
|
unless ( $uri_orig->scheme ) {
|
||
|
#
|
||
|
# Original link
|
||
|
#
|
||
|
say "Relative link: $link";
|
||
|
|
||
|
#
|
||
|
# Make the link absolute
|
||
|
#
|
||
|
$uri_new = make_absolute( $uri_orig, $base_uri );
|
||
|
# $uri_new = URI->new_abs( $link, $base_URL );
|
||
|
$newlink = sprintf( "%s:%s", $uri_new->scheme, $uri_new->opaque );
|
||
|
say "Absolute link: $newlink";
|
||
|
|
||
|
#
|
||
|
# Modify the HTML to make the relative absolute
|
||
|
#
|
||
|
if ( $uri_orig->fragment ) {
|
||
|
# Not sure if we need to cater for URI fragments, but we'll try it
|
||
|
$element->attr( $attr, $newlink . '#' . $uri_orig->fragment );
|
||
|
}
|
||
|
else {
|
||
|
$element->attr( $attr, $newlink );
|
||
|
}
|
||
|
|
||
|
$linkedits++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Exit here if we were just asked to count
|
||
|
#
|
||
|
return ( $linkedits, undef ) if $count_only;
|
||
|
|
||
|
#
|
||
|
# In 'HTML fragment' mode generate the body part without the <body> tags.
|
||
|
#
|
||
|
if ($fragment) {
|
||
|
my $body = $tree->look_down( _tag => 'body' );
|
||
|
( $result = $body->as_HTML( undef, ' ', {} ) )
|
||
|
=~ s{(^<body[^>]*>|</body>$)}{}gi;
|
||
|
}
|
||
|
else {
|
||
|
$result = $tree->as_HTML( undef, ' ', {} );
|
||
|
}
|
||
|
|
||
|
return ( $linkedits, $result );
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: write_output
|
||
|
# PURPOSE: Write the "repaired" HTML
|
||
|
# PARAMETERS: $outfile name of the output file
|
||
|
# $html the HTML to write out
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION:
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub write_output {
|
||
|
my ( $outfile, $html ) = @_;
|
||
|
|
||
|
open( my $out, '>:encoding(UTF-8)', $outfile )
|
||
|
or die "Unable to open $outfile for output: $!\n";
|
||
|
|
||
|
print $out $html;
|
||
|
|
||
|
close($out);
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: make_absolute
|
||
|
# PURPOSE: Take a relative URI and a base URI and return the absolute URI
|
||
|
# PARAMETERS: $relative relative URL as a URI object
|
||
|
# $base base URL as a URI object
|
||
|
# RETURNS: Absolute URL as a URI object
|
||
|
# DESCRIPTION:
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub make_absolute {
|
||
|
my ( $relative, $base ) = @_;
|
||
|
|
||
|
my ( %base_path, @relative_path, $absolute );
|
||
|
|
||
|
#
|
||
|
# Chop up the path from the base and store in a hash
|
||
|
#
|
||
|
%base_path = map { $_ => 1 } split( '/', $base->path );
|
||
|
|
||
|
#
|
||
|
# Chop up the relative path
|
||
|
#
|
||
|
@relative_path = split( '/', $relative->path );
|
||
|
|
||
|
#
|
||
|
# Remove relative path elements if they are in the base
|
||
|
#
|
||
|
@relative_path = grep { !exists( $base_path{$_} ) } @relative_path;
|
||
|
|
||
|
#
|
||
|
# If the relative path is empty we assume it's referring to the
|
||
|
# 'index.html' file.
|
||
|
#
|
||
|
push( @relative_path, 'index.html' ) unless (@relative_path);
|
||
|
|
||
|
#
|
||
|
# Rebuild the relative path
|
||
|
#
|
||
|
$relative->path( join( '/', @relative_path ) );
|
||
|
|
||
|
#
|
||
|
# Return the result of joining relative URL and base URL
|
||
|
#
|
||
|
$absolute = URI->new_abs( $relative->as_string, $base->as_string );
|
||
|
|
||
|
return $absolute;
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: output_file_name
|
||
|
# PURPOSE: Generate an output file name with three choices
|
||
|
# PARAMETERS: $optarg the argument to the option choosing the filename
|
||
|
# $showno the show number to add to certain name types
|
||
|
# $template a default 'sprintf' template for the name
|
||
|
# RETURNS: The name of the output file
|
||
|
# DESCRIPTION: If there's a defined output filename then there are three
|
||
|
# options: a null string, a plain filename and a substitution
|
||
|
# string with '%d' sequences. The null string means the user used
|
||
|
# '-option' without a value, so we want to generate a substitution
|
||
|
# string. A string with '%d' requires a check to ensure there's
|
||
|
# the right number, just one. The plain filename needs no more
|
||
|
# work.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub output_file_name {
|
||
|
my ( $optarg, $showno, $template ) = @_;
|
||
|
|
||
|
my ( $filename, $count );
|
||
|
|
||
|
#
|
||
|
# We shouldn't be called with a null option argument
|
||
|
#
|
||
|
return unless defined($optarg);
|
||
|
|
||
|
#
|
||
|
# Does the option have an argument?
|
||
|
#
|
||
|
if ( $optarg =~ /^$/ ) {
|
||
|
#
|
||
|
# No argument; use the show number from the -episode=N option
|
||
|
#
|
||
|
$filename = sprintf( $template, $showno );
|
||
|
}
|
||
|
elsif ( $optarg =~ /%d/ ) {
|
||
|
#
|
||
|
# There's an argument, does it have a '%d' in it?
|
||
|
#
|
||
|
$count = () = $optarg =~ /%d/g;
|
||
|
die "Invalid - too many '%d' sequences in '$optarg'\n"
|
||
|
if ( $count > 1 );
|
||
|
$filename = sprintf( $optarg, $showno );
|
||
|
}
|
||
|
else {
|
||
|
#
|
||
|
# It's a plain filename, just return it
|
||
|
#
|
||
|
$filename = $optarg;
|
||
|
}
|
||
|
|
||
|
return $filename;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: coalesce
|
||
|
# PURPOSE: To find the first defined argument and return it
|
||
|
# PARAMETERS: Arbitrary number of arguments
|
||
|
# RETURNS: The first defined argument or undef if there are none
|
||
|
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
|
||
|
# returned when doing so might be a problem.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub coalesce {
|
||
|
foreach (@_) {
|
||
|
return $_ if defined($_);
|
||
|
}
|
||
|
return undef; ## no critic
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: _debug
|
||
|
# PURPOSE: Prints debug reports
|
||
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
||
|
# $message Message to print
|
||
|
# RETURNS: Nothing
|
||
|
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
|
||
|
# newline and then adds one in the 'print' to the caller doesn't
|
||
|
# have to bother. Prepends the message with 'D> ' to show it's
|
||
|
# a debug message.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub _debug {
|
||
|
my ( $active, $message ) = @_;
|
||
|
|
||
|
chomp($message);
|
||
|
print "D> $message\n" if $active;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: Options
|
||
|
# PURPOSE: Processes command-line options
|
||
|
# PARAMETERS: $optref Hash reference to hold the options
|
||
|
# RETURNS: Undef
|
||
|
# DESCRIPTION: Process the options we want to offer. See the documentation
|
||
|
# for details
|
||
|
# THROWS: no exceptions
|
||
|
# COMMENTS: none
|
||
|
# SEE ALSO: n/a
|
||
|
#===============================================================================
|
||
|
sub Options {
|
||
|
my ($optref) = @_;
|
||
|
|
||
|
my @options = (
|
||
|
"help", "documentation|man", "debug=i", "episode=i",
|
||
|
"baseURL=s", "fragment!", "count!", "output:s",
|
||
|
);
|
||
|
|
||
|
if ( !GetOptions( $optref, @options ) ) {
|
||
|
pod2usage(
|
||
|
-msg => "$PROG version $VERSION\n",
|
||
|
-exitval => 1,
|
||
|
-verbose => 0
|
||
|
);
|
||
|
}
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
__END__
|
||
|
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
# Application Documentation
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
#{{{
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
fix_relative_links - Repair relative URLs in HTML shownotes
|
||
|
|
||
|
=head1 VERSION
|
||
|
|
||
|
This documentation refers to fix_relative_links version 0.0.3
|
||
|
|
||
|
=head1 USAGE
|
||
|
|
||
|
./fix_relative_links -ep=3705 shownotes/hpr3705/hpr3705.html -fragment
|
||
|
|
||
|
=head1 REQUIRED ARGUMENTS
|
||
|
|
||
|
=over 4
|
||
|
|
||
|
=item B<filename>
|
||
|
|
||
|
The name of the file containing the HTML to be repaired. If no path is given
|
||
|
this will be supplied by the script as:
|
||
|
|
||
|
~/HPR/Show_Submission/shownotes/hpr${show}/
|
||
|
|
||
|
It is probably wiser to be explicit about the path to the HTML file to be
|
||
|
parsed.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head1 OPTIONS
|
||
|
|
||
|
=over 8
|
||
|
|
||
|
=item B<-help>
|
||
|
|
||
|
Prints a brief help message describing the usage of the program, and then exits.
|
||
|
|
||
|
=item B<-documentation> or B<-man>
|
||
|
|
||
|
Prints the entire documentation for the script in the form of a manual page.
|
||
|
|
||
|
=item B<-debug=N>
|
||
|
|
||
|
Causes certain debugging information to be displayed.
|
||
|
|
||
|
0 (the default) no debug output
|
||
|
1
|
||
|
2
|
||
|
3
|
||
|
|
||
|
=item B<-episode=N>
|
||
|
|
||
|
This option is mandatory and specifies the show number being processed. The
|
||
|
number is used to generate default file names and paths as well as the default
|
||
|
base URL described below.
|
||
|
|
||
|
=item B<-baseURL=URL>
|
||
|
|
||
|
This option will default to the foillowing URL if not provided:
|
||
|
|
||
|
https://hackerpublicradio.org/eps/hpr${show}/
|
||
|
|
||
|
It can be used to define a non-standard URL, such as one at a lower level than
|
||
|
the example above which might contain thumbnail pictures for example.
|
||
|
|
||
|
=item B<-[no]fragment>
|
||
|
|
||
|
This Boolean option defines the HTML being parsed and checked as a fragment or
|
||
|
a complete stand-alone document. By default B<-nofragment> is assumed. It is
|
||
|
necessary to use B<fragment> for the case where the HTML shownotes are being
|
||
|
parsed.
|
||
|
|
||
|
=item B<-[no]count>
|
||
|
|
||
|
This Boolean option defines whether to simply count the necessary changes or
|
||
|
to apply them to the given HTML file. By default B<-nocount> is assumed, and
|
||
|
changes will be applied.
|
||
|
|
||
|
=item B<-output[=FILE]>
|
||
|
|
||
|
This option can be omitted or can be given without the B<FILE> name. If
|
||
|
omitted entirely no output will be written even though the HTML file has been
|
||
|
read and processed. If specified without the output file name the default name
|
||
|
will be B<hpr${show}_new.html>. If no path is specified with the file name
|
||
|
then a default will be generated as:
|
||
|
|
||
|
~/HPR/Show_Submission/shownotes/hpr${show}/hpr${show}_new.html
|
||
|
|
||
|
The output file name can be given in the form of a B<printf> template such as:
|
||
|
|
||
|
hpr%d_new.html
|
||
|
|
||
|
and the B<%d> will be replaced by the show number given through the
|
||
|
B<-episode=N> option described above.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head1 DESCRIPTION
|
||
|
|
||
|
The script reads a file of HTML which has either been submitted by an HPR host
|
||
|
as it is or has been generated from one of the markup languages accepted in
|
||
|
the upload form. Most often this file will contain the main notes for a show
|
||
|
and will eventually be saved in the HPR database.
|
||
|
|
||
|
It is also possible to use the script to process other HTML files submitted
|
||
|
with an HPR show.
|
||
|
|
||
|
The purpose of the script is to find relative URLs in the HTML and convert
|
||
|
them to absolute ones. The HPR website requests that absolute URLs be used
|
||
|
since then they can be used in the various RSS feeds which are available, but
|
||
|
many hosts forget to follow this request.
|
||
|
|
||
|
The HTML is parsed using B<HTML::TreeBuilder> and is searched for B<a> or
|
||
|
B<img> tags. These are checked to ensure they contain absolute links, and if
|
||
|
not are converted appropriately using a base URL for the HPR website.
|
||
|
|
||
|
A count of changes is returned by the script and the converted HTML is written
|
||
|
out to a file if required. The script can be used to see if any conversions
|
||
|
are necessary before making the changes.
|
||
|
|
||
|
The script is also capable of treating full HTML documents differently from
|
||
|
the HTML fragments that are stored in the HPR database. An option is required
|
||
|
to specify which type of HTML is being processed.
|
||
|
|
||
|
=head1 DIAGNOSTICS
|
||
|
|
||
|
Error and warning messages generated by the script.
|
||
|
|
||
|
=over 4
|
||
|
|
||
|
=item B<Unable to find ...>
|
||
|
|
||
|
Type: fatal
|
||
|
|
||
|
The script was unable to find the specified input file.
|
||
|
|
||
|
=item B<HTML::TreeBuilder failed to process ...: ...>
|
||
|
|
||
|
Type: fatal
|
||
|
|
||
|
The script attempted to use B<HTML::TreeBuilder> to parse the input file
|
||
|
but failed. The message also contains details of the failure.
|
||
|
|
||
|
=item B<Unable to open ... for output: ...>
|
||
|
|
||
|
Type: fatal
|
||
|
|
||
|
The script attempted to open the requested output file but failed. The reason
|
||
|
for the failure is included in the error message.
|
||
|
|
||
|
=item B<Invalid - too many '%d' sequences in '...'>
|
||
|
|
||
|
Type: fatal
|
||
|
|
||
|
The script attempted to generate a name for the requested output file using
|
||
|
the supplied template, but failed because there were too many B<%d> elements
|
||
|
in the template. Only one should be provided, which will be substituted with
|
||
|
the show number.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head1 DEPENDENCIES
|
||
|
|
||
|
Carp
|
||
|
Data::Dumper
|
||
|
File::Basename
|
||
|
Getopt::Long
|
||
|
HTML::TreeBuilder
|
||
|
IO::HTML
|
||
|
Log::Handler
|
||
|
Pod::Usage
|
||
|
URI
|
||
|
|
||
|
=head1 BUGS AND LIMITATIONS
|
||
|
|
||
|
There are no known bugs in this module.
|
||
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
|
||
|
Patches are welcome.
|
||
|
|
||
|
=head1 AUTHOR
|
||
|
|
||
|
Dave Morriss (Dave.Morriss@gmail.com)
|
||
|
|
||
|
=head1 LICENCE AND COPYRIGHT
|
||
|
|
||
|
Copyright (c) <year> Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
|
||
|
|
||
|
This module is free software; you can redistribute it and/or
|
||
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
||
|
|
||
|
This program is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||
|
|
||
|
=cut
|
||
|
|
||
|
#}}}
|
||
|
|
||
|
# [zo to open fold, zc to close]
|
||
|
|
||
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
||
|
|