forked from HPR/hpr-tools
		
	
		
			
				
	
	
		
			752 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			752 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env perl
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: fix_relative_links
 | |
| #
 | |
| #        USAGE: ./fix_relative_links [options] -episode=N FILE
 | |
| #
 | |
| #  DESCRIPTION: Processes an HTML input file, looking for relative URLs. If
 | |
| #               any are found these are made absolute using the -baseURL=URL
 | |
| #               option or a default. The intention is to make them into
 | |
| #               HPR-absolute URLs.
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.0.3
 | |
| #      CREATED: 2022-10-14 11:56:03
 | |
| #     REVISION: 2022-10-23 22:12:08
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| use v5.16;
 | |
| use strict;
 | |
| use warnings;
 | |
| use utf8;
 | |
| use feature qw{ postderef say signatures state };
 | |
| no warnings qw{ experimental::postderef experimental::signatures };
 | |
| 
 | |
| use Carp;
 | |
| use Getopt::Long;
 | |
| use Pod::Usage;
 | |
| 
 | |
| use File::Basename;
 | |
| 
 | |
| use IO::HTML;
 | |
| use HTML::TreeBuilder 5 -weak;
 | |
| use URI;
 | |
| 
 | |
| use Log::Handler;
 | |
| 
 | |
| use Data::Dumper;
 | |
| 
 | |
#
# Version number (manually incremented)
#
our $VERSION = '0.0.3';

#
# Script name ($PROG) and the directory it was invoked from ($DIR), both
# derived from $0
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations. Everything lives below the user's
# ~/HPR/Show_Submission directory.
#
my $basedir = "$ENV{HOME}/HPR/Show_Submission";
my $logdir  = "$basedir/logs";
my $logfile = "$logdir/${PROG}.log";

#
# Variables, arrays and hashes
#
my ( $DEBUG, $verbose, $silent, $showno, $base_URL, $fragment, $count_only );
my ( $outfile, $filename, $showdir, $changes, $html );

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Option defaults
#
my $DEFDEBUG = 0;

my %options;
Options( \%options );

#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );

#
# Collect options, falling back to the defaults when an option was not given.
# $outfile stays undefined when -output was not used at all and is an empty
# string when -output was given without a value.
#
$DEBUG      = $options{debug} // $DEFDEBUG;
$showno     = $options{episode};
$base_URL   = $options{baseURL};
$fragment   = $options{fragment} // 0;
$count_only = $options{count} // 0;
$outfile    = $options{output};

#
# Argument: the HTML file to process
#
$filename = shift;

#
# Sanity checks
#
pod2usage(
    -msg     => "$PROG version $VERSION\nShow number missing\n",
    -exitval => 1,
    -verbose => 0
) unless $showno;

pod2usage(
    -msg     => "$PROG version $VERSION\nInput file name missing\n",
    -exitval => 1,
    -verbose => 0
) unless $filename;

#
# Add leading zeroes to the show number if necessary
#
$showno = sprintf( '%04d', $showno );

#
# Directories and files specific to this show
#
$showdir = "$basedir/shownotes/hpr$showno";

#
# Allow the input filename to be a bare name: if it isn't found as given, look
# for it in the show's directory
#
if ( !-e $filename ) {
    $filename = "$showdir/$filename";
}
die "Unable to find $filename" unless ( -e $filename );

#
# Work on the output file, allowing defaults and substitution points for
# convenience. If there's no outfile we'll just process the HTML and nothing
# more.
#
# The default template uses '%04d' rather than '%d' because $showno is
# already zero-padded and a plain '%d' would turn '0042' back into '42',
# giving a file name that doesn't match the 'hpr0042' directory it lives in.
#
if ( defined($outfile) ) {
    $outfile = output_file_name( $outfile, $showno, 'hpr%04d_new.html' );
    $outfile = "$showdir/$outfile" if ( dirname($outfile) eq '.' );
}

#
# Default base URL
#
unless ($base_URL) {
    $base_URL = "https://hackerpublicradio.org/eps/hpr$showno/";
}

#
# Base URL must have a trailing '/'
#
$base_URL .= '/' unless ( $base_URL =~ m{/$} );
 | |
| 
 | |
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date. The format
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m'
# is the message.
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

$log->add(
    file => {
        timeformat => "%Y/%m/%d %H:%M:%S",
        filename   => $logfile,
        minlevel   => 0,
        maxlevel   => 7,
    }
);

#
# Log preamble
#
$log->info("Show number: $showno");
$log->info("Processing: $filename");
$log->info("Base: $base_URL");

#
# Find and change any relative URLs returning the number of changes and the
# altered HTML
#
( $changes, $html )
    = find_links_in_file( $filename, $base_URL, $fragment, $count_only );

$log->info("Number of changes: $changes");

#
# The script reports the number of changes through its exit status, but only
# the low 8 bits of that value reach the parent process. Without a cap, 256
# changes would be reported as 0 ("nothing to do"), so clamp it at 255.
#
my $exit_status = $changes > 255 ? 255 : $changes;

#
# Exit without writing if we're just counting
#
if ($count_only) {
    $log->info("Count only mode");
    exit $exit_status;
}

#
# Exit without writing if there were no changes
#
if ( $changes == 0 ) {
    $log->info("No output written");
    exit $exit_status;
}

#
# Write output if an output file was specified
#
if ($outfile) {
    write_output( $outfile, $html );
    $log->info("Changes applied; written to $outfile");
}
else {
    $log->info("No output written");
}

exit $exit_status;
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: find_links_in_file
#      PURPOSE: Finds relative links in an HTML file and makes them absolute
#   PARAMETERS: $filename       the name of the file we're parsing
#               $base_URL       the base URL that relative links are resolved
#                               against
#               $fragment       Boolean signalling whether to treat the HTML
#                               as a fragment or an entire document
#               $count_only     Boolean signalling that all we want is the
#                               count of relative URLs; no HTML is returned
#      RETURNS: A two-element list: the number of URLs "repaired" and the
#               resulting HTML as a string. The second element is undef when
#               $count_only is true.
#  DESCRIPTION: Parses the file into an HTML::TreeBuilder tree and walks the
#               links found in 'a' and 'img' elements. Any link without a URI
#               scheme is regarded as relative; it is reported on STDOUT,
#               converted with 'make_absolute' and written back into the tree.
#               In fragment mode only the contents of the <body> element are
#               returned, otherwise the whole document is.
#       THROWS: Dies if HTML::TreeBuilder fails to parse the file
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub find_links_in_file {
    my ( $filename, $base_URL, $fragment, $count_only ) = @_;

    #
    # Parse the base URL
    #
    my $base_uri = URI->new($base_URL);

    #
    # Create a tree object, configured to keep unknown tags and comments and
    # to leave entities alone so the HTML is disturbed as little as possible
    #
    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->no_expand_entities(1);
    $tree->p_strict(1);
    $tree->store_comments(1);
    $tree->warn(1);

    #
    # Parse the file using IO::HTML to grab it. Die if we fail because then we
    # know this stuff needs some urgent attention.
    #
    $tree->parse_file( html_file($filename) )
        or die "HTML::TreeBuilder failed to process $filename: $!\n";

    my $linkedits = 0;

    #
    # Scan for all anchors and images using the HTML::Element method
    # 'extract_links'. The technique used here is from the HTML::Element man
    # page. Each entry holds the link text, the element it belongs to and the
    # name of the attribute holding it.
    #
    for my $found ( @{ $tree->extract_links( 'a', 'img' ) } ) {
        my ( $link, $element, $attr ) = @$found;

        #
        # Parse the link. A relative link (presumably) doesn't have a scheme,
        # so anything with one is left untouched.
        #
        my $uri_orig = URI->new($link);
        next if $uri_orig->scheme;

        say "Relative link: $link";

        #
        # Make the link absolute. The string is built from the scheme and the
        # opaque part, which leaves out any fragment.
        #
        my $uri_new = make_absolute( $uri_orig, $base_uri );
        my $newlink = sprintf( "%s:%s", $uri_new->scheme, $uri_new->opaque );
        say "Absolute link: $newlink";

        #
        # Put back the fragment of the original link, if there was one. The
        # test is for a defined, non-empty value rather than a true one so
        # that a fragment of '0' isn't thrown away.
        #
        my $link_fragment = $uri_orig->fragment;
        if ( defined($link_fragment) && length($link_fragment) ) {
            $newlink .= '#' . $link_fragment;
        }

        #
        # Modify the HTML to make the relative absolute
        #
        $element->attr( $attr, $newlink );

        $linkedits++;
    }

    #
    # Exit here if we were just asked to count
    #
    return ( $linkedits, undef ) if $count_only;

    #
    # In 'HTML fragment' mode generate the body part without the <body> tags.
    #
    my $result;
    if ($fragment) {
        my $body = $tree->look_down( _tag => 'body' );
        ( $result = $body->as_HTML( undef, ' ', {} ) )
            =~ s{(^<body[^>]*>|</body>$)}{}gi;
    }
    else {
        $result = $tree->as_HTML( undef, ' ', {} );
    }

    return ( $linkedits, $result );

}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: write_output
#      PURPOSE: Write the "repaired" HTML
#   PARAMETERS: $outfile        name of the output file
#               $html           the HTML to write out
#      RETURNS: Nothing
#  DESCRIPTION: Creates (or overwrites) the output file and writes the HTML to
#               it encoded as UTF-8. The 'print' and 'close' are checked as
#               well as the 'open' because errors with buffered output, such
#               as a full disk, may not show up until the data is flushed.
#       THROWS: Dies if the file can't be opened, written or closed
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub write_output {
    my ( $outfile, $html ) = @_;

    open( my $out, '>:encoding(UTF-8)', $outfile )
        or die "Unable to open $outfile for output: $!\n";

    print {$out} $html
        or die "Unable to write to $outfile: $!\n";

    close($out)
        or die "Unable to close $outfile: $!\n";

    return;

}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: make_absolute
#      PURPOSE: Combine a relative URI with a base URI to give an absolute URI
#   PARAMETERS: $relative       relative URL as a URI object; its path is
#                               rewritten in place by this function
#               $base           base URL as a URI object
#      RETURNS: Absolute URL as a URI object
#  DESCRIPTION: Any element of the relative path which also occurs somewhere
#               in the base path is discarded, so a link which repeats part of
#               the base (such as the show directory) doesn't end up with it
#               twice. If nothing is left the link is taken to mean the
#               'index.html' file. The trimmed relative URL is then resolved
#               against the base.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub make_absolute {
    my ( $relative, $base ) = @_;

    #
    # Note every element of the base path so membership is a hash lookup
    #
    my %in_base;
    $in_base{$_} = 1 for split( '/', $base->path );

    #
    # Walk the relative path keeping only the elements the base doesn't have
    #
    my @kept;
    for my $segment ( split( '/', $relative->path ) ) {
        push( @kept, $segment ) unless exists( $in_base{$segment} );
    }

    #
    # Nothing left means the link refers to the 'index.html' file
    #
    @kept = ('index.html') unless @kept;

    #
    # Store the trimmed path back in the relative URL, then resolve it
    # against the base URL
    #
    $relative->path( join( '/', @kept ) );

    return URI->new_abs( $relative->as_string, $base->as_string );

}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: output_file_name
#      PURPOSE: Generate an output file name with three choices
#   PARAMETERS: $optarg         the argument to the option choosing the filename
#               $showno         the show number to add to certain name types
#               $template       a default 'sprintf' template for the name
#      RETURNS: The name of the output file, or nothing if $optarg is undefined
#  DESCRIPTION: If there's a defined output filename then there are three
#               options: a null string, a plain filename and a substitution
#               string with '%d' sequences. The null string means the user used
#               '-option' without a value, so we want to generate a name from
#               the default template. A string with '%d' requires a check to
#               ensure there's the right number, just one. The plain filename
#               needs no more work.
#
#               The show number is inserted as a string: a '%d' is turned into
#               '%s' before formatting. The caller may pass a zero-padded
#               number such as '0042', and a numeric '%d' conversion would
#               strip the leading zeroes again.
#       THROWS: Dies if $optarg contains more than one '%d'
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub output_file_name {
    my ( $optarg, $showno, $template ) = @_;

    #
    # We shouldn't be called with a null option argument
    #
    return unless defined($optarg);

    #
    # No argument to the option; build the name from the default template
    # and the show number from the -episode=N option
    #
    if ( $optarg =~ /^$/ ) {
        ( my $format = $template ) =~ s/%d/%s/;
        return sprintf( $format, $showno );
    }

    #
    # There's an argument with a '%d' in it; only one is allowed since there's
    # only one value to substitute
    #
    if ( $optarg =~ /%d/ ) {
        my $count = () = $optarg =~ /%d/g;
        die "Invalid - too many '%d' sequences in '$optarg'\n"
            if ( $count > 1 );

        ( my $format = $optarg ) =~ s/%d/%s/;
        return sprintf( $format, $showno );
    }

    #
    # It's a plain filename, just return it
    #
    return $optarg;
}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: coalesce
#      PURPOSE: Return the first of the arguments which is defined
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument or undef if there are none
#  DESCRIPTION: Useful for supplying a fallback where an 'undef' value would
#               be a problem: list the preferred value first and the fallback
#               last. An explicit 'undef' is returned when nothing is
#               defined, so the caller gets exactly one value back even in
#               list context.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    my @defined_args = grep { defined($_) } @_;

    return $defined_args[0] if @defined_args;
    return undef;    ## no critic
}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: _debug
#      PURPOSE: Prints debug reports
#   PARAMETERS: $active         Boolean: 1 for print, 0 for no print
#               $message        Message to print
#      RETURNS: Nothing
#  DESCRIPTION: Outputs a message if $active is true. Any trailing newline is
#               removed from the message and one is added by the 'print', so
#               the caller doesn't have to bother about whether the message
#               ends with one. The message is prefixed with 'D> ' to show
#               it's a debug message.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub _debug {
    my ( $active, $message ) = @_;

    chomp($message);

    if ($active) {
        print "D> $message\n";
    }
}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Hands the option specifications to Getopt::Long, which stores
#               the values it finds in the hash keyed by option name. If the
#               command line can't be parsed a brief usage message is shown
#               through 'pod2usage' with an exit value of 1. See the
#               documentation for details of the options themselves.
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @specs = qw{
        help        documentation|man
        debug=i     episode=i
        baseURL=s   fragment!
        count!      output:s
    };

    GetOptions( $optref, @specs )
        or pod2usage(
            -msg     => "$PROG version $VERSION\n",
            -exitval => 1,
            -verbose => 0
        );

    return;
}
 | |
| 
 | |
| __END__
 | |
| 
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #  Application Documentation
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #{{{
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| fix_relative_links - Repair relative URLs in HTML shownotes
 | |
| 
 | |
| =head1 VERSION
 | |
| 
 | |
| This documentation refers to fix_relative_links version 0.0.3
 | |
| 
 | |
| =head1 USAGE
 | |
| 
 | |
|     ./fix_relative_links -ep=3705 shownotes/hpr3705/hpr3705.html -fragment
 | |
| 
 | |
| =head1 REQUIRED ARGUMENTS
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<filename>
 | |
| 
 | |
| The name of the file containing the HTML to be repaired. If no path is given
 | |
| this will be supplied by the script as:
 | |
| 
 | |
|     ~/HPR/Show_Submission/shownotes/hpr${show}/
 | |
| 
 | |
| It is probably wiser to be explicit about the path to the HTML file to be
 | |
| parsed.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 OPTIONS
 | |
| 
 | |
| =over 8
 | |
| 
 | |
| =item B<-help>
 | |
| 
 | |
| Prints a brief help message describing the usage of the program, and then exits.
 | |
| 
 | |
| =item B<-documentation> or B<-man>
 | |
| 
 | |
| Prints the entire documentation for the script in the form of a manual page.
 | |
| 
 | |
| =item B<-debug=N>
 | |
| 
 | |
| Causes certain debugging information to be displayed.
 | |
| 
 | |
|     0   (the default) no debug output
 | |
|     1   
 | |
|     2   
 | |
|     3   
 | |
| 
 | |
| =item B<-episode=N>
 | |
| 
 | |
| This option is mandatory and specifies the show number being processed. The
 | |
| number is used to generate default file names and paths as well as the default
 | |
| base URL described below.
 | |
| 
 | |
| =item B<-baseURL=URL>
 | |
| 
 | |
| This option will default to the following URL if not provided:
 | |
| 
 | |
|     https://hackerpublicradio.org/eps/hpr${show}/
 | |
| 
 | |
| It can be used to define a non-standard URL, such as one at a lower level than
 | |
| the example above which might contain thumbnail pictures for example.
 | |
| 
 | |
| =item B<-[no]fragment>
 | |
| 
 | |
| This Boolean option defines the HTML being parsed and checked as a fragment or
 | |
| a complete stand-alone document. By default B<-nofragment> is assumed. It is
 | |
| necessary to use B<-fragment> for the case where the HTML shownotes are being
 | |
| parsed.
 | |
| 
 | |
| =item B<-[no]count>
 | |
| 
 | |
| This Boolean option defines whether to simply count the necessary changes or
 | |
| to apply them to the given HTML file. By default B<-nocount> is assumed, and
 | |
| changes will be applied.
 | |
| 
 | |
| =item B<-output[=FILE]>
 | |
| 
 | |
| This option can be omitted or can be given without the B<FILE> name. If
 | |
| omitted entirely no output will be written even though the HTML file has been
 | |
| read and processed. If specified without the output file name the default name
 | |
| will be B<hpr${show}_new.html>. If no path is specified with the file name
 | |
| then a default will be generated as:
 | |
| 
 | |
|     ~/HPR/Show_Submission/shownotes/hpr${show}/hpr${show}_new.html
 | |
| 
 | |
| The output file name can be given in the form of a B<printf> template such as:
 | |
| 
 | |
|     hpr%d_new.html
 | |
| 
 | |
| and the B<%d> will be replaced by the show number given through the
 | |
| B<-episode=N> option described above.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| The script reads a file of HTML which has either been submitted by an HPR host
 | |
| as it is or has been generated from one of the markup languages accepted in
 | |
| the upload form. Most often this file will contain the main notes for a show
 | |
| and will eventually be saved in the HPR database.
 | |
| 
 | |
| It is also possible to use the script to process other HTML files submitted
 | |
| with an HPR show.
 | |
| 
 | |
| The purpose of the script is to find relative URLs in the HTML and convert
 | |
| them to absolute ones. The HPR website requests that absolute URLs be used
 | |
| since then they can be used in the various RSS feeds which are available, but
 | |
| many hosts forget to follow this request.
 | |
| 
 | |
| The HTML is parsed using B<HTML::TreeBuilder> and is searched for B<a> or
 | |
| B<img> tags. These are checked to ensure they contain absolute links, and if
 | |
| not are converted appropriately using a base URL for the HPR website.
 | |
| 
 | |
| A count of changes is returned by the script and the converted HTML is written
 | |
| out to a file if required. The script can be used to see if any conversions
 | |
| are necessary before making the changes.
 | |
| 
 | |
| The script is also capable of treating full HTML documents differently from
 | |
| the HTML fragments that are stored in the HPR database. An option is required
 | |
| to specify which type of HTML is being processed.
 | |
| 
 | |
| =head1 DIAGNOSTICS
 | |
| 
 | |
| Error and warning messages generated by the script.
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<Unable to find ...>
 | |
| 
 | |
| Type: fatal
 | |
| 
 | |
| The script was unable to find the specified input file.
 | |
| 
 | |
| =item B<HTML::TreeBuilder failed to process ...: ...>
 | |
| 
 | |
| Type: fatal
 | |
| 
 | |
| The script attempted to use B<HTML::TreeBuilder> to parse the input file
 | |
| but failed. The message also contains details of the failure.
 | |
| 
 | |
| =item B<Unable to open ... for output: ...>
 | |
| 
 | |
| Type: fatal
 | |
| 
 | |
| The script attempted to open the requested output file but failed. The reason
 | |
| for the failure is included in the error message.
 | |
| 
 | |
| =item B<Invalid - too many '%d' sequences in '...'>
 | |
| 
 | |
| Type: fatal
 | |
| 
 | |
| The script attempted to generate a name for the requested output file using
 | |
| the supplied template, but failed because there were too many B<%d> elements
 | |
| in the template. Only one should be provided, which will be substituted with
 | |
| the show number.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DEPENDENCIES
 | |
| 
 | |
|     Carp
 | |
|     Data::Dumper
 | |
|     File::Basename
 | |
|     Getopt::Long
 | |
|     HTML::TreeBuilder
 | |
|     IO::HTML
 | |
|     Log::Handler
 | |
|     Pod::Usage
 | |
|     URI
 | |
| 
 | |
| =head1 BUGS AND LIMITATIONS
 | |
| 
 | |
| There are no known bugs in this module.
 | |
| Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
 | |
| Patches are welcome.
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| Dave Morriss (Dave.Morriss@gmail.com)
 | |
| 
 | |
| =head1 LICENCE AND COPYRIGHT
 | |
| 
 | |
| Copyright (c) <year> Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
 | |
| 
 | |
| This module is free software; you can redistribute it and/or
 | |
| modify it under the same terms as Perl itself. See perldoc perlartistic.
 | |
| 
 | |
| This program is distributed in the hope that it will be useful,
 | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| #}}}
 | |
| 
 | |
| # [zo to open fold, zc to close]
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 | |
| 
 |