forked from HPR/hpr-tools
		
	
		
			
	
	
		
			752 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			752 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | #!/usr/bin/env perl | ||
|  | #=============================================================================== | ||
|  | # | ||
|  | #         FILE: fix_relative_links | ||
|  | # | ||
|  | #        USAGE: ./fix_relative_links [options] -episode=N FILE | ||
|  | # | ||
|  | #  DESCRIPTION: Processes an HTML input file, looking for relative URLs. If | ||
|  | #               any are found these are made absolute using the -baseURL=URL | ||
|  | #               option or a default. The intention is to make them into | ||
|  | #               HPR-absolute URLs. | ||
|  | # | ||
|  | #      OPTIONS: --- | ||
|  | # REQUIREMENTS: --- | ||
|  | #         BUGS: --- | ||
|  | #        NOTES: --- | ||
|  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||
|  | #      VERSION: 0.0.3 | ||
|  | #      CREATED: 2022-10-14 11:56:03 | ||
|  | #     REVISION: 2022-10-23 22:12:08 | ||
|  | # | ||
|  | #=============================================================================== | ||
|  | 
 | ||
|  | use v5.16; | ||
|  | use strict; | ||
|  | use warnings; | ||
|  | use utf8; | ||
|  | use feature qw{ postderef say signatures state }; | ||
|  | no warnings qw{ experimental::postderef experimental::signatures }; | ||
|  | 
 | ||
|  | use Carp; | ||
|  | use Getopt::Long; | ||
|  | use Pod::Usage; | ||
|  | 
 | ||
|  | use File::Basename; | ||
|  | 
 | ||
|  | use IO::HTML; | ||
|  | use HTML::TreeBuilder 5 -weak; | ||
|  | use URI; | ||
|  | 
 | ||
|  | use Log::Handler; | ||
|  | 
 | ||
|  | use Data::Dumper; | ||
|  | 
 | ||
|  | # | ||
|  | # Version number (manually incremented) | ||
|  | # | ||
|  | our $VERSION = '0.0.3'; | ||
|  | 
 | ||
|  | # | ||
|  | # Script and directory names | ||
|  | # | ||
|  | ( my $PROG = $0 ) =~ s|.*/||mx; | ||
|  | ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx; | ||
|  | $DIR = '.' unless $DIR; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Declarations | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Constants and other declarations | ||
|  | # | ||
|  | my $basedir = "$ENV{HOME}/HPR/Show_Submission"; | ||
|  | my $logdir  = "$basedir/logs"; | ||
|  | my $logfile = "$logdir/${PROG}.log"; | ||
|  | 
 | ||
|  | # | ||
|  | # Variables, arrays and hashes | ||
|  | # | ||
|  | my ( $DEBUG, $verbose, $silent, $showno, $base_URL, $fragment, $count_only ); | ||
|  | my ( $outfile, $filename, $showdir, $changes, $html ); | ||
|  | 
 | ||
|  | # | ||
|  | # Enable Unicode mode | ||
|  | # | ||
|  | binmode STDOUT, ":encoding(UTF-8)"; | ||
|  | binmode STDERR, ":encoding(UTF-8)"; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Options and arguments | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Option defaults | ||
|  | # | ||
|  | my $DEFDEBUG = 0; | ||
|  | 
 | ||
|  | my %options; | ||
|  | Options( \%options ); | ||
|  | 
 | ||
|  | # | ||
|  | # Default help shows minimal information | ||
|  | # | ||
|  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) | ||
|  |     if ( $options{'help'} ); | ||
|  | 
 | ||
|  | # | ||
|  | # The -documentation or -man option shows the full POD documentation through | ||
|  | # a pager for convenience | ||
|  | # | ||
|  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) | ||
|  |     if ( $options{'documentation'} ); | ||
|  | 
 | ||
|  | # | ||
|  | # Collect options | ||
|  | # | ||
|  | $DEBUG      = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); | ||
|  | $showno     = $options{episode}; | ||
|  | $base_URL   = $options{baseURL}; | ||
|  | $fragment   = ( defined( $options{fragment} ) ? $options{fragment} : 0 ); | ||
|  | $count_only = ( defined( $options{count} )    ? $options{count}    : 0 ); | ||
|  | $outfile    = $options{output}; | ||
|  | 
 | ||
|  | # | ||
|  | # Argument | ||
|  | # | ||
|  | $filename = shift; | ||
|  | 
 | ||
|  | # | ||
|  | # Sanity checks | ||
|  | # | ||
|  | pod2usage( | ||
|  |     -msg     => "$PROG version $VERSION\nShow number missing\n", | ||
|  |     -exitval => 1, | ||
|  |     -verbose => 0 | ||
|  | ) unless $showno; | ||
|  | 
 | ||
|  | pod2usage( | ||
|  |     -msg     => "$PROG version $VERSION\nInput file name missing\n", | ||
|  |     -exitval => 1, | ||
|  |     -verbose => 0 | ||
|  | ) unless $filename; | ||
|  | 
 | ||
|  | # | ||
|  | # Add leading zeroes to the show number if necessary | ||
|  | # | ||
|  | $showno = sprintf( '%04d', $showno ); | ||
|  | 
 | ||
|  | # | ||
|  | # Directories and files specific to this show | ||
|  | # | ||
|  | $showdir = "$basedir/shownotes/hpr$showno"; | ||
|  | 
 | ||
|  | # | ||
|  | # Allow the input filename to be a bare name | ||
|  | # | ||
|  | if ( !-e $filename ) { | ||
|  |     $filename = "$showdir/$filename"; | ||
|  | } | ||
|  | die "Unable to find $filename" unless ( -e $filename ); | ||
|  | 
 | ||
|  | # | ||
|  | # Work on the output file, allowing defaults and substitution points for | ||
|  | # convenience. If there's no outfile we'll just process the HTML and nothing | ||
|  | # more. | ||
|  | # | ||
|  | if ( defined($outfile) ) { | ||
|  |     $outfile = output_file_name( $outfile, $showno, 'hpr%d_new.html' ); | ||
|  |     $outfile = "$showdir/$outfile" if (dirname($outfile) eq '.'); | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # Default base URL | ||
|  | # | ||
|  | unless ($base_URL) { | ||
|  |     $base_URL = "https://hackerpublicradio.org/eps/hpr$showno/"; | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # Base URL must have a trailing '/' | ||
|  | # | ||
|  | $base_URL .= '/' unless ( $base_URL =~ qr{/$} ); | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Set up logging keeping the default log layout except for the date. The format | ||
|  | # is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is | ||
|  | # the message. | ||
|  | #------------------------------------------------------------------------------- | ||
|  | my $log = Log::Handler->new(); | ||
|  | 
 | ||
|  | $log->add( | ||
|  |     file => { | ||
|  |         timeformat => "%Y/%m/%d %H:%M:%S", | ||
|  |         filename   => $logfile, | ||
|  |         minlevel   => 0, | ||
|  |         maxlevel   => 7, | ||
|  |     } | ||
|  | ); | ||
|  | 
 | ||
|  | # | ||
|  | # Log preamble | ||
|  | # | ||
|  | $log->info("Show number: $showno"); | ||
|  | $log->info("Processing: $filename"); | ||
|  | $log->info("Base: $base_URL"); | ||
|  | 
 | ||
|  | # | ||
|  | # Find and change any relative URLs returning the number of changes and the | ||
|  | # altered HTML | ||
|  | # | ||
|  | ( $changes, $html ) | ||
|  |     = find_links_in_file( $filename, $base_URL, $fragment, $count_only ); | ||
|  | 
 | ||
|  | $log->info("Number of changes: $changes"); | ||
|  | 
 | ||
|  | # | ||
|  | # Exit without writing if we're just counting | ||
|  | # | ||
|  | if ($count_only) { | ||
|  |     $log->info("Count only mode"); | ||
|  |     exit $changes; | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # Exit without writing if there were no changes | ||
|  | # | ||
|  | if ($changes == 0) { | ||
|  |     $log->info("No output written"); | ||
|  |     exit $changes; | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # Write output if an output file was specified | ||
|  | # | ||
|  | if ($outfile) { | ||
|  |     write_output( $outfile, $html ); | ||
|  |     $log->info("Changes applied; written to $outfile"); | ||
|  | } | ||
|  | else { | ||
|  |     $log->info("No output written"); | ||
|  | } | ||
|  | 
 | ||
|  | exit $changes; | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: find_links_in_file | ||
|  | #      PURPOSE: Finds relative links in an HTML file | ||
|  | #   PARAMETERS: $filename       the name of the file we're parsing | ||
|  | #               $base_URL       the part of the full URL we'll replace | ||
|  | #               $fragment       Boolean signalling whether to treat the HTML | ||
|  | #                               as a fragment or an entire document | ||
|  | #               $count_only     Boolean signalling that all we want is the | ||
|  | #                               count of relative URLs, no action is to be taken | ||
|  | #      RETURNS: The number of URLs "repaired". | ||
|  | #  DESCRIPTION: | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub find_links_in_file { | ||
|  |     my ( $filename, $base_URL, $fragment, $count_only ) = @_; | ||
|  | 
 | ||
|  |     my ( $base_uri, $tree, $uri_orig, $uri_new ); | ||
|  |     my ( $newlink, $linkedits, $result ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Parse the base URL | ||
|  |     # | ||
|  |     $base_uri = URI->new($base_URL); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Create a tree object | ||
|  |     # | ||
|  |     $tree = HTML::TreeBuilder->new; | ||
|  |     $tree->ignore_unknown(0); | ||
|  |     $tree->no_expand_entities(1); | ||
|  |     $tree->p_strict(1); | ||
|  |     $tree->store_comments(1); | ||
|  |     $tree->warn(1); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Parse the file using IO::HTML to grab it. Die if we fail because then we | ||
|  |     # know this stuff needs some urgent attention. | ||
|  |     # | ||
|  |     $tree->parse_file( html_file($filename) ) | ||
|  |         or die "HTML::TreeBuilder failed to process $filename: $!\n"; | ||
|  | 
 | ||
|  |     $linkedits = 0; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Scan for all anchors and images using the HTML::Element method | ||
|  |     # 'extract_links'. The technique used here is from the HTML::Element man | ||
|  |     # page. | ||
|  |     # | ||
|  |     for ( @{ $tree->extract_links( 'a', 'img' ) } ) { | ||
|  |         my ( $link, $element, $attr, $tag ) = @$_; | ||
|  | 
 | ||
|  |         # | ||
|  |         # Parse the link | ||
|  |         # | ||
|  |         $uri_orig = URI->new($link); | ||
|  | 
 | ||
|  |         # | ||
|  |         # A relative link (presumably) doesn't have a scheme | ||
|  |         # | ||
|  |         unless ( $uri_orig->scheme ) { | ||
|  |             # | ||
|  |             # Original link | ||
|  |             # | ||
|  |             say "Relative link: $link"; | ||
|  | 
 | ||
|  |             # | ||
|  |             # Make the link absolute | ||
|  |             # | ||
|  |             $uri_new = make_absolute( $uri_orig, $base_uri ); | ||
|  |             #            $uri_new = URI->new_abs( $link, $base_URL ); | ||
|  |             $newlink = sprintf( "%s:%s", $uri_new->scheme, $uri_new->opaque ); | ||
|  |             say "Absolute link: $newlink"; | ||
|  | 
 | ||
|  |             # | ||
|  |             # Modify the HTML to make the relative absolute | ||
|  |             # | ||
|  |             if ( $uri_orig->fragment ) { | ||
|  |                 # Not sure if we need to cater for URI fragments, but we'll try it | ||
|  |                 $element->attr( $attr, $newlink . '#' . $uri_orig->fragment ); | ||
|  |             } | ||
|  |             else { | ||
|  |                 $element->attr( $attr, $newlink ); | ||
|  |             } | ||
|  | 
 | ||
|  |             $linkedits++; | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Exit here if we were just asked to count | ||
|  |     # | ||
|  |     return ( $linkedits, undef ) if $count_only; | ||
|  | 
 | ||
|  |     # | ||
|  |     # In 'HTML fragment' mode generate the body part without the <body> tags. | ||
|  |     # | ||
|  |     if ($fragment) { | ||
|  |         my $body = $tree->look_down( _tag => 'body' ); | ||
|  |         ( $result = $body->as_HTML( undef, ' ', {} ) ) | ||
|  |             =~ s{(^<body[^>]*>|</body>$)}{}gi; | ||
|  |     } | ||
|  |     else { | ||
|  |         $result = $tree->as_HTML( undef, ' ', {} ); | ||
|  |     } | ||
|  | 
 | ||
|  |     return ( $linkedits, $result ); | ||
|  | 
 | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: write_output | ||
|  | #      PURPOSE: Write the "repaired" HTML | ||
|  | #   PARAMETERS: $outfile        name of the output file | ||
|  | #               $html           the HTML to write out | ||
|  | #      RETURNS: Nothing | ||
|  | #  DESCRIPTION: | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub write_output { | ||
|  |     my ( $outfile, $html ) = @_; | ||
|  | 
 | ||
|  |     open( my $out, '>:encoding(UTF-8)', $outfile ) | ||
|  |         or die "Unable to open $outfile for output: $!\n"; | ||
|  | 
 | ||
|  |     print $out $html; | ||
|  | 
 | ||
|  |     close($out); | ||
|  | 
 | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: make_absolute | ||
|  | #      PURPOSE: Take a relative URI and a base URI and return the absolute URI | ||
|  | #   PARAMETERS: $relative       relative URL as a URI object | ||
|  | #               $base           base URL as a URI object | ||
|  | #      RETURNS: Absolute URL as a URI object | ||
|  | #  DESCRIPTION: | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub make_absolute { | ||
|  |     my ( $relative, $base ) = @_; | ||
|  | 
 | ||
|  |     my ( %base_path, @relative_path, $absolute ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Chop up the path from the base and store in a hash | ||
|  |     # | ||
|  |     %base_path = map { $_ => 1 } split( '/', $base->path ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Chop up the relative path | ||
|  |     # | ||
|  |     @relative_path = split( '/', $relative->path ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Remove relative path elements if they are in the base | ||
|  |     # | ||
|  |     @relative_path = grep { !exists( $base_path{$_} ) } @relative_path; | ||
|  | 
 | ||
|  |     # | ||
|  |     # If the relative path is empty we assume it's referring to the | ||
|  |     # 'index.html' file. | ||
|  |     # | ||
|  |     push( @relative_path, 'index.html' ) unless (@relative_path); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Rebuild the relative path | ||
|  |     # | ||
|  |     $relative->path( join( '/', @relative_path ) ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Return the result of joining relative URL and base URL | ||
|  |     # | ||
|  |     $absolute = URI->new_abs( $relative->as_string, $base->as_string ); | ||
|  | 
 | ||
|  |     return $absolute; | ||
|  | 
 | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: output_file_name | ||
|  | #      PURPOSE: Generate an output file name with three choices | ||
|  | #   PARAMETERS: $optarg         the argument to the option choosing the filename | ||
|  | #               $showno         the show number to add to certain name types | ||
|  | #               $template       a default 'sprintf' template for the name | ||
|  | #      RETURNS: The name of the output file | ||
|  | #  DESCRIPTION: If there's a defined output filename then there are three | ||
|  | #               options: a null string, a plain filename and a substitution | ||
|  | #               string with '%d' sequences. The null string means the user used | ||
|  | #               '-option' without a value, so we want to generate a substitution | ||
|  | #               string. A string with '%d' requires a check to ensure there's | ||
|  | #               the right number, just one. The plain filename needs no more | ||
|  | #               work. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub output_file_name { | ||
|  |     my ( $optarg, $showno, $template ) = @_; | ||
|  | 
 | ||
|  |     my ( $filename, $count ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # We shouldn't be called with a null option argument | ||
|  |     # | ||
|  |     return unless defined($optarg); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Does the option have an argument? | ||
|  |     # | ||
|  |     if ( $optarg =~ /^$/ ) { | ||
|  |         # | ||
|  |         # No argument; use the show number from the -episode=N option | ||
|  |         # | ||
|  |         $filename = sprintf( $template, $showno ); | ||
|  |     } | ||
|  |     elsif ( $optarg =~ /%d/ ) { | ||
|  |         # | ||
|  |         # There's an argument, does it have a '%d' in it? | ||
|  |         # | ||
|  |         $count = () = $optarg =~ /%d/g; | ||
|  |         die "Invalid - too many '%d' sequences in '$optarg'\n" | ||
|  |             if ( $count > 1 ); | ||
|  |         $filename = sprintf( $optarg, $showno ); | ||
|  |     } | ||
|  |     else { | ||
|  |         # | ||
|  |         # It's a plain filename, just return it | ||
|  |         # | ||
|  |         $filename = $optarg; | ||
|  |     } | ||
|  | 
 | ||
|  |     return $filename; | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: coalesce | ||
|  | #      PURPOSE: To find the first defined argument and return it | ||
|  | #   PARAMETERS: Arbitrary number of arguments | ||
|  | #      RETURNS: The first defined argument or undef if there are none | ||
|  | #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never | ||
|  | #               returned when doing so might be a problem. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub coalesce { | ||
|  |     foreach (@_) { | ||
|  |         return $_ if defined($_); | ||
|  |     } | ||
|  |     return undef;    ## no critic | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: _debug | ||
|  | #      PURPOSE: Prints debug reports | ||
|  | #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print | ||
|  | #               $message        Message to print | ||
|  | #      RETURNS: Nothing | ||
|  | #  DESCRIPTION: Outputs a message if $active is true. It removes any trailing | ||
|  | #               newline and then adds one in the 'print' to the caller doesn't | ||
|  | #               have to bother. Prepends the message with 'D> ' to show it's | ||
|  | #               a debug message. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub _debug { | ||
|  |     my ( $active, $message ) = @_; | ||
|  | 
 | ||
|  |     chomp($message); | ||
|  |     print "D> $message\n" if $active; | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: Options | ||
|  | #      PURPOSE: Processes command-line options | ||
|  | #   PARAMETERS: $optref     Hash reference to hold the options | ||
|  | #      RETURNS: Undef | ||
|  | #  DESCRIPTION: Process the options we want to offer. See the documentation | ||
|  | #               for details | ||
|  | #       THROWS: no exceptions | ||
|  | #     COMMENTS: none | ||
|  | #     SEE ALSO: n/a | ||
|  | #=============================================================================== | ||
|  | sub Options { | ||
|  |     my ($optref) = @_; | ||
|  | 
 | ||
|  |     my @options = ( | ||
|  |         "help",      "documentation|man", "debug=i", "episode=i", | ||
|  |         "baseURL=s", "fragment!",         "count!",  "output:s", | ||
|  |     ); | ||
|  | 
 | ||
|  |     if ( !GetOptions( $optref, @options ) ) { | ||
|  |         pod2usage( | ||
|  |             -msg     => "$PROG version $VERSION\n", | ||
|  |             -exitval => 1, | ||
|  |             -verbose => 0 | ||
|  |         ); | ||
|  |     } | ||
|  | 
 | ||
|  |     return; | ||
|  | } | ||
|  | 
 | ||
|  | __END__ | ||
|  | 
 | ||
|  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
|  | #  Application Documentation | ||
|  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
|  | #{{{ | ||
|  | 
 | ||
|  | =head1 NAME | ||
|  | 
 | ||
|  | fix_relative_links - Repair relative URLs in HTML shownotes | ||
|  | 
 | ||
|  | =head1 VERSION | ||
|  | 
 | ||
|  | This documentation refers to fix_relative_links version 0.0.3 | ||
|  | 
 | ||
|  | =head1 USAGE | ||
|  | 
 | ||
|  |     ./fix_relative_links -ep=3705 shownotes/hpr3705/hpr3705.html -fragment | ||
|  | 
 | ||
|  | =head1 REQUIRED ARGUMENTS | ||
|  | 
 | ||
|  | =over 4 | ||
|  | 
 | ||
|  | =item B<filename> | ||
|  | 
 | ||
|  | The name of the file containing the HTML to be repaired. If no path is given | ||
|  | this will be supplied by the script as: | ||
|  | 
 | ||
|  |     ~/HPR/Show_Submission/shownotes/hpr${show}/ | ||
|  | 
 | ||
|  | It is probably wiser to be explicit about the path to the HTML file to be | ||
|  | parsed. | ||
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | =head1 OPTIONS | ||
|  | 
 | ||
|  | =over 8 | ||
|  | 
 | ||
|  | =item B<-help> | ||
|  | 
 | ||
|  | Prints a brief help message describing the usage of the program, and then exits. | ||
|  | 
 | ||
|  | =item B<-documentation> or B<-man> | ||
|  | 
 | ||
|  | Prints the entire documentation for the script in the form of a manual page. | ||
|  | 
 | ||
|  | =item B<-debug=N> | ||
|  | 
 | ||
|  | Causes certain debugging information to be displayed. | ||
|  | 
 | ||
|  |     0   (the default) no debug output | ||
|  |     1    | ||
|  |     2    | ||
|  |     3    | ||
|  | 
 | ||
|  | =item B<-episode=N> | ||
|  | 
 | ||
|  | This option is mandatory and specifies the show number being processed. The | ||
|  | number is used to generate default file names and paths as well as the default | ||
|  | base URL described below. | ||
|  | 
 | ||
|  | =item B<-baseURL=URL> | ||
|  | 
 | ||
|  | This option will default to the foillowing URL if not provided: | ||
|  | 
 | ||
|  |     https://hackerpublicradio.org/eps/hpr${show}/ | ||
|  | 
 | ||
|  | It can be used to define a non-standard URL, such as one at a lower level than | ||
|  | the example above which might contain thumbnail pictures for example. | ||
|  | 
 | ||
|  | =item B<-[no]fragment> | ||
|  | 
 | ||
|  | This Boolean option defines the HTML being parsed and checked as a fragment or | ||
|  | a complete stand-alone document. By default B<-nofragment> is assumed. It is | ||
|  | necessary to use B<fragment> for the case where the HTML shownotes are being | ||
|  | parsed. | ||
|  | 
 | ||
|  | =item B<-[no]count> | ||
|  | 
 | ||
|  | This Boolean option defines whether to simply count the necessary changes or | ||
|  | to apply them to the given HTML file. By default B<-nocount> is assumed, and | ||
|  | changes will be applied. | ||
|  | 
 | ||
|  | =item B<-output[=FILE]> | ||
|  | 
 | ||
|  | This option can be omitted or can be given without the B<FILE> name. If | ||
|  | omitted entirely no output will be written even though the HTML file has been | ||
|  | read and processed. If specified without the output file name the default name | ||
|  | will be B<hpr${show}_new.html>. If no path is specified with the file name | ||
|  | then a default will be generated as: | ||
|  | 
 | ||
|  |     ~/HPR/Show_Submission/shownotes/hpr${show}/hpr${show}_new.html | ||
|  | 
 | ||
|  | The output file name can be given in the form of a B<printf> template such as: | ||
|  | 
 | ||
|  |     hpr%d_new.html | ||
|  | 
 | ||
|  | and the B<%d> will be replaced by the show number given through the | ||
|  | B<-episode=N> option described above. | ||
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | =head1 DESCRIPTION | ||
|  | 
 | ||
|  | The script reads a file of HTML which has either been submitted by an HPR host | ||
|  | as it is or has been generated from one of the markup languages accepted in | ||
|  | the upload form. Most often this file will contain the main notes for a show | ||
|  | and will eventually be saved in the HPR database. | ||
|  | 
 | ||
|  | It is also possible to use the script to process other HTML files submitted | ||
|  | with an HPR show. | ||
|  | 
 | ||
|  | The purpose of the script is to find relative URLs in the HTML and convert | ||
|  | them to absolute ones. The HPR website requests that absolute URLs be used | ||
|  | since then they can be used in the various RSS feeds which are available, but | ||
|  | many hosts forget to follow this request. | ||
|  | 
 | ||
|  | The HTML is parsed using B<HTML::TreeBuilder> and is searched for B<a> or | ||
|  | B<img> tags. These are checked to ensure they contain absolute links, and if | ||
|  | not are converted appropriately using a base URL for the HPR website. | ||
|  | 
 | ||
|  | A count of changes is returned by the script and the converted HTML is written | ||
|  | out to a file if required. The script can be used to see if any conversions | ||
|  | are necessary before making the changes. | ||
|  | 
 | ||
|  | The script is also capable of treating full HTML documents differently from | ||
|  | the HTML fragments that are stored in the HPR database. An option is required | ||
|  | to specify which type of HTML is being processed. | ||
|  | 
 | ||
|  | =head1 DIAGNOSTICS | ||
|  | 
 | ||
|  | Error and warning messages generated by the script. | ||
|  | 
 | ||
|  | =over 4 | ||
|  | 
 | ||
|  | =item B<Unable to find ...> | ||
|  | 
 | ||
|  | Type: fatal | ||
|  | 
 | ||
|  | The script was unable to find the specified input file. | ||
|  | 
 | ||
|  | =item B<HTML::TreeBuilder failed to process ...: ...> | ||
|  | 
 | ||
|  | Type: fatal | ||
|  | 
 | ||
|  | The script attempted to use B<HTML::TreeBuilder> to parse the input file | ||
|  | but failed. The message also contains details of the failure. | ||
|  | 
 | ||
|  | =item B<Unable to open ... for output: ...> | ||
|  | 
 | ||
|  | Type: fatal | ||
|  | 
 | ||
|  | The script attempted to open the requested output file but failed. The reason | ||
|  | for the failure is included in the error message. | ||
|  | 
 | ||
|  | =item B<Invalid - too many '%d' sequences in '...'> | ||
|  | 
 | ||
|  | Type: fatal | ||
|  | 
 | ||
|  | The script attempted to generate a name for the requested output file using | ||
|  | the supplied template, but failed because there were too many B<%d> elements | ||
|  | in the template. Only one should be provided, which will be substituted with | ||
|  | the show number. | ||
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | =head1 DEPENDENCIES | ||
|  | 
 | ||
|  |     Carp | ||
|  |     Data::Dumper | ||
|  |     File::Basename | ||
|  |     Getopt::Long | ||
|  |     HTML::TreeBuilder | ||
|  |     IO::HTML | ||
|  |     Log::Handler | ||
|  |     Pod::Usage | ||
|  |     URI | ||
|  | 
 | ||
|  | =head1 BUGS AND LIMITATIONS | ||
|  | 
 | ||
|  | There are no known bugs in this module. | ||
|  | Please report problems to Dave Morriss (Dave.Morriss@gmail.com) | ||
|  | Patches are welcome. | ||
|  | 
 | ||
|  | =head1 AUTHOR | ||
|  | 
 | ||
|  | Dave Morriss (Dave.Morriss@gmail.com) | ||
|  | 
 | ||
|  | =head1 LICENCE AND COPYRIGHT | ||
|  | 
 | ||
|  | Copyright (c) <year> Dave Morriss (Dave.Morriss@gmail.com). All rights reserved. | ||
|  | 
 | ||
|  | This module is free software; you can redistribute it and/or | ||
|  | modify it under the same terms as Perl itself. See perldoc perlartistic. | ||
|  | 
 | ||
|  | This program is distributed in the hope that it will be useful, | ||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
|  | 
 | ||
|  | =cut | ||
|  | 
 | ||
|  | #}}} | ||
|  | 
 | ||
|  | # [zo to open fold, zc to close] | ||
|  | 
 | ||
|  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | ||
|  | 
 |