246 lines
		
	
	
		
			7.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			246 lines
		
	
	
		
			7.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | #!/usr/bin/env perl | ||
|  | #=============================================================================== | ||
|  | # | ||
|  | #         FILE: reformat_html | ||
|  | # | ||
|  | #        USAGE: ./reformat_html < input.html > output.html | ||
|  | # | ||
|  | #  DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes' | ||
|  | #               field to the format required in the 'description' field of an | ||
|  | #               item on the IA. It reads from STDIN and writes to STDOUT. | ||
|  | # | ||
|  | #      OPTIONS: --- | ||
|  | # REQUIREMENTS: --- | ||
|  | #         BUGS: --- | ||
|  | #        NOTES: --- | ||
|  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||
|  | #      VERSION: 0.0.1 | ||
|  | #      CREATED: 2025-02-09 22:56:30 | ||
|  | #     REVISION: 2025-02-13 11:13:37 | ||
|  | # | ||
|  | #=============================================================================== | ||
|  | 
 | ||
|  | use v5.36; | ||
|  | use strict; | ||
|  | use warnings; | ||
|  | use feature qw{ say try }; | ||
|  | no warnings qw{ experimental::try }; | ||
|  | 
 | ||
|  | use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 | ||
|  | 
 | ||
|  | use HTML::TreeBuilder 5 -weak; | ||
|  | use HTML::Entities; | ||
|  | 
 | ||
|  | # | ||
|  | # Version number (Incremented by Vim) | ||
|  | # | ||
|  | our $VERSION = '0.0.1'; | ||
|  | 
 | ||
|  | # | ||
|  | # Declarations | ||
|  | # | ||
|  | my ($verbose, @notes, $notes, $tree); | ||
|  | 
 | ||
|  | # | ||
|  | # Read the input data into an array | ||
|  | # | ||
|  | try { | ||
|  |     @notes = <STDIN>; | ||
|  | } | ||
|  | catch ($e) { | ||
|  |     warn "Problem reading input HTML; $e"; | ||
|  |     exit 1; | ||
|  | } | ||
|  | 
 | ||
|  | die "No input HTML detected\n" unless @notes; | ||
|  | 
 | ||
|  | # | ||
|  | # Turn the array into a scalar | ||
|  | # | ||
|  | $notes = join( '', @notes ); | ||
|  | 
 | ||
|  | # | ||
|  | # Get ready to parse the array | ||
|  | # | ||
|  | $tree = HTML::TreeBuilder->new; | ||
|  | $tree->ignore_unknown(0); | ||
|  | $tree->no_expand_entities(1); | ||
|  | $tree->p_strict(1); | ||
|  | $tree->store_comments(1);               # Necessary? | ||
|  | $tree->warn(1); | ||
|  | 
 | ||
|  | # | ||
|  | # Parse HTML to the tree structure | ||
|  | # | ||
|  | $tree->parse_content($notes) | ||
|  |     or die "HTML::TreeBuilder failed to parse input HTML: $!\n"; | ||
|  | 
 | ||
|  | # | ||
|  | # Flatten all <pre> tags and add <br/> tags | ||
|  | # | ||
|  | $notes = flatten_pre($tree); | ||
|  | 
 | ||
|  | # | ||
|  | # Deal with non-ASCII | ||
|  | # | ||
|  | $notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' ); | ||
|  | 
 | ||
|  | # | ||
|  | # Remove all newlines | ||
|  | # | ||
|  | $notes =~ s/\n//g; | ||
|  | 
 | ||
|  | # | ||
|  | # Write the end result to the STDOUT | ||
|  | # | ||
|  | say $notes; | ||
|  | 
 | ||
|  | exit; | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: flatten_pre | ||
|  | #      PURPOSE: Process notes "flattening" <pre> contents | ||
|  | #   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and | ||
|  | #                       partially processed notes | ||
|  | #      RETURNS: Processed notes | ||
|  | #  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also | ||
|  | #               contain some formatting tags like <em> and <code>, but spaces | ||
|  | #               and newlines are significant. The Internet Archive upload API | ||
|  | #               uses HTTP headers which are text strings without newlines, so | ||
|  | #               when these tags are uploaded through this route some | ||
|  | #               formatting is lost. What this routine does is parse the | ||
|  | #               contents of all <pre> sections in $notes, adding <br/> tags | ||
|  | #               to replace newlines. It has to perform a full parse | ||
|  | #               since the contents may include HTML tags and these need to be | ||
|  | #               passed through intact. It calls the subroutine 'flatten_item' to | ||
|  | #               deal with the recursive nature of HTML tags. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub flatten_pre { | ||
|  |     my ($tree) = @_; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Find all the <pre> tags | ||
|  |     # | ||
|  |     my @pre_tags = $tree->look_down( _tag => 'pre', ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Walk the various <pre> elements in the document | ||
|  |     # | ||
|  |     foreach my $tag (@pre_tags) { | ||
|  |         # | ||
|  |         # Save the tag and empty the original | ||
|  |         # | ||
|  |         my $saved = $tag->clone(); | ||
|  |         $tag->delete_content(); | ||
|  | 
 | ||
|  |         # | ||
|  |         # Walk the saved content and rebuild the tag into $atag using the | ||
|  |         # nested arrayref structure permitted by HTML::Element for | ||
|  |         # convenience (the alternative is a little nasty). See the | ||
|  |         # documentation for 'new_from_lol' in HTML::Element. | ||
|  |         # | ||
|  |         my $atag; | ||
|  |         foreach my $item ( @{ $saved->content_array_ref } ) { | ||
|  |             push( @$atag, flatten_item($item) ); | ||
|  |         } | ||
|  | 
 | ||
|  |         # | ||
|  |         # Rebuild the tag from the arrayref we built. We treat the arrayref | ||
|  |         # structure we just built as an array because otherwise the top level | ||
|  |         # is interpreted as a spurious <null> tag. | ||
|  |         # | ||
|  |         $tag->push_content(@$atag); | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Trim out the original notes from the enclosing tags we added earlier | ||
|  |     # | ||
|  |     my $body = $tree->look_down( _tag => 'body' ); | ||
|  |     ( my $result = $body->as_HTML( undef, ' ', {} ) ) | ||
|  |         =~ s{(^<body[^>]*>|</body>$)}{}gi; | ||
|  | 
 | ||
|  |     return $result; | ||
|  | 
 | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: flatten_item | ||
|  | #      PURPOSE: Recursively "flatten" items within the enclosing <pre> | ||
|  | #   PARAMETERS: $item   an HTML::Element item parsed from the original | ||
|  | #                       <pre> section | ||
|  | #      RETURNS: An arrayref if the last seen item was a tag, otherwise a list | ||
|  | #  DESCRIPTION: Since <pre> sections can contain inline elements which change | ||
|  | #               the rendering of the text we need to parse these as we add | ||
|  | #               <br/> tags. This routine does this by recursively descending | ||
|  | #               through the contents. A common tag sequence is <pre><code> for | ||
|  | #               scripts and the like. This routine deals with such sequences. | ||
|  | #               It expects to receive the contents in sequence and builds the | ||
|  | #               result as a nested arrayref structure. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub flatten_item { | ||
|  |     my ($item) = @_; | ||
|  | 
 | ||
|  |     return unless defined($item); | ||
|  | 
 | ||
|  |     my ( @result, %attr ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Is it a sub-tag or non-tag content? | ||
|  |     # | ||
|  |     if ( ref($item) ) { | ||
|  |         # | ||
|  |         # It's a tag. Save the tag name and any attributes and recurse into | ||
|  |         # it. Return an arrayref | ||
|  |         # | ||
|  |         push( @result, $item->tag() ); | ||
|  |         %attr = $item->all_external_attr(); | ||
|  |         push( @result, \%attr ) if %attr; | ||
|  |         for my $child ( $item->content_list() ) { | ||
|  |             push( @result, flatten_item($child) ); | ||
|  |         } | ||
|  |         return \@result; | ||
|  |     } | ||
|  |     else { | ||
|  |         # | ||
|  |         # It's non-tag content. Join the lines with <br/> tags.  Return an | ||
|  |         # array (since this is a simple list). | ||
|  |         # | ||
|  |         # Note that we split with a LIMIT of -1 which causes any trailing list | ||
|  |         # items to be returned; default behaviour is to drop them. | ||
|  |         # | ||
|  |         $item =~ s/\r//g; | ||
|  |         my @content = split( /\n/, $item, -1 ); | ||
|  |         if (@content) { | ||
|  |             # | ||
|  |             # Remove a leading blank line - usually the result of | ||
|  |             # a "<pre>'NL'text" sequence | ||
|  |             # | ||
|  |             shift(@content) if ( $content[0] =~ /^\s*$/ ); | ||
|  | 
 | ||
|  |             # | ||
|  |             # Join back the lines with <br/> tags between them. | ||
|  |             # | ||
|  |             foreach my $txt (@content) { | ||
|  |                 push( @result, $txt, ['br'] ); | ||
|  |             } | ||
|  | 
 | ||
|  |             # | ||
|  |             # Remove the <br/> at the end, it's spurious | ||
|  |             # | ||
|  |             pop(@result); | ||
|  |         } | ||
|  | 
 | ||
|  |         return (@result); | ||
|  |     } | ||
|  | 
 | ||
|  | } | ||
|  | 
 | ||
|  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | ||
|  | 
 |