hpr-tools/InternetArchive/reformat_html

246 lines
7.0 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env perl
#===============================================================================
#
# FILE: reformat_html
#
# USAGE: ./reformat_html < input.html > output.html
#
# DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes'
# field to the format required in the 'description' field of an
# item on the IA. It reads from STDIN and writes to STDOUT.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.1
# CREATED: 2025-02-09 22:56:30
# REVISION: 2025-02-13 11:13:37
#
#===============================================================================
use v5.36;
use strict;
use warnings;
use feature qw{ say try };
no warnings qw{ experimental::try };
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
use HTML::TreeBuilder 5 -weak;
use HTML::Entities;
#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.1';
#
# Declarations
#
my ($verbose, @notes, $notes, $tree);
#
# Read the input data into an array
#
try {
@notes = <STDIN>;
}
catch ($e) {
warn "Problem reading input HTML; $e";
exit 1;
}
die "No input HTML detected\n" unless @notes;
#
# Turn the array into a scalar
#
$notes = join( '', @notes );
#
# Get ready to parse the array
#
$tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
$tree->store_comments(1); # Necessary?
$tree->warn(1);
#
# Parse HTML to the tree structure
#
$tree->parse_content($notes)
or die "HTML::TreeBuilder failed to parse input HTML: $!\n";
#
# Flatten all <pre> tags and add <br/> tags
#
$notes = flatten_pre($tree);
#
# Deal with non-ASCII
#
$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
#
# Remove all newlines
#
$notes =~ s/\n//g;
#
# Write the end result to the STDOUT
#
say $notes;
exit;
#=== FUNCTION ================================================================
# NAME: flatten_pre
# PURPOSE: Process notes "flattening" <pre> contents
# PARAMETERS: $tree HTML::TreeBuilder object containing parsed and
# partially processed notes
# RETURNS: Processed notes
# DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
# contain some formatting tags like <em> and <code>, but spaces
# and newlines are significant. The Internet Archive upload API
# uses HTTP headers which are text strings without newlines, so
# when these tags are uploaded through this route some
# formatting is lost. What this routine does is parse the
# contents of all <pre> sections in $notes, adding <br/> tags
# to replace newlines. It has to perform a full parse
# since the contents may include HTML tags and these need to be
# passed through intact. It calls the subroutine 'flatten_item' to
# deal with the recursive nature of HTML tags.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub flatten_pre {
my ($tree) = @_;
#
# Find all the <pre> tags
#
my @pre_tags = $tree->look_down( _tag => 'pre', );
#
# Walk the various <pre> elements in the document
#
foreach my $tag (@pre_tags) {
#
# Save the tag and empty the original
#
my $saved = $tag->clone();
$tag->delete_content();
#
# Walk the saved content and rebuild the tag into $atag using the
# nested arrayref structure permitted by HTML::Element for
# convenience (the alternative is a little nasty). See the
# documentation for 'new_from_lol' in HTML::Element.
#
my $atag;
foreach my $item ( @{ $saved->content_array_ref } ) {
push( @$atag, flatten_item($item) );
}
#
# Rebuild the tag from the arrayref we built. We treat the arrayref
# structure we just built as an array because otherwise the top level
# is interpreted as a spurious <null> tag.
#
$tag->push_content(@$atag);
}
#
# Trim out the original notes from the enclosing tags we added earlier
#
my $body = $tree->look_down( _tag => 'body' );
( my $result = $body->as_HTML( undef, ' ', {} ) )
=~ s{(^<body[^>]*>|</body>$)}{}gi;
return $result;
}
#=== FUNCTION ================================================================
# NAME: flatten_item
# PURPOSE: Recursively "flatten" items within the enclosing <pre>
# PARAMETERS: $item an HTML::Element item parsed from the original
# <pre> section
# RETURNS: An arrayref if the last seen item was a tag, otherwise a list
# DESCRIPTION: Since <pre> sections can contain inline elements which change
# the rendering of the text we need to parse these as we add
# <br/> tags. This routine does this by recursively descending
# through the contents. A common tag sequence is <pre><code> for
# scripts and the like. This routine deals with such sequences.
# It expects to receive the contents in sequence and builds the
# result as a nested arrayref structure.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub flatten_item {
my ($item) = @_;
return unless defined($item);
my ( @result, %attr );
#
# Is it a sub-tag or non-tag content?
#
if ( ref($item) ) {
#
# It's a tag. Save the tag name and any attributes and recurse into
# it. Return an arrayref
#
push( @result, $item->tag() );
%attr = $item->all_external_attr();
push( @result, \%attr ) if %attr;
for my $child ( $item->content_list() ) {
push( @result, flatten_item($child) );
}
return \@result;
}
else {
#
# It's non-tag content. Join the lines with <br/> tags. Return an
# array (since this is a simple list).
#
# Note that we split with a LIMIT of -1 which causes any trailing list
# items to be returned; default behaviour is to drop them.
#
$item =~ s/\r//g;
my @content = split( /\n/, $item, -1 );
if (@content) {
#
# Remove a leading blank line - usually the result of
# a "<pre>'NL'text" sequence
#
shift(@content) if ( $content[0] =~ /^\s*$/ );
#
# Join back the lines with <br/> tags between them.
#
foreach my $txt (@content) {
push( @result, $txt, ['br'] );
}
#
# Remove the <br/> at the end, it's spurious
#
pop(@result);
}
return (@result);
}
}
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker