hpr-tools/InternetArchive/reformat_html

#!/usr/bin/env perl
#===============================================================================
#
#         FILE: reformat_html
#
#        USAGE: ./reformat_html < input.html > output.html
#
#  DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes'
#               field to the format required in the 'description' field of an
#               item on the IA. It reads from STDIN and writes to STDOUT.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.1
#      CREATED: 2025-02-09 22:56:30
#     REVISION: 2025-02-13 11:13:37
#
#===============================================================================

use v5.36;
use strict;
use warnings;
use feature qw{ say try };
no warnings qw{ experimental::try };

use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8

use HTML::TreeBuilder 5 -weak;
use HTML::Entities;

#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.1';

#
# Declarations
#
my @input_lines;    # Raw lines of HTML as read from STDIN

#
# Read the whole of STDIN, one array element per line. A failure while
# reading is reported and the script exits with a non-zero status.
#
try {
    @input_lines = <STDIN>;
}
catch ($e) {
    warn "Problem reading input HTML; $e";
    exit 1;
}

#
# There is nothing to do if no lines were read
#
unless (@input_lines) {
    die "No input HTML detected\n";
}

#
# Glue the lines together into a single string for the parser
#
my $notes = join( '', @input_lines );

#
# Create the parser and configure it. Each entry is a setter method name
# and the value to give it; they are applied in the order listed.
#
my $tree = HTML::TreeBuilder->new;

my @parser_settings = (
    [ ignore_unknown     => 0 ],
    [ no_expand_entities => 1 ],
    [ p_strict           => 1 ],
    [ store_comments     => 1 ],    # Necessary?
    [ warn               => 1 ],
);

foreach my $setting (@parser_settings) {
    my ( $method, $value ) = @$setting;
    $tree->$method($value);
}

#
# Parse the HTML string (a scalar, not the array) into the tree structure
#
unless ( $tree->parse_content($notes) ) {
    die "HTML::TreeBuilder failed to parse input HTML: $!\n";
}

#
# Flatten all <pre> tags, replacing their newlines with <br/> tags, and get
# the result back as an HTML string
#
my $flattened = flatten_pre($tree);

#
# Deal with non-ASCII. The character class is negated, so everything is
# encoded as an entity except newline, '&' and the printable ASCII range.
#
my $encoded = encode_entities( $flattened, '^\n&\x20-\x25\x27-\x7e' );

#
# Remove all newlines so that the result is a single line
#
( my $single_line = $encoded ) =~ s/\n//g;

#
# Write the end result to the STDOUT
#
say $single_line;

exit;

#===  FUNCTION  ================================================================
#         NAME: flatten_pre
#      PURPOSE: Process notes "flattening" <pre> contents
#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
#                       partially processed notes
#      RETURNS: The contents of the <body> element of $tree as a string of
#               HTML, without the enclosing <body> tags
#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
#               contain some formatting tags like <em> and <code>, but spaces
#               and newlines are significant. The Internet Archive upload API
#               uses HTTP headers which are text strings without newlines, so
#               when these tags are uploaded through this route some
#               formatting is lost. What this routine does is walk the
#               contents of all <pre> elements in $tree, adding <br/> tags
#               to replace newlines. It has to perform a full walk
#               since the contents may include HTML tags and these need to be
#               passed through intact. It calls the subroutine 'flatten_item' to
#               deal with the recursive nature of HTML tags. The tree passed
#               in is modified in place.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_pre {
    my ($tree) = @_;

    #
    # Walk the various <pre> elements in the document
    #
    foreach my $tag ( $tree->look_down( _tag => 'pre' ) ) {
        #
        # Save the tag and empty the original
        #
        my $saved = $tag->clone();
        $tag->delete_content();

        #
        # Walk the saved content and rebuild it as a list using the nested
        # arrayref structure permitted by HTML::Element for convenience (the
        # alternative is a little nasty). See the documentation for
        # 'new_from_lol' in HTML::Element.
        #
        # Only the first child can hold the newline which follows the
        # opening <pre>, so only that one is allowed to lose a leading blank
        # line. A newline at the start of any later text node is a real line
        # break and must become a <br/>.
        #
        my @rebuilt;
        my $is_first = 1;
        foreach my $item ( @{ $saved->content_array_ref } ) {
            push( @rebuilt, flatten_item( $item, $is_first ) );
            $is_first = 0;
        }

        #
        # Rebuild the tag from the list we built. It is passed as a list
        # rather than as a single arrayref because otherwise the top level
        # is interpreted as a spurious <null> tag.
        #
        $tag->push_content(@rebuilt);
    }

    #
    # Serialise the <body> element and trim off the enclosing <body> tags,
    # leaving just the notes themselves
    #
    my $body = $tree->look_down( _tag => 'body' );
    ( my $result = $body->as_HTML( undef, ' ', {} ) )
        =~ s{(^<body[^>]*>|</body>$)}{}gi;

    return $result;

}

#===  FUNCTION  ================================================================
#         NAME: flatten_item
#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
#   PARAMETERS: $item           an HTML::Element item or a text string taken
#                               from the original <pre> section
#               $strip_leading  (optional) true if a leading blank line in
#                               the text may be dropped. This should only be
#                               true for content at the very start of the
#                               <pre>. Defaults to true when omitted.
#      RETURNS: An arrayref if the item was a tag, otherwise a list of text
#               strings separated by ['br'] arrayrefs. Returns nothing if the
#               item is undefined or is an empty string.
#  DESCRIPTION: Since <pre> sections can contain inline elements which change
#               the rendering of the text we need to parse these as we add
#               <br/> tags. This routine does this by recursively descending
#               through the contents. A common tag sequence is <pre><code> for
#               scripts and the like. This routine deals with such sequences.
#               It expects to receive the contents in sequence and builds the
#               result as a nested arrayref structure.
#
#               When descending into a tag the $strip_leading setting is
#               handed on to the first child only; all later children are
#               processed with it turned off. Without this a newline which
#               follows an inline tag (as in "<em>x</em>'NL'text") would be
#               thrown away instead of being turned into a <br/>.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_item {
    my ( $item, $strip_leading ) = @_;

    return unless defined($item);

    #
    # Callers which pass only the item get the leading blank line removed
    #
    $strip_leading //= 1;

    #
    # Is it a sub-tag or non-tag content?
    #
    if ( ref($item) ) {
        #
        # It's a tag. Save the tag name and any attributes and recurse into
        # it. Return an arrayref
        #
        my @result = ( $item->tag() );
        my %attr   = $item->all_external_attr();
        push( @result, \%attr ) if %attr;

        #
        # Only the first child inherits permission to drop a leading blank
        # line
        #
        my $child_strip = $strip_leading;
        for my $child ( $item->content_list() ) {
            push( @result, flatten_item( $child, $child_strip ) );
            $child_strip = 0;
        }
        return \@result;
    }

    #
    # It's non-tag content. Join the lines with <br/> tags.  Return a list
    # (since this is a simple list).
    #
    # Note that we split with a LIMIT of -1 which causes any trailing list
    # items to be returned; default behaviour is to drop them.
    #
    ( my $text = $item ) =~ s/\r//g;
    my @lines = split( /\n/, $text, -1 );
    return unless @lines;

    #
    # Remove a leading blank line - usually the result of
    # a "<pre>'NL'text" sequence. There must be more than one piece,
    # otherwise there was no newline at all and we would be deleting
    # significant white space such as a single space between two tags.
    #
    shift(@lines)
        if ( $strip_leading && @lines > 1 && $lines[0] !~ /\S/ );

    #
    # Follow every line with a <br/> tag then remove the one at the end,
    # it's spurious
    #
    my @result = map { ( $_, ['br'] ) } @lines;
    pop(@result);

    return (@result);

}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker