#!/usr/bin/env perl #=============================================================================== # # FILE: reformat_html # # USAGE: ./reformat_html < input.html > output.html # # DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes' # field to the format required in the 'description' field of an # item on the IA. It reads from STDIN and writes to STDOUT. # # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com # VERSION: 0.0.1 # CREATED: 2025-02-09 22:56:30 # REVISION: 2025-02-13 11:13:37 # #=============================================================================== use v5.36; use strict; use warnings; use feature qw{ say try }; no warnings qw{ experimental::try }; use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 use HTML::TreeBuilder 5 -weak; use HTML::Entities; # # Version number (Incremented by Vim) # our $VERSION = '0.0.1'; # # Declarations # my ($verbose, @notes, $notes, $tree); # # Read the input data into an array # try { @notes = ; } catch ($e) { warn "Problem reading input HTML; $e"; exit 1; } die "No input HTML detected\n" unless @notes; # # Turn the array into a scalar # $notes = join( '', @notes ); # # Get ready to parse the array # $tree = HTML::TreeBuilder->new; $tree->ignore_unknown(0); $tree->no_expand_entities(1); $tree->p_strict(1); $tree->store_comments(1); # Necessary? $tree->warn(1); # # Parse HTML to the tree structure # $tree->parse_content($notes) or die "HTML::TreeBuilder failed to parse input HTML: $!\n"; # # Flatten all

 tags and add 
 tags
#
$notes = flatten_pre($tree);

#
# Deal with non-ASCII
#
$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );

#
# Remove all newlines
#
$notes =~ s/\n//g;

#
# Write the end result to the STDOUT
#
say $notes;

exit;

#===  FUNCTION  ================================================================
#         NAME: flatten_pre
#      PURPOSE: Process notes "flattening"  contents
#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
#                       partially processed notes
#      RETURNS: Processed notes
#  DESCRIPTION: The HTML "" tag encloses preformatted text. It can also
#               contain some formatting tags like  and , but spaces
#               and newlines are significant. The Internet Archive upload API
#               uses HTTP headers which are text strings without newlines, so
#               when these tags are uploaded through this route some
#               formatting is lost. What this routine does is parse the
#               contents of all  sections in $notes, adding 
 tags
#               to replace newlines. It has to perform a full parse
#               since the contents may include HTML tags and these need to be
#               passed through intact. It calls the subroutine 'flatten_item' to
#               deal with the recursive nature of HTML tags.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_pre {
    my ($tree) = @_;

    #
    # Find all the  tags
    #
    my @pre_tags = $tree->look_down( _tag => 'pre', );

    #
    # Walk the various  elements in the document
    #
    foreach my $tag (@pre_tags) {
        #
        # Save the tag and empty the original
        #
        my $saved = $tag->clone();
        $tag->delete_content();

        #
        # Walk the saved content and rebuild the tag into $atag using the
        # nested arrayref structure permitted by HTML::Element for
        # convenience (the alternative is a little nasty). See the
        # documentation for 'new_from_lol' in HTML::Element.
        #
        my $atag;
        foreach my $item ( @{ $saved->content_array_ref } ) {
            push( @$atag, flatten_item($item) );
        }

        #
        # Rebuild the tag from the arrayref we built. We treat the arrayref
        # structure we just built as an array because otherwise the top level
        # is interpreted as a spurious  tag.
        #
        $tag->push_content(@$atag);
    }

    #
    # Trim out the original notes from the enclosing tags we added earlier
    #
    my $body = $tree->look_down( _tag => 'body' );
    ( my $result = $body->as_HTML( undef, ' ', {} ) )
        =~ s{(^]*>|$)}{}gi;

    return $result;

}

#===  FUNCTION  ================================================================
#         NAME: flatten_item
#      PURPOSE: Recursively "flatten" items within the enclosing #   PARAMETERS: $item   an HTML::Element item parsed from the original
#                        section
#      RETURNS: An arrayref if the last seen item was a tag, otherwise a list
#  DESCRIPTION: Since  sections can contain inline elements which change
#               the rendering of the text we need to parse these as we add
#               
 tags. This routine does this by recursively descending
#               through the contents. A common tag sequence is  for
#               scripts and the like. This routine deals with such sequences.
#               It expects to receive the contents in sequence and builds the
#               result as a nested arrayref structure.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_item {
    my ($item) = @_;

    return unless defined($item);

    my ( @result, %attr );

    #
    # Is it a sub-tag or non-tag content?
    #
    if ( ref($item) ) {
        #
        # It's a tag. Save the tag name and any attributes and recurse into
        # it. Return an arrayref
        #
        push( @result, $item->tag() );
        %attr = $item->all_external_attr();
        push( @result, \%attr ) if %attr;
        for my $child ( $item->content_list() ) {
            push( @result, flatten_item($child) );
        }
        return \@result;
    }
    else {
        #
        # It's non-tag content. Join the lines with 
 tags.  Return an
        # array (since this is a simple list).
        #
        # Note that we split with a LIMIT of -1 which causes any trailing list
        # items to be returned; default behaviour is to drop them.
        #
        $item =~ s/\r//g;
        my @content = split( /\n/, $item, -1 );
        if (@content) {
            #
            # Remove a leading blank line - usually the result of
            # a "'NL'text" sequence
            #
            shift(@content) if ( $content[0] =~ /^\s*$/ );

            #
            # Join back the lines with 
 tags between them.
            #
            foreach my $txt (@content) {
                push( @result, $txt, ['br'] );
            }

            #
            # Remove the 
 at the end, it's spurious
            #
            pop(@result);
        }

        return (@result);
    }

}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker