hpr-tools/InternetArchive/reformat_html

#!/usr/bin/env perl
#===============================================================================
#
#         FILE: reformat_html
#
#        USAGE: ./reformat_html < input.html > output.html
#
#  DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes'
#               field to the format required in the 'description' field of an
#               item on the IA. It reads from STDIN and writes to STDOUT.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.1
#      CREATED: 2025-02-09 22:56:30
#     REVISION: 2025-02-13 11:13:37
#
#===============================================================================

use v5.36;
use strict;
use warnings;
use feature qw{ say try };
no warnings qw{ experimental::try };

use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8

use HTML::TreeBuilder 5 -weak;
use HTML::Entities;

#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.1';

#
# Declarations
#
my ($verbose, @notes, $notes, $tree);

#
# Read the input data into an array
#
try {
    @notes = <STDIN>;
}
catch ($e) {
    warn "Problem reading input HTML; $e";
    exit 1;
}

die "No input HTML detected\n" unless @notes;

#
# Turn the array into a scalar
#
$notes = join( '', @notes );

#
# Get ready to parse the array
#
$tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
$tree->store_comments(1);               # Necessary?
$tree->warn(1);

#
# Parse HTML to the tree structure
#
$tree->parse_content($notes)
    or die "HTML::TreeBuilder failed to parse input HTML: $!\n";

#
# Flatten all <pre> tags and add <br/> tags
#
$notes = flatten_pre($tree);

#
# Deal with non-ASCII
#
$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );

#
# Remove all newlines
#
$notes =~ s/\n//g;

#
# Write the end result to the STDOUT
#
say $notes;

exit;

#===  FUNCTION  ================================================================
#         NAME: flatten_pre
#      PURPOSE: Process notes "flattening" <pre> contents
#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
#                       partially processed notes
#      RETURNS: Processed notes
#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
#               contain some formatting tags like <em> and <code>, but spaces
#               and newlines are significant. The Internet Archive upload API
#               uses HTTP headers which are text strings without newlines, so
#               when these tags are uploaded through this route some
#               formatting is lost. What this routine does is parse the
#               contents of all <pre> sections in $notes, adding <br/> tags
#               to replace newlines. It has to perform a full parse
#               since the contents may include HTML tags and these need to be
#               passed through intact. It calls the subroutine 'flatten_item' to
#               deal with the recursive nature of HTML tags.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_pre {
    my ($tree) = @_;

    #
    # Find all the <pre> tags
    #
    my @pre_tags = $tree->look_down( _tag => 'pre', );

    #
    # Walk the various <pre> elements in the document
    #
    foreach my $tag (@pre_tags) {
        #
        # Save the tag and empty the original
        #
        my $saved = $tag->clone();
        $tag->delete_content();

        #
        # Walk the saved content and rebuild the tag into $atag using the
        # nested arrayref structure permitted by HTML::Element for
        # convenience (the alternative is a little nasty). See the
        # documentation for 'new_from_lol' in HTML::Element.
        #
        my $atag;
        foreach my $item ( @{ $saved->content_array_ref } ) {
            push( @$atag, flatten_item($item) );
        }

        #
        # Rebuild the tag from the arrayref we built. We treat the arrayref
        # structure we just built as an array because otherwise the top level
        # is interpreted as a spurious <null> tag.
        #
        $tag->push_content(@$atag);
    }

    #
    # Trim out the original notes from the enclosing tags we added earlier
    #
    my $body = $tree->look_down( _tag => 'body' );
    ( my $result = $body->as_HTML( undef, ' ', {} ) )
        =~ s{(^<body[^>]*>|</body>$)}{}gi;

    return $result;

}

#===  FUNCTION  ================================================================
#         NAME: flatten_item
#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
#   PARAMETERS: $item   an HTML::Element item parsed from the original
#                       <pre> section
#      RETURNS: An arrayref if the last seen item was a tag, otherwise a list
#  DESCRIPTION: Since <pre> sections can contain inline elements which change
#               the rendering of the text we need to parse these as we add
#               <br/> tags. This routine does this by recursively descending
#               through the contents. A common tag sequence is <pre><code> for
#               scripts and the like. This routine deals with such sequences.
#               It expects to receive the contents in sequence and builds the
#               result as a nested arrayref structure.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub flatten_item {
    my ($item) = @_;

    return unless defined($item);

    my ( @result, %attr );

    #
    # Is it a sub-tag or non-tag content?
    #
    if ( ref($item) ) {
        #
        # It's a tag. Save the tag name and any attributes and recurse into
        # it. Return an arrayref
        #
        push( @result, $item->tag() );
        %attr = $item->all_external_attr();
        push( @result, \%attr ) if %attr;
        for my $child ( $item->content_list() ) {
            push( @result, flatten_item($child) );
        }
        return \@result;
    }
    else {
        #
        # It's non-tag content. Join the lines with <br/> tags.  Return an
        # array (since this is a simple list).
        #
        # Note that we split with a LIMIT of -1 which causes any trailing list
        # items to be returned; default behaviour is to drop them.
        #
        $item =~ s/\r//g;
        my @content = split( /\n/, $item, -1 );
        if (@content) {
            #
            # Remove a leading blank line - usually the result of
            # a "<pre>'NL'text" sequence
            #
            shift(@content) if ( $content[0] =~ /^\s*$/ );

            #
            # Join back the lines with <br/> tags between them.
            #
            foreach my $txt (@content) {
                push( @result, $txt, ['br'] );
            }

            #
            # Remove the <br/> at the end, it's spurious
            #
            pop(@result);
        }

        return (@result);
    }

}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
New 'reformat_html', plus some cleaning InternetArchive/future_upload: now updates the state of shows InternetArchive/reformat_html: new Perl script to reformat the HTML originally found in the HPR database in the 'notes' field to the format required in the 'description' field of an item on the IA. It reads from STDIN and writes to STDOUT. 2025-02-13 11:24:27 +00:00			`#!/usr/bin/env perl`
			`#===============================================================================`
			`#`
			`# FILE: reformat_html`
			`#`
			`# USAGE: ./reformat_html < input.html > output.html`
			`#`
			`# DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes'`
			`# field to the format required in the 'description' field of an`
			`# item on the IA. It reads from STDIN and writes to STDOUT.`
			`#`
			`# OPTIONS: ---`
			`# REQUIREMENTS: ---`
			`# BUGS: ---`
			`# NOTES: ---`
			`# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com`
			`# VERSION: 0.0.1`
			`# CREATED: 2025-02-09 22:56:30`
			`# REVISION: 2025-02-13 11:13:37`
			`#`
			`#===============================================================================`

			`use v5.36;`
			`use strict;`
			`use warnings;`
			`use feature qw{ say try };`
			`no warnings qw{ experimental::try };`

			`use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8`

			`use HTML::TreeBuilder 5 -weak;`
			`use HTML::Entities;`

			`#`
			`# Version number (Incremented by Vim)`
			`#`
			`our $VERSION = '0.0.1';`

			`#`
			`# Declarations`
			`#`
			`my ($verbose, @notes, $notes, $tree);`

			`#`
			`# Read the input data into an array`
			`#`
			`try {`
			`@notes = <STDIN>;`
			`}`
			`catch ($e) {`
			`warn "Problem reading input HTML; $e";`
			`exit 1;`
			`}`

			`die "No input HTML detected\n" unless @notes;`

			`#`
			`# Turn the array into a scalar`
			`#`
			`$notes = join( '', @notes );`

			`#`
			`# Get ready to parse the array`
			`#`
			`$tree = HTML::TreeBuilder->new;`
			`$tree->ignore_unknown(0);`
			`$tree->no_expand_entities(1);`
			`$tree->p_strict(1);`
			`$tree->store_comments(1); # Necessary?`
			`$tree->warn(1);`

			`#`
			`# Parse HTML to the tree structure`
			`#`
			`$tree->parse_content($notes)`
			`or die "HTML::TreeBuilder failed to parse input HTML: $!\n";`

			`#`
			`# Flatten all <pre> tags and add <br/> tags`
			`#`
			`$notes = flatten_pre($tree);`

			`#`
			`# Deal with non-ASCII`
			`#`
			`$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );`

			`#`
			`# Remove all newlines`
			`#`
			`$notes =~ s/\n//g;`

			`#`
			`# Write the end result to the STDOUT`
			`#`
			`say $notes;`

			`exit;`

			`#=== FUNCTION ================================================================`
			`# NAME: flatten_pre`
			`# PURPOSE: Process notes "flattening" <pre> contents`
			`# PARAMETERS: $tree HTML::TreeBuilder object containing parsed and`
			`# partially processed notes`
			`# RETURNS: Processed notes`
			`# DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also`
			`# contain some formatting tags like <em> and <code>, but spaces`
			`# and newlines are significant. The Internet Archive upload API`
			`# uses HTTP headers which are text strings without newlines, so`
			`# when these tags are uploaded through this route some`
			`# formatting is lost. What this routine does is parse the`
			`# contents of all <pre> sections in $notes, adding <br/> tags`
			`# to replace newlines. It has to perform a full parse`
			`# since the contents may include HTML tags and these need to be`
			`# passed through intact. It calls the subroutine 'flatten_item' to`
			`# deal with the recursive nature of HTML tags.`
			`# THROWS: No exceptions`
			`# COMMENTS: None`
			`# SEE ALSO: N/A`
			`#===============================================================================`
			`sub flatten_pre {`
			`my ($tree) = @_;`

			`#`
			`# Find all the <pre> tags`
			`#`
			`my @pre_tags = $tree->look_down( _tag => 'pre', );`

			`#`
			`# Walk the various <pre> elements in the document`
			`#`
			`foreach my $tag (@pre_tags) {`
			`#`
			`# Save the tag and empty the original`
			`#`
			`my $saved = $tag->clone();`
			`$tag->delete_content();`

			`#`
			`# Walk the saved content and rebuild the tag into $atag using the`
			`# nested arrayref structure permitted by HTML::Element for`
			`# convenience (the alternative is a little nasty). See the`
			`# documentation for 'new_from_lol' in HTML::Element.`
			`#`
			`my $atag;`
			`foreach my $item ( @{ $saved->content_array_ref } ) {`
			`push( @$atag, flatten_item($item) );`
			`}`

			`#`
			`# Rebuild the tag from the arrayref we built. We treat the arrayref`
			`# structure we just built as an array because otherwise the top level`
			`# is interpreted as a spurious <null> tag.`
			`#`
			`$tag->push_content(@$atag);`
			`}`

			`#`
			`# Trim out the original notes from the enclosing tags we added earlier`
			`#`
			`my $body = $tree->look_down( _tag => 'body' );`
			`( my $result = $body->as_HTML( undef, ' ', {} ) )`
			`=~ s{(^<body[^>]*>\|</body>$)}{}gi;`

			`return $result;`

			`}`

			`#=== FUNCTION ================================================================`
			`# NAME: flatten_item`
			`# PURPOSE: Recursively "flatten" items within the enclosing <pre>`
			`# PARAMETERS: $item an HTML::Element item parsed from the original`
			`# <pre> section`
			`# RETURNS: An arrayref if the last seen item was a tag, otherwise a list`
			`# DESCRIPTION: Since <pre> sections can contain inline elements which change`
			`# the rendering of the text we need to parse these as we add`
			`# <br/> tags. This routine does this by recursively descending`
			`# through the contents. A common tag sequence is <pre><code> for`
			`# scripts and the like. This routine deals with such sequences.`
			`# It expects to receive the contents in sequence and builds the`
			`# result as a nested arrayref structure.`
			`# THROWS: No exceptions`
			`# COMMENTS: None`
			`# SEE ALSO: N/A`
			`#===============================================================================`
			`sub flatten_item {`
			`my ($item) = @_;`

			`return unless defined($item);`

			`my ( @result, %attr );`

			`#`
			`# Is it a sub-tag or non-tag content?`
			`#`
			`if ( ref($item) ) {`
			`#`
			`# It's a tag. Save the tag name and any attributes and recurse into`
			`# it. Return an arrayref`
			`#`
			`push( @result, $item->tag() );`
			`%attr = $item->all_external_attr();`
			`push( @result, \%attr ) if %attr;`
			`for my $child ( $item->content_list() ) {`
			`push( @result, flatten_item($child) );`
			`}`
			`return \@result;`
			`}`
			`else {`
			`#`
			`# It's non-tag content. Join the lines with <br/> tags. Return an`
			`# array (since this is a simple list).`
			`#`
			`# Note that we split with a LIMIT of -1 which causes any trailing list`
			`# items to be returned; default behaviour is to drop them.`
			`#`
			`$item =~ s/\r//g;`
			`my @content = split( /\n/, $item, -1 );`
			`if (@content) {`
			`#`
			`# Remove a leading blank line - usually the result of`
			`# a "<pre>'NL'text" sequence`
			`#`
			`shift(@content) if ( $content[0] =~ /^\s*$/ );`

			`#`
			`# Join back the lines with <br/> tags between them.`
			`#`
			`foreach my $txt (@content) {`
			`push( @result, $txt, ['br'] );`
			`}`

			`#`
			`# Remove the <br/> at the end, it's spurious`
			`#`
			`pop(@result);`
			`}`

			`return (@result);`
			`}`

			`}`

			`# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker`