246 lines
7.0 KiB
Plaintext
246 lines
7.0 KiB
Plaintext
|
#!/usr/bin/env perl
|
||
|
#===============================================================================
|
||
|
#
|
||
|
# FILE: reformat_html
|
||
|
#
|
||
|
# USAGE: ./reformat_html < input.html > output.html
|
||
|
#
|
||
|
# DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes'
|
||
|
# field to the format required in the 'description' field of an
|
||
|
# item on the IA. It reads from STDIN and writes to STDOUT.
|
||
|
#
|
||
|
# OPTIONS: ---
|
||
|
# REQUIREMENTS: ---
|
||
|
# BUGS: ---
|
||
|
# NOTES: ---
|
||
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||
|
# VERSION: 0.0.1
|
||
|
# CREATED: 2025-02-09 22:56:30
|
||
|
# REVISION: 2025-02-13 11:13:37
|
||
|
#
|
||
|
#===============================================================================
|
||
|
|
||
|
use v5.36;
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use feature qw{ say try };
|
||
|
no warnings qw{ experimental::try };
|
||
|
|
||
|
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
|
||
|
|
||
|
use HTML::TreeBuilder 5 -weak;
|
||
|
use HTML::Entities;
|
||
|
|
||
|
#
|
||
|
# Version number (Incremented by Vim)
|
||
|
#
|
||
|
our $VERSION = '0.0.1';
|
||
|
|
||
|
#
|
||
|
# Declarations
|
||
|
#
|
||
|
my ($verbose, @notes, $notes, $tree);
|
||
|
|
||
|
#
|
||
|
# Read the input data into an array
|
||
|
#
|
||
|
try {
|
||
|
@notes = <STDIN>;
|
||
|
}
|
||
|
catch ($e) {
|
||
|
warn "Problem reading input HTML; $e";
|
||
|
exit 1;
|
||
|
}
|
||
|
|
||
|
die "No input HTML detected\n" unless @notes;
|
||
|
|
||
|
#
|
||
|
# Turn the array into a scalar
|
||
|
#
|
||
|
$notes = join( '', @notes );
|
||
|
|
||
|
#
|
||
|
# Get ready to parse the array
|
||
|
#
|
||
|
$tree = HTML::TreeBuilder->new;
|
||
|
$tree->ignore_unknown(0);
|
||
|
$tree->no_expand_entities(1);
|
||
|
$tree->p_strict(1);
|
||
|
$tree->store_comments(1); # Necessary?
|
||
|
$tree->warn(1);
|
||
|
|
||
|
#
|
||
|
# Parse HTML to the tree structure
|
||
|
#
|
||
|
$tree->parse_content($notes)
|
||
|
or die "HTML::TreeBuilder failed to parse input HTML: $!\n";
|
||
|
|
||
|
#
|
||
|
# Flatten all <pre> tags and add <br/> tags
|
||
|
#
|
||
|
$notes = flatten_pre($tree);
|
||
|
|
||
|
#
|
||
|
# Deal with non-ASCII
|
||
|
#
|
||
|
$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
|
||
|
|
||
|
#
|
||
|
# Remove all newlines
|
||
|
#
|
||
|
$notes =~ s/\n//g;
|
||
|
|
||
|
#
|
||
|
# Write the end result to the STDOUT
|
||
|
#
|
||
|
say $notes;
|
||
|
|
||
|
exit;
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: flatten_pre
|
||
|
# PURPOSE: Process notes "flattening" <pre> contents
|
||
|
# PARAMETERS: $tree HTML::TreeBuilder object containing parsed and
|
||
|
# partially processed notes
|
||
|
# RETURNS: Processed notes
|
||
|
# DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
|
||
|
# contain some formatting tags like <em> and <code>, but spaces
|
||
|
# and newlines are significant. The Internet Archive upload API
|
||
|
# uses HTTP headers which are text strings without newlines, so
|
||
|
# when these tags are uploaded through this route some
|
||
|
# formatting is lost. What this routine does is parse the
|
||
|
# contents of all <pre> sections in $notes, adding <br/> tags
|
||
|
# to replace newlines. It has to perform a full parse
|
||
|
# since the contents may include HTML tags and these need to be
|
||
|
# passed through intact. It calls the subroutine 'flatten_item' to
|
||
|
# deal with the recursive nature of HTML tags.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub flatten_pre {
|
||
|
my ($tree) = @_;
|
||
|
|
||
|
#
|
||
|
# Find all the <pre> tags
|
||
|
#
|
||
|
my @pre_tags = $tree->look_down( _tag => 'pre', );
|
||
|
|
||
|
#
|
||
|
# Walk the various <pre> elements in the document
|
||
|
#
|
||
|
foreach my $tag (@pre_tags) {
|
||
|
#
|
||
|
# Save the tag and empty the original
|
||
|
#
|
||
|
my $saved = $tag->clone();
|
||
|
$tag->delete_content();
|
||
|
|
||
|
#
|
||
|
# Walk the saved content and rebuild the tag into $atag using the
|
||
|
# nested arrayref structure permitted by HTML::Element for
|
||
|
# convenience (the alternative is a little nasty). See the
|
||
|
# documentation for 'new_from_lol' in HTML::Element.
|
||
|
#
|
||
|
my $atag;
|
||
|
foreach my $item ( @{ $saved->content_array_ref } ) {
|
||
|
push( @$atag, flatten_item($item) );
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Rebuild the tag from the arrayref we built. We treat the arrayref
|
||
|
# structure we just built as an array because otherwise the top level
|
||
|
# is interpreted as a spurious <null> tag.
|
||
|
#
|
||
|
$tag->push_content(@$atag);
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Trim out the original notes from the enclosing tags we added earlier
|
||
|
#
|
||
|
my $body = $tree->look_down( _tag => 'body' );
|
||
|
( my $result = $body->as_HTML( undef, ' ', {} ) )
|
||
|
=~ s{(^<body[^>]*>|</body>$)}{}gi;
|
||
|
|
||
|
return $result;
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: flatten_item
|
||
|
# PURPOSE: Recursively "flatten" items within the enclosing <pre>
|
||
|
# PARAMETERS: $item an HTML::Element item parsed from the original
|
||
|
# <pre> section
|
||
|
# RETURNS: An arrayref if the last seen item was a tag, otherwise a list
|
||
|
# DESCRIPTION: Since <pre> sections can contain inline elements which change
|
||
|
# the rendering of the text we need to parse these as we add
|
||
|
# <br/> tags. This routine does this by recursively descending
|
||
|
# through the contents. A common tag sequence is <pre><code> for
|
||
|
# scripts and the like. This routine deals with such sequences.
|
||
|
# It expects to receive the contents in sequence and builds the
|
||
|
# result as a nested arrayref structure.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub flatten_item {
|
||
|
my ($item) = @_;
|
||
|
|
||
|
return unless defined($item);
|
||
|
|
||
|
my ( @result, %attr );
|
||
|
|
||
|
#
|
||
|
# Is it a sub-tag or non-tag content?
|
||
|
#
|
||
|
if ( ref($item) ) {
|
||
|
#
|
||
|
# It's a tag. Save the tag name and any attributes and recurse into
|
||
|
# it. Return an arrayref
|
||
|
#
|
||
|
push( @result, $item->tag() );
|
||
|
%attr = $item->all_external_attr();
|
||
|
push( @result, \%attr ) if %attr;
|
||
|
for my $child ( $item->content_list() ) {
|
||
|
push( @result, flatten_item($child) );
|
||
|
}
|
||
|
return \@result;
|
||
|
}
|
||
|
else {
|
||
|
#
|
||
|
# It's non-tag content. Join the lines with <br/> tags. Return an
|
||
|
# array (since this is a simple list).
|
||
|
#
|
||
|
# Note that we split with a LIMIT of -1 which causes any trailing list
|
||
|
# items to be returned; default behaviour is to drop them.
|
||
|
#
|
||
|
$item =~ s/\r//g;
|
||
|
my @content = split( /\n/, $item, -1 );
|
||
|
if (@content) {
|
||
|
#
|
||
|
# Remove a leading blank line - usually the result of
|
||
|
# a "<pre>'NL'text" sequence
|
||
|
#
|
||
|
shift(@content) if ( $content[0] =~ /^\s*$/ );
|
||
|
|
||
|
#
|
||
|
# Join back the lines with <br/> tags between them.
|
||
|
#
|
||
|
foreach my $txt (@content) {
|
||
|
push( @result, $txt, ['br'] );
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Remove the <br/> at the end, it's spurious
|
||
|
#
|
||
|
pop(@result);
|
||
|
}
|
||
|
|
||
|
return (@result);
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
||
|
|