hpr-tools/Show_Submission/extract_images

655 lines
18 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env perl
#===============================================================================
#
# FILE: extract_images
#
# USAGE: ./extract_images [--help] [--documentation|man]
# [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent]
# HTML_file [ [HTML_file_2] [HTML_file_3] ... ]
#
# DESCRIPTION: Processes HTML files which may have 'data' URIs containing
# images, and extracts these images into files in the same
# directory. The 'data' scheme links are converted to 'https'
# and reference the extracted files. The modified HTML is
# output, and the original will be saved as a backup if
# requested.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.1
# CREATED: 2024-12-25 10:53:15
# REVISION: 2024-12-29 15:19:35
#
#===============================================================================
use v5.36;
use strict;
use warnings;
use feature qw{ postderef say signatures state try };
no warnings
qw{ experimental::postderef experimental::signatures experimental::try };
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
use Getopt::Long;
use Pod::Usage;
use HTML::TreeBuilder 5 -weak;
use URI;
use MIME::Types;
#use File::Slurper;
use Path::Tiny;
use File::Copy;
use Data::Dumper;
#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.1';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
my ( $notes, $typename, $tree, $uri );
my ( $fcount, $basename, $filename, $suffix, $fh );
my ( $updates, $bsuffix, $new_basename, $newURL );
my $backupcount = 5;
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
# {{{
#
# Option defaults
#
my $DEFDEBUG = 0;
my $DEFPREFIX = 'image';
my %options;
Options( \%options );
#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
if ( $options{'help'} );
#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
if ( $options{'documentation'} );
#
# Collect options
#
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX );
my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 );
my $force = ( defined( $options{force} ) ? $options{force} : 0 );
#
# Check we have arguments
#
die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n"
unless ( scalar(@ARGV) > 0 );
#
# Clean up the prefix
#
$prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix);
#
# Backup suffix (the dot gets added later)
#
$bsuffix = 'bck';
#
# Debug the options
#
_debug(
$DEBUG > 1,
'$silent = ' . $silent,
'$prefix = ' . $prefix,
'$backup = ' . $backup,
'$force = ' . $force,
);
# }}}
#-------------------------------------------------------------------------------
# Prepare items for later use: current working directory and a MIME::Types
# object
#-------------------------------------------------------------------------------
my $curdir = Path::Tiny->cwd;
my $mt = MIME::Types->new;
#-------------------------------------------------------------------------------
# Loop through the arguments
#-------------------------------------------------------------------------------
foreach my $notesfile (@ARGV) {
unless (-e $notesfile) {
warn "Unable to find $notesfile\n";
next;
}
#
# Force the MIME type of $notesfile to a string
#
$typename = "" . $mt->mimeTypeOf("$notesfile");
#
# Check the MIME type and reject non-HTML
#
unless ($typename eq 'text/html') {
warn "File $notesfile is not HTML\n";
next
}
say "Reading from $notesfile\n" unless $silent;
#
# Get HTML file basename without the suffix for building filenames
#
$basename = path($notesfile)->basename('.html');
#
# Read the HTML
#
open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n";
$notes = <$nfh>;
close($nfh);
#
# Image files are to have an index
#
$fcount = 0;
#
# Keep a note of HTML updates
#
$updates = 0;
#
# Initialise the TreeBuilder
#
$tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
$tree->store_comments(1);
$tree->warn(1);
#
# Load this HTML file into the TreeBuilder
#
$tree->parse_content($notes)
or die "HTML::TreeBuilder failed to parse notes: $!\n";
#
# Loop through the tree looking for 'data' scheme images
#
for ( @{ $tree->extract_links( 'img' ) } ) {
my ( $link, $element, $attr, $tag ) = @$_;
$uri = URI->new( $link );
unless ($silent) {
say "Scheme: ",$uri->scheme;
say "Media type: ",$uri->media_type;
}
#
# We only care about 'data' scheme stuff - for now anyway, and only
# images within this set
#
if ( $uri->scheme eq 'data' ) {
#
# Only images
#
if ( $uri->media_type =~ /^image/ ) {
#
# Extract the file name suffix from the MIME string, and give it
# a leading '.'
#
( $suffix = $uri->media_type ) =~ s{^.*/}{.};
#
# Construct the filename for this image
#
$fcount++;
$filename
= "$curdir/${prefix}_${basename}_${fcount}${suffix}";
say "Writing to: $filename" unless $silent;
say '-' x 40 unless $silent;
#
# Check if the file exists. Don't clobber it unless --force is active
#
if ( -e $filename ) {
unless ($force) {
warn "File $filename exists; not overwriting\n";
}
}
else {
#
# Write the data to the file in binary format. The URI module
# does the conversion so it's already binary.
#
$fh = path($filename)->openw_raw;
print $fh $uri->data;
close($fh);
}
#
# Update the HTML with a link to the file we created
#
$updates++;
$newURL = path($filename)->basename;
$element->attr( $attr, $newURL );
}
}
} # extract_links loop
#
# Output the changed HTML turning what became standalone back into
# a <body> fragment.
#
if ($updates > 0) {
my $body = $tree->look_down( _tag => 'body' );
( my $result = $body->as_HTML( undef, ' ', {} ) )
=~ s{(^<body[^>]*>|</body>$)}{}gi;
#$notesfile = path($notesfile)->basename;
if ($backup) {
_backup( $notesfile, $bsuffix, $backupcount );
say "$notesfile backed up" unless $silent;
}
else {
say "$notesfile not backed up" unless $silent;
}
$fh = path($notesfile)->openw;
say $fh $result;
close($fh);
say "$updates images converted, $notesfile updated" unless $silent;
}
else {
say "No images found, no changes made to $notesfile" unless $silent;
}
} # $notesfile loop
exit;
#=== FUNCTION ================================================================
# NAME: _backup
# PURPOSE: Given a file, make a backup of it by appending $suffix, and
# handle previous backups
# PARAMETERS: $filename path of file to backup
# $suffix suffix to use, default 'bck'
# $limit number of backups to keep
# RETURNS: True or false depending on success
# DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't.
# Rationalises the $limit to avoid it being 0 or less.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _backup {
my ( $filename, $suffix, $limit ) = @_;
my ( $backupname, $limitname );
#
# Check the target file exists
#
unless ( -e $filename ) {
warn "Unable to find $filename to backup\n";
return 0;
}
#
# Handle invalid $limit values
#
$limit = 1 unless ( $limit >= 1 );
#
# Defaults
#
$suffix = 'bck' unless $suffix;
$limitname = "$filename.$suffix.$limit";
$backupname = "$filename.$suffix";
#
# Look for existing backups
#
if ( -e $backupname ) {
#
# If maximum version exists delete it
#
if ( -e $limitname ) {
unlink($limitname);
}
#
# Go backwards through the versions incrementing their version numbers
#
for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) {
if ( -e "$filename.$suffix.$vsn" ) {
move( "$filename.$suffix.$vsn",
sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) );
}
}
#
# Make $filename.bck into $filename.bck.1
#
move( "$filename.$suffix", "$filename.$suffix.1" );
}
#
# Finally save the $filename as $filename.bck
#
move( $filename, "$filename.$suffix" );
return 1;
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $messages... Arbitrary list of messages to print
# RETURNS: Nothing
# DESCRIPTION: Outputs messages if $active is true. It removes any trailing
# newline from each one and then adds one in the 'print' to the
# caller doesn't have to bother. Prepends each message with 'D>'
# to show it's a debug message.
# THROWS: No exceptions
# COMMENTS: Differs from other functions of the same name
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my $active = shift;
my $message;
return unless $active;
while ($message = shift) {
chomp($message);
print STDERR "D> $message\n";
}
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION: Process the options we want to offer. See the documentation
# for details
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "documentation|man", "debug=i", "silent!",
"prefix=s", "backup!", "force!",
# "config=s",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage(
-msg => "$PROG version $VERSION\n",
-exitval => 1,
-verbose => 0
);
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
extract_images - extract embedded images from HTML and save as files
=head1 VERSION
This documentation refers to extract_images version 0.0.1
=head1 USAGE
extract_images
[--help] [--documentation|man] [-debug=N]
[--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent]
HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ]
Examples:
extract_images --prefix=picture_ --backup index.html
extract_images --silent --nobackup shownotes.html
=head1 REQUIRED ARGUMENTS
The script requires the names of one or more HTML files which will be scanned
for embedded images.
=head1 OPTIONS
Note that all on/off options can be written as B<--silent>, B<--nosilent> or
B<--no-silent>. A single hyphen can be used at the start or a pair.
=over 8
=item B<--help>
Reports brief information about how to use the script and exits. To see the
full documentation use the option B<--documentation> or B<--man>. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.
=item B<--documentation> or B<--man>
Reports full information about how to use the script and exits. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.
=item B<--debug=N>
Run in debug mode at the level specified by I<N>. Possible values are:
=over 4
=item B<0>
No debugging (the default).
=item B<1>
TBA
=item B<2>
TBA
=item B<3>
TBA
=back
=item B<--prefix=STRING>
Since embedded images have no names, when they are written to image files
names are generated for them. These names are built from the following
components:
=over 4
=item B<HTML filename>
The name of the HTML file without a suffix. So, if the name is 'index.html',
the 'index' part will be used.
=item B<prefix>
The prefix string provided by this option, or 'image' if not specified.
=item B<counter>
The images found in the HTML are counted, starting from 1, and this number is
used here.
=item B<extension>
The image format extension, taken from the embedded image information.
Examples would be 'jpg', 'jpeg' and 'png'.
=back
The B<HTML filename>, B<prefix>, B<counter> and B<extension> are joined
together to make the file name as follows:
<HTML filename>_<prefix>_<counter>.<extension>
So the following run of the script would generate the first file shown below
assuming the first embedded picture was in 'PNG' format:
extract_images --prefix=picture hpr4283.html
hpr4283_picture_1.png
=item B<--[no]backup>
The HTML is modified to leave placeholders referring to any embedded images
found as it it processed. The new HTML is written to the original file if
B<--nobackup> is specified, or a backup of the original file is made, before
writing the new HTML. Making a backup is the default behaviour.
=item B<--[no]force>
Images are written to the current directory. If an image already exists, and
B<--force> has not been specified, the script will stop processing and exit.
The default setting of this option is B<--noforce>.
=item B<--[no]silent>
By default the script reports its progress as it processes an HTML file, but
this can be turned off by using this option.
=back
=head1 DESCRIPTION
This Perl script B<extract_images> parses HTML looking for embedded images.
Such images use a URI with the scheme 'data' followed by image details
including the encoded binary contents. See the Wikipedia article
https://en.wikipedia.org/wiki/Data_URI_scheme for details.
When such images are found they are written as files (with generated names),
and the HTML is updated with these names replacing the original URI. Further
work is likely to be required to build full image links, but the majority of
work will have been done.
The script will not overwrite image files unless the B<--force> option is
used, but will overwrite the original HTML file unless B<--backup> is
specified.
By default details of progress are written to standard output, but this can be
preveneted by using the B<--silent> option.
=head2 GENERATED IMAGE FILE NAMES
These names are built from various elements:
<HTML filename>_<prefix>_<counter>.<extension>
Where <HTML filename> is the base name of the HTML input file, <prefix> is the
string provided with the B<--prefix=STRING> option (or B<image> by default),
<counter> is a count of image files found during the scan of the HTML, and
<extension> is the appropriate file extension for the type of image being
converted.
See the option B<--prefix=STRING> for details of file name generation.
=head1 DIAGNOSTICS
TBA
=head1 CONFIGURATION AND ENVIRONMENT
TBA
=head1 DEPENDENCIES
Data::Dumper
File::Copy
Getopt::Long
HTML::TreeBuilder
MIME::Types
Path::Tiny
Pod::Usage
URI
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com)
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com).
All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker