diff --git a/37567b_extract_images b/37567b_extract_images new file mode 100644 index 0000000..f7353fd --- /dev/null +++ b/37567b_extract_images @@ -0,0 +1,654 @@ +#!/usr/bin/env perl +#=============================================================================== +# +# FILE: extract_images +# +# USAGE: ./extract_images [--help] [--documentation|man] +# [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent] +# HTML_file [ [HTML_file_2] [HTML_file_3] ... ] +# +# DESCRIPTION: Processes HTML files which may have 'data' URIs containing +# images, and extracts these images into files in the same +# directory. The 'data' scheme links are converted to 'https' +# and reference the extracted files. The modified HTML is +# output, and the original will be saved as a backup if +# requested. +# +# OPTIONS: --- +# REQUIREMENTS: --- +# BUGS: --- +# NOTES: --- +# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com +# VERSION: 0.0.1 +# CREATED: 2024-12-25 10:53:15 +# REVISION: 2024-12-29 15:19:35 +# +#=============================================================================== + +use v5.36; +use strict; +use warnings; +use feature qw{ postderef say signatures state try }; +no warnings + qw{ experimental::postderef experimental::signatures experimental::try }; + +use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 + +use Getopt::Long; +use Pod::Usage; + +use HTML::TreeBuilder 5 -weak; +use URI; +use MIME::Types; + +#use File::Slurper; +use Path::Tiny; +use File::Copy; + +use Data::Dumper; + +# +# Version number (Incremented by Vim) +# +our $VERSION = '0.0.1'; + +# +# Script and directory names +# +( my $PROG = $0 ) =~ s|.*/||mx; + +#------------------------------------------------------------------------------- +# Declarations +#------------------------------------------------------------------------------- +my ( $notes, $typename, $tree, $uri ); +my ( $fcount, $basename, $filename, $suffix, $fh ); +my ( $updates, $bsuffix, $new_basename, $newURL ); + +my $backupcount = 5; + +#------------------------------------------------------------------------------- +# Options and arguments +#------------------------------------------------------------------------------- +# {{{ +# +# Option defaults +# +my $DEFDEBUG = 0; +my $DEFPREFIX = 'image'; + +my %options; +Options( \%options ); + +# +# Default help shows minimal information +# +pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) + if ( $options{'help'} ); + +# +# The -documentation or -man option shows the full POD documentation through +# a pager for convenience +# +pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) + if ( $options{'documentation'} ); + +# +# Collect options +# +my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); +my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 ); +my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX ); +my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 ); +my $force = ( defined( $options{force} ) ? $options{force} : 0 ); + +# +# Check we have arguments +# +die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n" + unless ( scalar(@ARGV) > 0 ); + +# +# Clean up the prefix +# +$prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix); + +# +# Backup suffix (the dot gets added later) +# +$bsuffix = 'bck'; + +# +# Debug the options +# +_debug( + $DEBUG > 1, + '$silent = ' . $silent, + '$prefix = ' . $prefix, + '$backup = ' . $backup, + '$force = ' . $force, +); + +# }}} + +#------------------------------------------------------------------------------- +# Prepare items for later use: current working directory and a MIME::Types +# object +#------------------------------------------------------------------------------- +my $curdir = Path::Tiny->cwd; + +my $mt = MIME::Types->new; + +#------------------------------------------------------------------------------- +# Loop through the arguments +#------------------------------------------------------------------------------- +foreach my $notesfile (@ARGV) { + unless (-e $notesfile) { + warn "Unable to find $notesfile\n"; + next; + } + + # + # Force the MIME type of $notesfile to a string + # + $typename = "" . $mt->mimeTypeOf("$notesfile"); + + # + # Check the MIME type and reject non-HTML + # + unless ($typename eq 'text/html') { + warn "File $notesfile is not HTML\n"; + next + } + + say "Reading from $notesfile\n" unless $silent; + + # + # Get HTML file basename without the suffix for building filenames + # + $basename = path($notesfile)->basename('.html'); + + # + # Read the HTML + # + open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n"; + $notes = <$nfh>; + close($nfh); + + # + # Image files are to have an index + # + $fcount = 0; + + # + # Keep a note of HTML updates + # + $updates = 0; + + # + # Initialise the TreeBuilder + # + $tree = HTML::TreeBuilder->new; + $tree->ignore_unknown(0); + $tree->no_expand_entities(1); + $tree->p_strict(1); + $tree->store_comments(1); + $tree->warn(1); + + # + # Load this HTML file into the TreeBuilder + # + $tree->parse_content($notes) + or die "HTML::TreeBuilder failed to parse notes: $!\n"; + + # + # Loop through the tree looking for 'data' scheme images + # + for ( @{ $tree->extract_links( 'img' ) } ) { + my ( $link, $element, $attr, $tag ) = @$_; + + $uri = URI->new( $link ); + unless ($silent) { + say "Scheme: ",$uri->scheme; + say "Media type: ",$uri->media_type; + } + + # + # We only care about 'data' scheme stuff - for now anyway, and only + # images within this set + # + if ( $uri->scheme eq 'data' ) { + # + # Only images + # + if ( $uri->media_type =~ /^image/ ) { + # + # Extract the file name suffix from the MIME string, and give it + # a leading '.' + # + ( $suffix = $uri->media_type ) =~ s{^.*/}{.}; + + # + # Construct the filename for this image + # + $fcount++; + $filename + = "$curdir/${prefix}_${basename}_${fcount}${suffix}"; + say "Writing to: $filename" unless $silent; + + say '-' x 40 unless $silent; + + # + # Check if the file exists. Don't clobber it unless --force is active + # + if ( -e $filename ) { + unless ($force) { + warn "File $filename exists; not overwriting\n"; + } + } + else { + # + # Write the data to the file in binary format. The URI module + # does the conversion so it's already binary. + # + $fh = path($filename)->openw_raw; + print $fh $uri->data; + close($fh); + } + + # + # Update the HTML with a link to the file we created + # + $updates++; + $newURL = path($filename)->basename; + $element->attr( $attr, $newURL ); + } + } + } # extract_links loop + + # + # Output the changed HTML turning what became standalone back into + # a fragment. + # + if ($updates > 0) { + my $body = $tree->look_down( _tag => 'body' ); + ( my $result = $body->as_HTML( undef, ' ', {} ) ) + =~ s{(^]*>|$)}{}gi; + + #$notesfile = path($notesfile)->basename; + if ($backup) { + _backup( $notesfile, $bsuffix, $backupcount ); + say "$notesfile backed up" unless $silent; + } + else { + say "$notesfile not backed up" unless $silent; + } + + $fh = path($notesfile)->openw; + say $fh $result; + close($fh); + + say "$updates images converted, $notesfile updated" unless $silent; + } + else { + say "No images found, no changes made to $notesfile" unless $silent; + } + +} # $notesfile loop + +exit; + +#=== FUNCTION ================================================================ +# NAME: _backup +# PURPOSE: Given a file, make a backup of it by appending $suffix, and +# handle previous backups +# PARAMETERS: $filename path of file to backup +# $suffix suffix to use, default 'bck' +# $limit number of backups to keep +# RETURNS: True or false depending on success +# DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't. +# Rationalises the $limit to avoid it being 0 or less. +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub _backup { + my ( $filename, $suffix, $limit ) = @_; + + my ( $backupname, $limitname ); + + # + # Check the target file exists + # + unless ( -e $filename ) { + warn "Unable to find $filename to backup\n"; + return 0; + } + + # + # Handle invalid $limit values + # + $limit = 1 unless ( $limit >= 1 ); + + # + # Defaults + # + $suffix = 'bck' unless $suffix; + $limitname = "$filename.$suffix.$limit"; + $backupname = "$filename.$suffix"; + + # + # Look for existing backups + # + if ( -e $backupname ) { + + # + # If maximum version exists delete it + # + if ( -e $limitname ) { + unlink($limitname); + } + + # + # Go backwards through the versions incrementing their version numbers + # + for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) { + if ( -e "$filename.$suffix.$vsn" ) { + move( "$filename.$suffix.$vsn", + sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) ); + } + } + + # + # Make $filename.bck into $filename.bck.1 + # + move( "$filename.$suffix", "$filename.$suffix.1" ); + + } + + # + # Finally save the $filename as $filename.bck + # + move( $filename, "$filename.$suffix" ); + + return 1; +} + +#=== FUNCTION ================================================================ +# NAME: _debug +# PURPOSE: Prints debug reports +# PARAMETERS: $active Boolean: 1 for print, 0 for no print +# $messages... Arbitrary list of messages to print +# RETURNS: Nothing +# DESCRIPTION: Outputs messages if $active is true. It removes any trailing +# newline from each one and then adds one in the 'print' to the +# caller doesn't have to bother. Prepends each message with 'D>' +# to show it's a debug message. +# THROWS: No exceptions +# COMMENTS: Differs from other functions of the same name +# SEE ALSO: N/A +#=============================================================================== +sub _debug { + my $active = shift; + + my $message; + return unless $active; + + while ($message = shift) { + chomp($message); + print STDERR "D> $message\n"; + } +} + +#=== FUNCTION ================================================================ +# NAME: Options +# PURPOSE: Processes command-line options +# PARAMETERS: $optref Hash reference to hold the options +# RETURNS: Undef +# DESCRIPTION: Process the options we want to offer. See the documentation +# for details +# THROWS: no exceptions +# COMMENTS: none +# SEE ALSO: n/a +#=============================================================================== +sub Options { + my ($optref) = @_; + + my @options = ( + "help", "documentation|man", "debug=i", "silent!", + "prefix=s", "backup!", "force!", + # "config=s", + ); + + if ( !GetOptions( $optref, @options ) ) { + pod2usage( + -msg => "$PROG version $VERSION\n", + -exitval => 1, + -verbose => 0 + ); + } + + return; +} + +__END__ + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Application Documentation +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#{{{ + +=head1 NAME + +extract_images - extract embedded images from HTML and save as files + +=head1 VERSION + +This documentation refers to extract_images version 0.0.1 + +=head1 USAGE + + extract_images + [--help] [--documentation|man] [-debug=N] + [--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent] + HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ] + + Examples: + + extract_images --prefix=picture_ --backup index.html + + extract_images --silent --nobackup shownotes.html + +=head1 REQUIRED ARGUMENTS + +The script requires the names of one or more HTML files which will be scanned +for embedded images. + +=head1 OPTIONS + +Note that all on/off options can be written as B<--silent>, B<--nosilent> or +B<--no-silent>. A single hyphen can be used at the start or a pair. + +=over 8 + +=item B<--help> + +Reports brief information about how to use the script and exits. To see the +full documentation use the option B<--documentation> or B<--man>. Alternatively, +to generate a PDF version use the I tool from +I. This can be +installed with the cpan tool as App::pod2pdf. + +=item B<--documentation> or B<--man> + +Reports full information about how to use the script and exits. Alternatively, +to generate a PDF version use the I tool from +I. This can be +installed with the cpan tool as App::pod2pdf. + +=item B<--debug=N> + +Run in debug mode at the level specified by I. Possible values are: + +=over 4 + +=item B<0> + +No debugging (the default). + +=item B<1> + +TBA + +=item B<2> + +TBA + +=item B<3> + +TBA + +=back + +=item B<--prefix=STRING> + +Since embedded images have no names, when they are written to image files +names are generated for them. These names are built from the following +components: + +=over 4 + +=item B + +The name of the HTML file without a suffix. So, if the name is 'index.html', +the 'index' part will be used. + +=item B + +The prefix string provided by this option, or 'image' if not specified. + +=item B + +The images found in the HTML are counted, starting from 1, and this number is +used here. + +=item B + +The image format extension, taken from the embedded image information. +Examples would be 'jpg', 'jpeg' and 'png'. + +=back + +The B, B, B and B are joined +together to make the file name as follows: + + __. + +So the following run of the script would generate the first file shown below +assuming the first embedded picture was in 'PNG' format: + + extract_images --prefix=picture hpr4283.html + + hpr4283_picture_1.png + +=item B<--[no]backup> + +The HTML is modified to leave placeholders referring to any embedded images +found as it it processed. The new HTML is written to the original file if +B<--nobackup> is specified, or a backup of the original file is made, before +writing the new HTML. Making a backup is the default behaviour. + +=item B<--[no]force> + +Images are written to the current directory. If an image already exists, and +B<--force> has not been specified, the script will stop processing and exit. +The default setting of this option is B<--noforce>. + +=item B<--[no]silent> + +By default the script reports its progress as it processes an HTML file, but +this can be turned off by using this option. + +=back + +=head1 DESCRIPTION + +This Perl script B parses HTML looking for embedded images. +Such images use a URI with the scheme 'data' followed by image details +including the encoded binary contents. See the Wikipedia article +https://en.wikipedia.org/wiki/Data_URI_scheme for details. + +When such images are found they are written as files (with generated names), +and the HTML is updated with these names replacing the original URI. Further +work is likely to be required to build full image links, but the majority of +work will have been done. + +The script will not overwrite image files unless the B<--force> option is +used, but will overwrite the original HTML file unless B<--backup> is +specified. + +By default details of progress are written to standard output, but this can be +preveneted by using the B<--silent> option. + +=head2 GENERATED IMAGE FILE NAMES + +These names are built from various elements: + + __. + +Where is the base name of the HTML input file, is the +string provided with the B<--prefix=STRING> option (or B by default), + is a count of image files found during the scan of the HTML, and + is the appropriate file extension for the type of image being +converted. + +See the option B<--prefix=STRING> for details of file name generation. + +=head1 DIAGNOSTICS + +TBA + +=head1 CONFIGURATION AND ENVIRONMENT + +TBA + +=head1 DEPENDENCIES + + Data::Dumper + File::Copy + Getopt::Long + HTML::TreeBuilder + MIME::Types + Path::Tiny + Pod::Usage + URI + +=head1 BUGS AND LIMITATIONS + +There are no known bugs in this module. +Please report problems to Dave Morriss (Dave.Morriss@gmail.com) +Patches are welcome. + +=head1 AUTHOR + +Dave Morriss (Dave.Morriss@gmail.com) + +=head1 LICENCE AND COPYRIGHT + +Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com). +All rights reserved. + +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. See perldoc perlartistic. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +=cut + +#}}} + +# [zo to open fold, zc to close] + +# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker + diff --git a/Show_Submission/extract_images b/Show_Submission/extract_images index f7353fd..757e217 100755 --- a/Show_Submission/extract_images +++ b/Show_Submission/extract_images @@ -19,18 +19,17 @@ # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.1 +# VERSION: 0.0.2 # CREATED: 2024-12-25 10:53:15 -# REVISION: 2024-12-29 15:19:35 +# REVISION: 2024-12-30 09:28:40 # #=============================================================================== use v5.36; use strict; use warnings; -use feature qw{ postderef say signatures state try }; -no warnings - qw{ experimental::postderef experimental::signatures experimental::try }; +use feature qw{ say state try }; +no warnings qw{ experimental::try }; use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 @@ -41,7 +40,6 @@ use HTML::TreeBuilder 5 -weak; use URI; use MIME::Types; -#use File::Slurper; use Path::Tiny; use File::Copy; @@ -50,7 +48,7 @@ use Data::Dumper; # # Version number (Incremented by Vim) # -our $VERSION = '0.0.1'; +our $VERSION = '0.0.2'; # # Script and directory names @@ -60,9 +58,9 @@ our $VERSION = '0.0.1'; #------------------------------------------------------------------------------- # Declarations #------------------------------------------------------------------------------- -my ( $notes, $typename, $tree, $uri ); -my ( $fcount, $basename, $filename, $suffix, $fh ); -my ( $updates, $bsuffix, $new_basename, $newURL ); +my ( $notes, $typename, $uri ); +my ( $fcount, $basename, $filename, $suffix, $fh ); +my ( $updates, $bsuffix, $newURL ); my $backupcount = 5; @@ -104,8 +102,12 @@ my $force = ( defined( $options{force} ) ? $options{force} : 0 ); # # Check we have arguments # -die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n" - unless ( scalar(@ARGV) > 0 ); +pod2usage( + -msg => "Missing arguments. One or more HTML file names are needed\n", + -exitval => 1, + -verbose => 0 + ) +unless ( scalar(@ARGV) > 0 ); # # Clean up the prefix @@ -115,7 +117,7 @@ $prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix); # # Backup suffix (the dot gets added later) # -$bsuffix = 'bck'; +$bsuffix = 'bak'; # # Debug the options @@ -131,18 +133,26 @@ _debug( # }}} #------------------------------------------------------------------------------- -# Prepare items for later use: current working directory and a MIME::Types -# object +# Prepare a MIME::Types object #------------------------------------------------------------------------------- -my $curdir = Path::Tiny->cwd; - my $mt = MIME::Types->new; #------------------------------------------------------------------------------- # Loop through the arguments #------------------------------------------------------------------------------- foreach my $notesfile (@ARGV) { - unless (-e $notesfile) { + + # + # Get the absolute path of the argument + # + my $abs_nf = path($notesfile)->absolute; + + # + # Get the 'dirname' from the absolute path + # + my $dirname = path($abs_nf)->parent->stringify; + + unless (-e $abs_nf) { warn "Unable to find $notesfile\n"; next; } @@ -150,22 +160,22 @@ foreach my $notesfile (@ARGV) { # # Force the MIME type of $notesfile to a string # - $typename = "" . $mt->mimeTypeOf("$notesfile"); + $typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' ); # # Check the MIME type and reject non-HTML # unless ($typename eq 'text/html') { - warn "File $notesfile is not HTML\n"; + warn "File $abs_nf is not HTML\n"; next } - say "Reading from $notesfile\n" unless $silent; + say "Reading from $abs_nf\n" unless $silent; # - # Get HTML file basename without the suffix for building filenames + # Get HTML file basename (no path) without the suffix for building filenames # - $basename = path($notesfile)->basename('.html'); + $basename = path($abs_nf)->basename('.html'); # # Read the HTML @@ -175,7 +185,7 @@ foreach my $notesfile (@ARGV) { close($nfh); # - # Image files are to have an index + # Image files are to have an index/sequence number # $fcount = 0; @@ -187,7 +197,7 @@ foreach my $notesfile (@ARGV) { # # Initialise the TreeBuilder # - $tree = HTML::TreeBuilder->new; + my $tree = HTML::TreeBuilder->new; $tree->ignore_unknown(0); $tree->no_expand_entities(1); $tree->p_strict(1); @@ -195,7 +205,7 @@ foreach my $notesfile (@ARGV) { $tree->warn(1); # - # Load this HTML file into the TreeBuilder + # Load the HTML obtained from the file into the TreeBuilder # $tree->parse_content($notes) or die "HTML::TreeBuilder failed to parse notes: $!\n"; @@ -203,13 +213,13 @@ foreach my $notesfile (@ARGV) { # # Loop through the tree looking for 'data' scheme images # - for ( @{ $tree->extract_links( 'img' ) } ) { + for ( @{ $tree->extract_links('img') } ) { my ( $link, $element, $attr, $tag ) = @$_; - $uri = URI->new( $link ); + $uri = URI->new($link); unless ($silent) { - say "Scheme: ",$uri->scheme; - say "Media type: ",$uri->media_type; + say "Scheme: ", $uri->scheme; + say "Media type: ", $uri->media_type; } # @@ -221,34 +231,34 @@ foreach my $notesfile (@ARGV) { # Only images # if ( $uri->media_type =~ /^image/ ) { - # - # Extract the file name suffix from the MIME string, and give it - # a leading '.' - # + # + # Extract the file name suffix from the MIME string, and give it + # a leading '.' + # ( $suffix = $uri->media_type ) =~ s{^.*/}{.}; # - # Construct the filename for this image + # Construct the filename for this image making sure it's in + # the directory the HTML is in. # $fcount++; $filename - = "$curdir/${prefix}_${basename}_${fcount}${suffix}"; + = "$dirname/${prefix}_${basename}_${fcount}${suffix}"; say "Writing to: $filename" unless $silent; say '-' x 40 unless $silent; # - # Check if the file exists. Don't clobber it unless --force is active + # If the file exists and --force is not active, warn and skip. + # Otherwise write the file. # - if ( -e $filename ) { - unless ($force) { - warn "File $filename exists; not overwriting\n"; - } + if ( -e $filename && !$force ) { + warn "File $filename exists; not overwriting\n"; } else { # - # Write the data to the file in binary format. The URI module - # does the conversion so it's already binary. + # Write the data to the file in binary format. The URI + # module does the conversion so it's already binary. # $fh = path($filename)->openw_raw; print $fh $uri->data; @@ -256,14 +266,16 @@ foreach my $notesfile (@ARGV) { } # - # Update the HTML with a link to the file we created + # Update the HTML with a link to the file we created (or that + # was already there from an earlier time). # $updates++; - $newURL = path($filename)->basename; + $newURL = path($filename)->basename; $element->attr( $attr, $newURL ); } } - } # extract_links loop + + } # extract_links loop # # Output the changed HTML turning what became standalone back into @@ -276,7 +288,7 @@ foreach my $notesfile (@ARGV) { #$notesfile = path($notesfile)->basename; if ($backup) { - _backup( $notesfile, $bsuffix, $backupcount ); + _backup( $abs_nf, $bsuffix, $backupcount ); say "$notesfile backed up" unless $silent; } else { @@ -373,6 +385,24 @@ sub _backup { return 1; } +#=== FUNCTION ================================================================ +# NAME: coalesce +# PURPOSE: To find the first defined argument and return it +# PARAMETERS: Arbitrary number of arguments +# RETURNS: The first defined argument or undef if there are none +# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never +# returned when doing so might be a problem. +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub coalesce { + foreach (@_) { + return $_ if defined($_); + } + return undef; ## no critic +} + #=== FUNCTION ================================================================ # NAME: _debug # PURPOSE: Prints debug reports @@ -443,7 +473,7 @@ extract_images - extract embedded images from HTML and save as files =head1 VERSION -This documentation refers to extract_images version 0.0.1 +This documentation refers to extract_images version 0.0.2 =head1 USAGE