#!/usr/bin/env perl #=============================================================================== # # FILE: extract_images # # USAGE: ./extract_images [--help] [--documentation|man] # [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent] # HTML_file [ [HTML_file_2] [HTML_file_3] ... ] # # DESCRIPTION: Processes HTML files which may have 'data' scheme URIs containing # images, and extracts these images into files in the same # directory. The 'data' scheme links are converted to 'https' # and reference the extracted files. The modified HTML is # output, and the original will be saved as a backup if # requested. # # The 'data' URI scheme is specified in RFC 2397. # # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com # VERSION: 0.0.3 # CREATED: 2024-12-25 10:53:15 # REVISION: 2024-12-31 20:56:50 # #=============================================================================== use v5.36; use strict; use warnings; use feature qw{ say state try }; no warnings qw{ experimental::try }; use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 use Getopt::Long; use Pod::Usage; use HTML::TreeBuilder 5 -weak; use URI; use MIME::Types; use Path::Tiny; use File::Copy; use Data::Dumper; # # Version number (Incremented by Vim) # our $VERSION = '0.0.3'; # # Script and directory names # ( my $PROG = $0 ) =~ s|.*/||mx; #------------------------------------------------------------------------------- # Declarations #------------------------------------------------------------------------------- my ( $notes, $typename, $uri ); my ( $fcount, $basename, $filename, $suffix, $fh ); my ( $updates, $bsuffix, $newURL ); my $backupcount = 5; #------------------------------------------------------------------------------- # Options and arguments #------------------------------------------------------------------------------- # {{{ # # Option defaults # my $DEFDEBUG = 0; my $DEFPREFIX = 'image'; my %options; Options( \%options ); # # Default help shows minimal information # pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) if ( $options{'help'} ); # # The -documentation or -man option shows the full POD documentation through # a pager for convenience # pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) if ( $options{'documentation'} ); # # Collect options # my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 ); my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX ); my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 ); my $force = ( defined( $options{force} ) ? $options{force} : 0 ); # # Check we have arguments # pod2usage( -msg => "Missing arguments. One or more HTML file names are needed\n", -exitval => 1, -verbose => 0 ) unless ( scalar(@ARGV) > 0 ); # # Clean up the prefix # $prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix); # # Backup suffix (the dot gets added later) # $bsuffix = 'bak'; # # Debug the options # _debug( $DEBUG > 1, '$silent = ' . $silent, '$prefix = ' . $prefix, '$backup = ' . $backup, '$force = ' . $force, ); # }}} #------------------------------------------------------------------------------- # Prepare a MIME::Types object #------------------------------------------------------------------------------- my $mt = MIME::Types->new; #------------------------------------------------------------------------------- # Loop through the arguments #------------------------------------------------------------------------------- foreach my $notesfile (@ARGV) { # # Get the absolute path of the argument # my $abs_nf = path($notesfile)->absolute; # # Get the 'dirname' from the absolute path # my $dirname = path($abs_nf)->parent->stringify; unless (-e $abs_nf) { warn "Unable to find $notesfile\n"; next; } # # Force the MIME type of $notesfile to a string # $typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' ); # # Check the MIME type and reject non-HTML # unless ($typename eq 'text/html') { warn "File $abs_nf is not HTML\n"; next } say "Reading from $abs_nf\n" unless $silent; # # Get HTML file basename (no path) without the suffix for building filenames # $basename = path($abs_nf)->basename('.html'); # # Read the HTML # open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n"; $notes = <$nfh>; close($nfh); # # Image files are to have an index/sequence number # $fcount = 0; # # Keep a note of HTML updates # $updates = 0; # # Initialise the TreeBuilder # my $tree = HTML::TreeBuilder->new; $tree->ignore_unknown(0); $tree->no_expand_entities(1); $tree->p_strict(1); $tree->store_comments(1); $tree->warn(1); # # Load the HTML obtained from the file into the TreeBuilder # $tree->parse_content($notes) or die "HTML::TreeBuilder failed to parse notes: $!\n"; # # Loop through the tree looking for 'data' scheme images # for ( @{ $tree->extract_links('img') } ) { my ( $link, $element, $attr, $tag ) = @$_; $uri = URI->new($link); unless ($silent) { say "Scheme: ", $uri->scheme; say "Media type: ", $uri->media_type; } # # We only care about 'data' scheme stuff - for now anyway, and only # images within this set # if ( $uri->scheme eq 'data' ) { # # Only images # if ( $uri->media_type =~ /^image/ ) { # # Extract the file name suffix from the MIME string, and give it # a leading '.' # ( $suffix = $uri->media_type ) =~ s{^.*/}{.}; # # Construct the filename for this image making sure it's in # the directory where the HTML is located. # # ${fileprefix}_${prefix}_${increment}.${extension} # $fcount++; $filename = "$dirname/${basename}_${prefix}_${fcount}${suffix}"; say "Writing to: $filename" unless $silent; say '-' x 40 unless $silent; # # If the file exists and --force is not active, warn and skip. # Otherwise write the file. # if ( -e $filename && !$force ) { warn "File $filename exists; not overwriting\n"; } else { # # Write the data to the file in binary format. The URI # module does the conversion so it's already binary. # $fh = path($filename)->openw_raw; print $fh $uri->data; close($fh); } # # Update the HTML with a link to the file we created (or that # was already there from an earlier time). # $updates++; $newURL = path($filename)->basename; $element->attr( $attr, $newURL ); } } } # extract_links loop # # Output the changed HTML turning what became standalone back into # a fragment. # if ($updates > 0) { my $body = $tree->look_down( _tag => 'body' ); ( my $result = $body->as_HTML( undef, ' ', {} ) ) =~ s{(^]*>|$)}{}gi; #$notesfile = path($notesfile)->basename; if ($backup) { if (_backup( $abs_nf, $bsuffix, $backupcount )) { say "$notesfile backed up" unless $silent; } } else { say "$notesfile not backed up" unless $silent; } $fh = path($notesfile)->openw; say $fh $result; close($fh); say "$updates images converted, $notesfile updated" unless $silent; } else { say "No images found, no changes made to $notesfile" unless $silent; } } # $notesfile loop exit; #=== FUNCTION ================================================================ # NAME: _backup # PURPOSE: Given a file, make a backup of it by appending $suffix, and # handle previous backups # PARAMETERS: $filename path of file to backup # $suffix suffix to use, default 'bck' # $limit number of backups to keep # RETURNS: True or false depending on success # DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't. # Rationalises the $limit to avoid it being 0 or less. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub _backup { my ( $filename, $suffix, $limit ) = @_; my ( $backupname, $limitname ); # # Check the target file exists # unless ( -e $filename ) { warn "Unable to find $filename to backup\n"; return 0; } # # Handle invalid $limit values # $limit = 1 unless ( $limit >= 1 ); # # Defaults # $suffix = 'bck' unless $suffix; $limitname = "$filename.$suffix.$limit"; $backupname = "$filename.$suffix"; # # Look for existing backups # if ( -e $backupname ) { # # If maximum version exists delete it # if ( -e $limitname ) { unlink($limitname); } # # Go backwards through the versions incrementing their version numbers # for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) { if ( -e "$filename.$suffix.$vsn" ) { move( "$filename.$suffix.$vsn", sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) ); } } # # Make $filename.bck into $filename.bck.1 # move( "$filename.$suffix", "$filename.$suffix.1" ); } # # Finally save the $filename as $filename.bck # move( $filename, "$filename.$suffix" ); return 1; } #=== FUNCTION ================================================================ # NAME: coalesce # PURPOSE: To find the first defined argument and return it # PARAMETERS: Arbitrary number of arguments # RETURNS: The first defined argument or undef if there are none # DESCRIPTION: Just a simple way of ensuring an 'undef' value is never # returned when doing so might be a problem. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub coalesce { foreach (@_) { return $_ if defined($_); } return undef; ## no critic } #=== FUNCTION ================================================================ # NAME: _debug # PURPOSE: Prints debug reports # PARAMETERS: $active Boolean: 1 for print, 0 for no print # $messages... Arbitrary list of messages to print # RETURNS: Nothing # DESCRIPTION: Outputs messages if $active is true. It removes any trailing # newline from each one and then adds one in the 'print' to the # caller doesn't have to bother. Prepends each message with 'D>' # to show it's a debug message. # THROWS: No exceptions # COMMENTS: Differs from other functions of the same name # SEE ALSO: N/A #=============================================================================== sub _debug { my $active = shift; my $message; return unless $active; while ($message = shift) { chomp($message); print STDERR "D> $message\n"; } } #=== FUNCTION ================================================================ # NAME: Options # PURPOSE: Processes command-line options # PARAMETERS: $optref Hash reference to hold the options # RETURNS: Undef # DESCRIPTION: Process the options we want to offer. See the documentation # for details # THROWS: no exceptions # COMMENTS: none # SEE ALSO: n/a #=============================================================================== sub Options { my ($optref) = @_; my @options = ( "help", "documentation|man", "debug=i", "silent!", "prefix=s", "backup!", "force!", # "config=s", ); if ( !GetOptions( $optref, @options ) ) { pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ); } return; } __END__ #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Application Documentation #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #{{{ =head1 NAME extract_images - extract embedded images from HTML and save as files =head1 VERSION This documentation refers to extract_images version 0.0.3 =head1 USAGE extract_images [--help] [--documentation|man] [-debug=N] [--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent] HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ] Examples: extract_images --prefix=picture_ --backup index.html extract_images --silent --nobackup shownotes.html =head1 REQUIRED ARGUMENTS The script requires the names of one or more HTML files which will be scanned for embedded images. =head1 OPTIONS Note that all on/off options can be written as B<--silent>, B<--nosilent> or B<--no-silent>. A single hyphen can be used at the start or a pair. =over 8 =item B<--help> Reports brief information about how to use the script and exits. To see the full documentation use the option B<--documentation> or B<--man>. Alternatively, to generate a PDF version use the I tool from I. This can be installed with the cpan tool as App::pod2pdf. =item B<--documentation> or B<--man> Reports full information about how to use the script and exits. Alternatively, to generate a PDF version use the I tool from I. This can be installed with the cpan tool as App::pod2pdf. =item B<--debug=N> Run in debug mode at the level specified by I. Possible values are: =over 4 =item B<0> No debugging (the default). =item B<1> TBA =item B<2> Displays the internal variables used to store the options. =item B<3> TBA =back =item B<--prefix=STRING> Since embedded images have no names, when they are written to image files names are generated for them. These names are built from the following components: =over 4 =item B The name of the HTML file without a suffix. So, if the name is 'index.html', the 'index' part will be used. =item B The prefix string provided by this option, or 'image' if not specified. =item B The images found in the HTML are counted, starting from 1, and this number is used here. =item B The image format extension, taken from the embedded image information. Examples would be 'jpg', 'jpeg' and 'png'. =back The B, B, B and B are joined together to make the file name as follows: __. So the following run of the script would generate the first file shown below assuming the first embedded picture was in 'PNG' format: extract_images --prefix=picture hpr4283.html hpr4283_picture_1.png =item B<--[no]backup> The HTML is modified to leave placeholders referring to any embedded images found as it it processed. The new HTML is written to the original file if B<--nobackup> is specified, or a backup of the original file is made, before writing the new HTML. Making a backup is the default behaviour. =item B<--[no]force> Images are written to the current directory. If an image already exists, and B<--force> has not been specified, the script will stop processing and exit. The default setting of this option is B<--noforce>. =item B<--[no]silent> By default the script reports its progress as it processes an HTML file, but this can be turned off by using this option. =back =head1 DESCRIPTION This Perl script B parses HTML looking for embedded images. Such images use a URI with the scheme 'data' followed by image details including the encoded binary contents. See the Wikipedia article https://en.wikipedia.org/wiki/Data_URI_scheme for details. When such images are found they are written as files (with generated names), and the HTML is updated with these names replacing the original URI. Further work is likely to be required to build full image links, but the majority of work will have been done. The script will not overwrite image files unless the B<--force> option is used, but will overwrite the original HTML file unless B<--backup> is specified. By default details of progress are written to standard output, but this can be preveneted by using the B<--silent> option. =head2 GENERATED IMAGE FILE NAMES These names are built from various elements: __. Where is the base name of the HTML input file, is the string provided with the B<--prefix=STRING> option (or B by default), is a count of image files found during the scan of the HTML, and is the appropriate file extension for the type of image being converted. See the option B<--prefix=STRING> for details of file name generation. =head1 DIAGNOSTICS =over 4 =item B Type: warning The script is attempting to find a file presented to it as an argument, but cannot. It will skip to the next argument and continue. =item B Type: warning The script is checking files presented to it as arguments. The named file has been checked to see if it contains HTML, and it appears it does not. It will skip to the next argument and continue. =item B Type: fatal The script is attempting to open a file presented to it as an argument. Its existence has already been checked but it cannot be opened, possibly due to a permissions issue. =item B Type: fatal The script has attempted to parse the HTML in a file presented to it. This has failed. The error message includes a report from the module used to do this. =item B Type: warning The script is attempting to write an image file copied from the HTML, but has found it already exists. Using the option B<--force> would cause it to be overwritten, but this option is not enabled. The script will not write the file, however, it will still modify the HTML to reference the existing image file. =item B Type: warning The script is attempting to make a backup of one of the HTML file arguments, but is unable to find it. Since this is a rather unusual circumstance (has the file been deleted?), it is assumed it can be regenerated if needed, so the writing of the modified HTML is continued. =head1 CONFIGURATION AND ENVIRONMENT TBA =head1 DEPENDENCIES Data::Dumper File::Copy Getopt::Long HTML::TreeBuilder MIME::Types Path::Tiny Pod::Usage URI =head1 BUGS AND LIMITATIONS There are no known bugs in this module. Please report problems to Dave Morriss (Dave.Morriss@gmail.com) Patches are welcome. =head1 AUTHOR Dave Morriss (Dave.Morriss@gmail.com) =head1 LICENCE AND COPYRIGHT Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See perldoc perlartistic. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. =cut #}}} # [zo to open fold, zc to close] # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker