#!/usr/bin/env perl #=============================================================================== # # FILE: extract_images # # USAGE: ./extract_images [--help] [--documentation|man] # [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent] # HTML_file [ [HTML_file_2] [HTML_file_3] ... ] # # DESCRIPTION: Processes HTML files which may have 'data' URIs containing # images, and extracts these images into files in the same # directory. The 'data' scheme links are converted to 'https' # and reference the extracted files. The modified HTML is # output, and the original will be saved as a backup if # requested. # # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com # VERSION: 0.0.3 # CREATED: 2024-12-25 10:53:15 # REVISION: 2024-12-30 11:52:59 # #=============================================================================== use v5.36; use strict; use warnings; use feature qw{ say state try }; no warnings qw{ experimental::try }; use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 use Getopt::Long; use Pod::Usage; use HTML::TreeBuilder 5 -weak; use URI; use MIME::Types; use Path::Tiny; use File::Copy; use Data::Dumper; # # Version number (Incremented by Vim) # our $VERSION = '0.0.3'; # # Script and directory names # ( my $PROG = $0 ) =~ s|.*/||mx; #------------------------------------------------------------------------------- # Declarations #------------------------------------------------------------------------------- my ( $notes, $typename, $uri ); my ( $fcount, $basename, $filename, $suffix, $fh ); my ( $updates, $bsuffix, $newURL ); my $backupcount = 5; #------------------------------------------------------------------------------- # Options and arguments #------------------------------------------------------------------------------- # {{{ # # Option defaults # my $DEFDEBUG = 0; my $DEFPREFIX = 'image'; my %options; Options( \%options ); # # Default help shows minimal information # pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) if ( $options{'help'} ); # # The -documentation or -man option shows the full POD documentation through # a pager for convenience # pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) if ( $options{'documentation'} ); # # Collect options # my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 ); my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX ); my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 ); my $force = ( defined( $options{force} ) ? $options{force} : 0 ); # # Check we have arguments # pod2usage( -msg => "Missing arguments. One or more HTML file names are needed\n", -exitval => 1, -verbose => 0 ) unless ( scalar(@ARGV) > 0 ); # # Clean up the prefix # $prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix); # # Backup suffix (the dot gets added later) # $bsuffix = 'bak'; # # Debug the options # _debug( $DEBUG > 1, '$silent = ' . $silent, '$prefix = ' . $prefix, '$backup = ' . $backup, '$force = ' . $force, ); # }}} #------------------------------------------------------------------------------- # Prepare a MIME::Types object #------------------------------------------------------------------------------- my $mt = MIME::Types->new; #------------------------------------------------------------------------------- # Loop through the arguments #------------------------------------------------------------------------------- foreach my $notesfile (@ARGV) { # # Get the absolute path of the argument # my $abs_nf = path($notesfile)->absolute; # # Get the 'dirname' from the absolute path # my $dirname = path($abs_nf)->parent->stringify; unless (-e $abs_nf) { warn "Unable to find $notesfile\n"; next; } # # Force the MIME type of $notesfile to a string # $typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' ); # # Check the MIME type and reject non-HTML # unless ($typename eq 'text/html') { warn "File $abs_nf is not HTML\n"; next } say "Reading from $abs_nf\n" unless $silent; # # Get HTML file basename (no path) without the suffix for building filenames # $basename = path($abs_nf)->basename('.html'); # # Read the HTML # open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n"; $notes = <$nfh>; close($nfh); # # Image files are to have an index/sequence number # $fcount = 0; # # Keep a note of HTML updates # $updates = 0; # # Initialise the TreeBuilder # my $tree = HTML::TreeBuilder->new; $tree->ignore_unknown(0); $tree->no_expand_entities(1); $tree->p_strict(1); $tree->store_comments(1); $tree->warn(1); # # Load the HTML obtained from the file into the TreeBuilder # $tree->parse_content($notes) or die "HTML::TreeBuilder failed to parse notes: $!\n"; # # Loop through the tree looking for 'data' scheme images # for ( @{ $tree->extract_links('img') } ) { my ( $link, $element, $attr, $tag ) = @$_; $uri = URI->new($link); unless ($silent) { say "Scheme: ", $uri->scheme; say "Media type: ", $uri->media_type; } # # We only care about 'data' scheme stuff - for now anyway, and only # images within this set # if ( $uri->scheme eq 'data' ) { # # Only images # if ( $uri->media_type =~ /^image/ ) { # # Extract the file name suffix from the MIME string, and give it # a leading '.' # ( $suffix = $uri->media_type ) =~ s{^.*/}{.}; # # Construct the filename for this image making sure it's in # the directory the HTML is in. # $fcount++; # ${fileprefix}_${prefix}_${increment}.${extension} $filename = "$dirname/${basename}_${prefix}_${fcount}${suffix}"; say "Writing to: $filename" unless $silent; say '-' x 40 unless $silent; # # If the file exists and --force is not active, warn and skip. # Otherwise write the file. # if ( -e $filename && !$force ) { warn "File $filename exists; not overwriting\n"; } else { # # Write the data to the file in binary format. The URI # module does the conversion so it's already binary. # $fh = path($filename)->openw_raw; print $fh $uri->data; close($fh); } # # Update the HTML with a link to the file we created (or that # was already there from an earlier time). # $updates++; $newURL = path($filename)->basename; $element->attr( $attr, $newURL ); } } } # extract_links loop # # Output the changed HTML turning what became standalone back into # a
fragment. # if ($updates > 0) { my $body = $tree->look_down( _tag => 'body' ); ( my $result = $body->as_HTML( undef, ' ', {} ) ) =~ s{(^]*>|$)}{}gi; #$notesfile = path($notesfile)->basename; if ($backup) { _backup( $abs_nf, $bsuffix, $backupcount ); say "$notesfile backed up" unless $silent; } else { say "$notesfile not backed up" unless $silent; } $fh = path($notesfile)->openw; say $fh $result; close($fh); say "$updates images converted, $notesfile updated" unless $silent; } else { say "No images found, no changes made to $notesfile" unless $silent; } } # $notesfile loop exit; #=== FUNCTION ================================================================ # NAME: _backup # PURPOSE: Given a file, make a backup of it by appending $suffix, and # handle previous backups # PARAMETERS: $filename path of file to backup # $suffix suffix to use, default 'bck' # $limit number of backups to keep # RETURNS: True or false depending on success # DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't. # Rationalises the $limit to avoid it being 0 or less. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub _backup { my ( $filename, $suffix, $limit ) = @_; my ( $backupname, $limitname ); # # Check the target file exists # unless ( -e $filename ) { warn "Unable to find $filename to backup\n"; return 0; } # # Handle invalid $limit values # $limit = 1 unless ( $limit >= 1 ); # # Defaults # $suffix = 'bck' unless $suffix; $limitname = "$filename.$suffix.$limit"; $backupname = "$filename.$suffix"; # # Look for existing backups # if ( -e $backupname ) { # # If maximum version exists delete it # if ( -e $limitname ) { unlink($limitname); } # # Go backwards through the versions incrementing their version numbers # for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) { if ( -e "$filename.$suffix.$vsn" ) { move( "$filename.$suffix.$vsn", sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) ); } } # # Make $filename.bck into $filename.bck.1 # move( "$filename.$suffix", "$filename.$suffix.1" ); } # # Finally save the $filename as $filename.bck # move( $filename, "$filename.$suffix" ); return 1; } #=== FUNCTION ================================================================ # NAME: coalesce # PURPOSE: To find the first defined argument and return it # PARAMETERS: Arbitrary number of arguments # RETURNS: The first defined argument or undef if there are none # DESCRIPTION: Just a simple way of ensuring an 'undef' value is never # returned when doing so might be a problem. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub coalesce { foreach (@_) { return $_ if defined($_); } return undef; ## no critic } #=== FUNCTION ================================================================ # NAME: _debug # PURPOSE: Prints debug reports # PARAMETERS: $active Boolean: 1 for print, 0 for no print # $messages... Arbitrary list of messages to print # RETURNS: Nothing # DESCRIPTION: Outputs messages if $active is true. It removes any trailing # newline from each one and then adds one in the 'print' to the # caller doesn't have to bother. Prepends each message with 'D>' # to show it's a debug message. # THROWS: No exceptions # COMMENTS: Differs from other functions of the same name # SEE ALSO: N/A #=============================================================================== sub _debug { my $active = shift; my $message; return unless $active; while ($message = shift) { chomp($message); print STDERR "D> $message\n"; } } #=== FUNCTION ================================================================ # NAME: Options # PURPOSE: Processes command-line options # PARAMETERS: $optref Hash reference to hold the options # RETURNS: Undef # DESCRIPTION: Process the options we want to offer. See the documentation # for details # THROWS: no exceptions # COMMENTS: none # SEE ALSO: n/a #=============================================================================== sub Options { my ($optref) = @_; my @options = ( "help", "documentation|man", "debug=i", "silent!", "prefix=s", "backup!", "force!", # "config=s", ); if ( !GetOptions( $optref, @options ) ) { pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ); } return; } __END__ #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Application Documentation #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #{{{ =head1 NAME extract_images - extract embedded images from HTML and save as files =head1 VERSION This documentation refers to extract_images version 0.0.3 =head1 USAGE extract_images [--help] [--documentation|man] [-debug=N] [--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent] HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ] Examples: extract_images --prefix=picture_ --backup index.html extract_images --silent --nobackup shownotes.html =head1 REQUIRED ARGUMENTS The script requires the names of one or more HTML files which will be scanned for embedded images. =head1 OPTIONS Note that all on/off options can be written as B<--silent>, B<--nosilent> or B<--no-silent>. A single hyphen can be used at the start or a pair. =over 8 =item B<--help> Reports brief information about how to use the script and exits. To see the full documentation use the option B<--documentation> or B<--man>. Alternatively, to generate a PDF version use the I