| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | #!/usr/bin/env perl | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #         FILE: extract_images | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #        USAGE: ./extract_images [--help] [--documentation|man] | 
					
						
							|  |  |  | #               [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent] | 
					
						
							|  |  |  | #               HTML_file [ [HTML_file_2] [HTML_file_3] ... ] | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  DESCRIPTION: Processes HTML files which may have 'data' URIs containing | 
					
						
							|  |  |  | #               images, and extracts these images into files in the same | 
					
						
							|  |  |  | #               directory.  The 'data' scheme links are converted to 'https' | 
					
						
							|  |  |  | #               and reference the extracted files. The modified HTML is | 
					
						
							|  |  |  | #               output, and the original will be saved as a backup if | 
					
						
							|  |  |  | #               requested. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      OPTIONS: --- | 
					
						
							|  |  |  | # REQUIREMENTS: --- | 
					
						
							|  |  |  | #         BUGS: --- | 
					
						
							|  |  |  | #        NOTES: --- | 
					
						
							|  |  |  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | #      VERSION: 0.0.2 | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | #      CREATED: 2024-12-25 10:53:15 | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | #     REVISION: 2024-12-30 09:28:40 | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | # | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use v5.36; | 
					
						
							|  |  |  | use strict; | 
					
						
							|  |  |  | use warnings; | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | use feature qw{ say state try }; | 
					
						
							|  |  |  | no warnings qw{ experimental::try }; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Getopt::Long; | 
					
						
							|  |  |  | use Pod::Usage; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use HTML::TreeBuilder 5 -weak; | 
					
						
							|  |  |  | use URI; | 
					
						
							|  |  |  | use MIME::Types; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Path::Tiny; | 
					
						
							|  |  |  | use File::Copy; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use Data::Dumper; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Version number (Incremented by Vim) | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | our $VERSION = '0.0.2'; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Script and directory names | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ( my $PROG = $0 ) =~ s|.*/||mx; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Declarations | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | my ( $notes,   $typename, $uri ); | 
					
						
							|  |  |  | my ( $fcount,  $basename, $filename, $suffix, $fh ); | 
					
						
							|  |  |  | my ( $updates, $bsuffix,  $newURL ); | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | my $backupcount = 5; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Options and arguments | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # {{{ | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Option defaults | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $DEFDEBUG = 0; | 
					
						
							|  |  |  | my $DEFPREFIX = 'image'; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | my %options; | 
					
						
							|  |  |  | Options( \%options ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Default help shows minimal information | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) | 
					
						
							|  |  |  |     if ( $options{'help'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # The -documentation or -man option shows the full POD documentation through | 
					
						
							|  |  |  | # a pager for convenience | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) | 
					
						
							|  |  |  |     if ( $options{'documentation'} ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Collect options | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | my $DEBUG  = ( defined( $options{debug} )  ? $options{debug}  : $DEFDEBUG ); | 
					
						
							|  |  |  | my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 ); | 
					
						
							|  |  |  | my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX ); | 
					
						
							|  |  |  | my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 ); | 
					
						
							|  |  |  | my $force  = ( defined( $options{force} )  ? $options{force}  : 0 ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Check we have arguments | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | pod2usage( | 
					
						
							|  |  |  |     -msg     => "Missing arguments. One or more HTML file names are needed\n", | 
					
						
							|  |  |  |     -exitval => 1, | 
					
						
							|  |  |  |     -verbose => 0 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | unless ( scalar(@ARGV) > 0 ); | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Clean up the prefix | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | $prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Backup suffix (the dot gets added later) | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | $bsuffix = 'bak'; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Debug the options | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | _debug( | 
					
						
							|  |  |  |     $DEBUG > 1, | 
					
						
							|  |  |  |     '$silent = ' . $silent, | 
					
						
							|  |  |  |     '$prefix = ' . $prefix, | 
					
						
							|  |  |  |     '$backup = ' . $backup, | 
					
						
							|  |  |  |     '$force = ' . $force, | 
					
						
							|  |  |  | ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # }}} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | # Prepare a MIME::Types object | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | my $mt = MIME::Types->new; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | # Loop through the arguments | 
					
						
							|  |  |  | #------------------------------------------------------------------------------- | 
					
						
							|  |  |  | foreach my $notesfile (@ARGV) { | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Get the absolute path of the argument | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     my $abs_nf = path($notesfile)->absolute; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Get the 'dirname' from the absolute path | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     my $dirname = path($abs_nf)->parent->stringify; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     unless (-e $abs_nf) { | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |         warn "Unable to find $notesfile\n"; | 
					
						
							|  |  |  |         next; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Force the MIME type of $notesfile to a string | 
					
						
							|  |  |  |     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     $typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' ); | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Check the MIME type and reject non-HTML | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     unless ($typename eq 'text/html') { | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |         warn "File $abs_nf is not HTML\n"; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |         next | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     say "Reading from $abs_nf\n" unless $silent; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     # Get HTML file basename (no path) without the suffix for building filenames | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     $basename = path($abs_nf)->basename('.html'); | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Read the HTML | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n"; | 
					
						
							|  |  |  |     $notes = <$nfh>; | 
					
						
							|  |  |  |     close($nfh); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     # Image files are to have an index/sequence number | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |     # | 
					
						
							|  |  |  |     $fcount = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Keep a note of HTML updates | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $updates = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Initialise the TreeBuilder | 
					
						
							|  |  |  |     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     my $tree = HTML::TreeBuilder->new; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |     $tree->ignore_unknown(0); | 
					
						
							|  |  |  |     $tree->no_expand_entities(1); | 
					
						
							|  |  |  |     $tree->p_strict(1); | 
					
						
							|  |  |  |     $tree->store_comments(1); | 
					
						
							|  |  |  |     $tree->warn(1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     # Load the HTML obtained from the file into the TreeBuilder | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |     # | 
					
						
							|  |  |  |     $tree->parse_content($notes) | 
					
						
							|  |  |  |         or die "HTML::TreeBuilder failed to parse notes: $!\n"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Loop through the tree looking for 'data' scheme images | 
					
						
							|  |  |  |     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |     for ( @{ $tree->extract_links('img') } ) { | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |         my ( $link, $element, $attr, $tag ) = @$_; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |         $uri = URI->new($link); | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |         unless ($silent) { | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |             say "Scheme: ",     $uri->scheme; | 
					
						
							|  |  |  |             say "Media type: ", $uri->media_type; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # We only care about 'data' scheme stuff - for now anyway, and only | 
					
						
							|  |  |  |         # images within this set | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         if ( $uri->scheme eq 'data' ) { | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             # Only images | 
					
						
							|  |  |  |             # | 
					
						
							|  |  |  |             if ( $uri->media_type =~ /^image/ ) { | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |               # | 
					
						
							|  |  |  |               # Extract the file name suffix from the MIME string, and give it | 
					
						
							|  |  |  |               # a leading '.' | 
					
						
							|  |  |  |               # | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                 ( $suffix = $uri->media_type ) =~ s{^.*/}{.}; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |                 # Construct the filename for this image making sure it's in | 
					
						
							|  |  |  |                 # the directory the HTML is in. | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                 # | 
					
						
							|  |  |  |                 $fcount++; | 
					
						
							|  |  |  |                 $filename | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |                     = "$dirname/${prefix}_${basename}_${fcount}${suffix}"; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                 say "Writing to: $filename" unless $silent; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 say '-' x 40 unless $silent; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |                 # If the file exists and --force is not active, warn and skip. | 
					
						
							|  |  |  |                 # Otherwise write the file. | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                 # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |                 if ( -e $filename && !$force ) { | 
					
						
							|  |  |  |                     warn "File $filename exists; not overwriting\n"; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                 } | 
					
						
							|  |  |  |                 else { | 
					
						
							|  |  |  |                     # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |                     # Write the data to the file in binary format. The URI | 
					
						
							|  |  |  |                     # module does the conversion so it's already binary. | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                     # | 
					
						
							|  |  |  |                     $fh = path($filename)->openw_raw; | 
					
						
							|  |  |  |                     print $fh $uri->data; | 
					
						
							|  |  |  |                     close($fh); | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |                 # Update the HTML with a link to the file we created (or that | 
					
						
							|  |  |  |                 # was already there from an earlier time). | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                 # | 
					
						
							|  |  |  |                 $updates++; | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |                 $newURL = path($filename)->basename; | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |                 $element->attr( $attr, $newURL ); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     }    # extract_links loop | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Output the changed HTML turning what became standalone back into | 
					
						
							|  |  |  |     # a <body> fragment. | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if ($updates > 0) { | 
					
						
							|  |  |  |         my $body = $tree->look_down( _tag => 'body' ); | 
					
						
							|  |  |  |         ( my $result = $body->as_HTML( undef, ' ', {} ) ) | 
					
						
							|  |  |  |             =~ s{(^<body[^>]*>|</body>$)}{}gi; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         #$notesfile = path($notesfile)->basename; | 
					
						
							|  |  |  |         if ($backup) { | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  |             _backup( $abs_nf, $bsuffix, $backupcount ); | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  |             say "$notesfile backed up" unless $silent; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         else { | 
					
						
							|  |  |  |             say "$notesfile not backed up" unless $silent; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         $fh = path($notesfile)->openw; | 
					
						
							|  |  |  |         say $fh $result; | 
					
						
							|  |  |  |         close($fh); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         say "$updates images converted, $notesfile updated" unless $silent; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         say "No images found, no changes made to $notesfile" unless $silent; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } # $notesfile loop | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | exit; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: _backup | 
					
						
							|  |  |  | #      PURPOSE: Given a file, make a backup of it by appending $suffix, and | 
					
						
							|  |  |  | #               handle previous backups | 
					
						
							|  |  |  | #   PARAMETERS: $filename               path of file to backup | 
					
						
							|  |  |  | #               $suffix                 suffix to use, default 'bck' | 
					
						
							|  |  |  | #               $limit                  number of backups to keep | 
					
						
							|  |  |  | #      RETURNS: True or false depending on success | 
					
						
							|  |  |  | #  DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't. | 
					
						
							|  |  |  | #               Rationalises the $limit to avoid it being 0 or less. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub _backup { | 
					
						
							|  |  |  |     my ( $filename, $suffix, $limit ) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my ( $backupname, $limitname ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Check the target file exists | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     unless ( -e $filename ) { | 
					
						
							|  |  |  |         warn "Unable to find $filename to backup\n"; | 
					
						
							|  |  |  |         return 0; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Handle invalid $limit values | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $limit = 1 unless ( $limit >= 1 ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Defaults | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     $suffix     = 'bck' unless $suffix; | 
					
						
							|  |  |  |     $limitname  = "$filename.$suffix.$limit"; | 
					
						
							|  |  |  |     $backupname = "$filename.$suffix"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Look for existing backups | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     if ( -e $backupname ) { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # If maximum version exists delete it | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         if ( -e $limitname ) { | 
					
						
							|  |  |  |             unlink($limitname); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Go backwards through the versions incrementing their version numbers | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) { | 
					
						
							|  |  |  |             if ( -e "$filename.$suffix.$vsn" ) { | 
					
						
							|  |  |  |                 move( "$filename.$suffix.$vsn", | 
					
						
							|  |  |  |                     sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) ); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # Make $filename.bck into $filename.bck.1 | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         move( "$filename.$suffix", "$filename.$suffix.1" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     # Finally save the $filename as $filename.bck | 
					
						
							|  |  |  |     # | 
					
						
							|  |  |  |     move( $filename, "$filename.$suffix" ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return 1; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: coalesce | 
					
						
							|  |  |  | #      PURPOSE: To find the first defined argument and return it | 
					
						
							|  |  |  | #   PARAMETERS: Arbitrary number of arguments | 
					
						
							|  |  |  | #      RETURNS: The first defined argument or undef if there are none | 
					
						
							|  |  |  | #  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never | 
					
						
							|  |  |  | #               returned when doing so might be a problem. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: None | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub coalesce { | 
					
						
							|  |  |  |     foreach (@_) { | 
					
						
							|  |  |  |         return $_ if defined($_); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return undef;    ## no critic | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: _debug | 
					
						
							|  |  |  | #      PURPOSE: Prints debug reports | 
					
						
							|  |  |  | #   PARAMETERS: $active         Boolean: 1 for print, 0 for no print | 
					
						
							|  |  |  | #               $messages...    Arbitrary list of messages to print | 
					
						
							|  |  |  | #      RETURNS: Nothing | 
					
						
							|  |  |  | #  DESCRIPTION: Outputs messages if $active is true. It removes any trailing | 
					
						
							|  |  |  | #               newline from each one and then adds one in the 'print' to the | 
					
						
							|  |  |  | #               caller doesn't have to bother. Prepends each message with 'D>' | 
					
						
							|  |  |  | #               to show it's a debug message. | 
					
						
							|  |  |  | #       THROWS: No exceptions | 
					
						
							|  |  |  | #     COMMENTS: Differs from other functions of the same name | 
					
						
							|  |  |  | #     SEE ALSO: N/A | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub _debug { | 
					
						
							|  |  |  |     my $active = shift; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my $message; | 
					
						
							|  |  |  |     return unless $active; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     while ($message = shift) { | 
					
						
							|  |  |  |         chomp($message); | 
					
						
							|  |  |  |         print STDERR "D> $message\n"; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #===  FUNCTION  ================================================================ | 
					
						
							|  |  |  | #         NAME: Options | 
					
						
							|  |  |  | #      PURPOSE: Processes command-line options | 
					
						
							|  |  |  | #   PARAMETERS: $optref     Hash reference to hold the options | 
					
						
							|  |  |  | #      RETURNS: Undef | 
					
						
							|  |  |  | #  DESCRIPTION: Process the options we want to offer. See the documentation | 
					
						
							|  |  |  | #               for details | 
					
						
							|  |  |  | #       THROWS: no exceptions | 
					
						
							|  |  |  | #     COMMENTS: none | 
					
						
							|  |  |  | #     SEE ALSO: n/a | 
					
						
							|  |  |  | #=============================================================================== | 
					
						
							|  |  |  | sub Options { | 
					
						
							|  |  |  |     my ($optref) = @_; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my @options = ( | 
					
						
							|  |  |  |         "help",     "documentation|man", "debug=i", "silent!", | 
					
						
							|  |  |  |         "prefix=s", "backup!",           "force!", | 
					
						
							|  |  |  |         # "config=s", | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( !GetOptions( $optref, @options ) ) { | 
					
						
							|  |  |  |         pod2usage( | 
					
						
							|  |  |  |             -msg     => "$PROG version $VERSION\n", | 
					
						
							|  |  |  |             -exitval => 1, | 
					
						
							|  |  |  |             -verbose => 0 | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | __END__ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #  Application Documentation | 
					
						
							|  |  |  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 
					
						
							|  |  |  | #{{{ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 NAME | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | extract_images - extract embedded images from HTML and save as files | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 VERSION | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-30 12:15:04 +00:00
										 |  |  | This documentation refers to extract_images version 0.0.2 | 
					
						
							| 
									
										
										
										
											2024-12-29 16:33:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | =head1 USAGE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     extract_images | 
					
						
							|  |  |  |         [--help] [--documentation|man] [-debug=N] | 
					
						
							|  |  |  |         [--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent] | 
					
						
							|  |  |  |         HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Examples: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     extract_images --prefix=picture_ --backup index.html | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     extract_images --silent --nobackup shownotes.html | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 REQUIRED ARGUMENTS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script requires the names of one or more HTML files which will be scanned | 
					
						
							|  |  |  | for embedded images. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 OPTIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Note that all on/off options can be written as B<--silent>, B<--nosilent> or | 
					
						
							|  |  |  | B<--no-silent>. A single hyphen can be used at the start or a pair. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 8 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<--help> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Reports brief information about how to use the script and exits. To see the | 
					
						
							|  |  |  | full documentation use the option B<--documentation> or B<--man>. Alternatively, | 
					
						
							|  |  |  | to generate a PDF version use the I<pod2pdf> tool from | 
					
						
							|  |  |  | I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be | 
					
						
							|  |  |  | installed with the cpan tool as App::pod2pdf. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<--documentation> or B<--man> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Reports full information about how to use the script and exits. Alternatively, | 
					
						
							|  |  |  | to generate a PDF version use the I<pod2pdf> tool from | 
					
						
							|  |  |  | I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be | 
					
						
							|  |  |  | installed with the cpan tool as App::pod2pdf. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<--debug=N> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Run in debug mode at the level specified by I<N>. Possible values are: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<0> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | No debugging (the default). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<1> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<2> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<3> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<--prefix=STRING> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Since embedded images have no names, when they are written to image files | 
					
						
							|  |  |  | names are generated for them. These names are built from the following | 
					
						
							|  |  |  | components: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =over 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<HTML filename> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The name of the HTML file without a suffix. So, if the name is 'index.html', | 
					
						
							|  |  |  | the 'index' part will be used. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<prefix> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The prefix string provided by this option, or 'image' if not specified. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<counter> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The images found in the HTML are counted, starting from 1, and this number is | 
					
						
							|  |  |  | used here. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<extension> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The image format extension, taken from the embedded image information. | 
					
						
							|  |  |  | Examples would be 'jpg', 'jpeg' and 'png'. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The B<HTML filename>, B<prefix>, B<counter> and B<extension> are joined | 
					
						
							|  |  |  | together to make the file name as follows: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     <HTML filename>_<prefix>_<counter>.<extension> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | So the following run of the script would generate the first file shown below | 
					
						
							|  |  |  | assuming the first embedded picture was in 'PNG' format: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     extract_images --prefix=picture hpr4283.html | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     hpr4283_picture_1.png | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<--[no]backup> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The HTML is modified to leave placeholders referring to any embedded images | 
					
						
							|  |  |  | found as it it processed. The new HTML is written to the original file if | 
					
						
							|  |  |  | B<--nobackup> is specified, or a backup of the original file is made, before | 
					
						
							|  |  |  | writing the new HTML. Making a backup is the default behaviour. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<--[no]force> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Images are written to the current directory. If an image already exists, and | 
					
						
							|  |  |  | B<--force> has not been specified, the script will stop processing and exit. | 
					
						
							|  |  |  | The default setting of this option is B<--noforce>. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =item B<--[no]silent> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | By default the script reports its progress as it processes an HTML file, but | 
					
						
							|  |  |  | this can be turned off by using this option. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =back | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DESCRIPTION | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This Perl script B<extract_images> parses HTML looking for embedded images. | 
					
						
							|  |  |  | Such images use a URI with the scheme 'data' followed by image details | 
					
						
							|  |  |  | including the encoded binary contents. See the Wikipedia article | 
					
						
							|  |  |  | https://en.wikipedia.org/wiki/Data_URI_scheme for details. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | When such images are found they are written as files (with generated names), | 
					
						
							|  |  |  | and the HTML is updated with these names replacing the original URI. Further | 
					
						
							|  |  |  | work is likely to be required to build full image links, but the majority of | 
					
						
							|  |  |  | work will have been done. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script will not overwrite image files unless the B<--force> option is | 
					
						
							|  |  |  | used, but will overwrite the original HTML file unless B<--backup> is | 
					
						
							|  |  |  | specified. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | By default details of progress are written to standard output, but this can be | 
					
						
							|  |  |  | preveneted by using the B<--silent> option. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head2 GENERATED IMAGE FILE NAMES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | These names are built from various elements: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     <HTML filename>_<prefix>_<counter>.<extension> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Where <HTML filename> is the base name of the HTML input file, <prefix> is the | 
					
						
							|  |  |  | string provided with the B<--prefix=STRING> option (or B<image> by default), | 
					
						
							|  |  |  | <counter> is a count of image files found during the scan of the HTML, and | 
					
						
							|  |  |  | <extension> is the appropriate file extension for the type of image being | 
					
						
							|  |  |  | converted. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | See the option B<--prefix=STRING> for details of file name generation. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DIAGNOSTICS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 CONFIGURATION AND ENVIRONMENT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TBA | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 DEPENDENCIES | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Data::Dumper | 
					
						
							|  |  |  |     File::Copy | 
					
						
							|  |  |  |     Getopt::Long | 
					
						
							|  |  |  |     HTML::TreeBuilder | 
					
						
							|  |  |  |     MIME::Types | 
					
						
							|  |  |  |     Path::Tiny | 
					
						
							|  |  |  |     Pod::Usage | 
					
						
							|  |  |  |     URI | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 BUGS AND LIMITATIONS | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | There are no known bugs in this module. | 
					
						
							|  |  |  | Please report problems to Dave Morriss (Dave.Morriss@gmail.com) | 
					
						
							|  |  |  | Patches are welcome. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 AUTHOR | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Dave Morriss  (Dave.Morriss@gmail.com) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =head1 LICENCE AND COPYRIGHT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com). | 
					
						
							|  |  |  | All rights reserved. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This module is free software; you can redistribute it and/or | 
					
						
							|  |  |  | modify it under the same terms as Perl itself. See perldoc perlartistic. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | =cut | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #}}} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # [zo to open fold, zc to close] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | 
					
						
							|  |  |  | 
 |