diff --git a/Show_Submission/extract_images b/Show_Submission/extract_images new file mode 100755 index 0000000..f7353fd --- /dev/null +++ b/Show_Submission/extract_images @@ -0,0 +1,654 @@ +#!/usr/bin/env perl +#=============================================================================== +# +# FILE: extract_images +# +# USAGE: ./extract_images [--help] [--documentation|man] +# [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent] +# HTML_file [ [HTML_file_2] [HTML_file_3] ... ] +# +# DESCRIPTION: Processes HTML files which may have 'data' URIs containing +# images, and extracts these images into files in the same +# directory. The 'data' scheme links are converted to 'https' +# and reference the extracted files. The modified HTML is +# output, and the original will be saved as a backup if +# requested. +# +# OPTIONS: --- +# REQUIREMENTS: --- +# BUGS: --- +# NOTES: --- +# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com +# VERSION: 0.0.1 +# CREATED: 2024-12-25 10:53:15 +# REVISION: 2024-12-29 15:19:35 +# +#=============================================================================== + +use v5.36; +use strict; +use warnings; +use feature qw{ postderef say signatures state try }; +no warnings + qw{ experimental::postderef experimental::signatures experimental::try }; + +use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 + +use Getopt::Long; +use Pod::Usage; + +use HTML::TreeBuilder 5 -weak; +use URI; +use MIME::Types; + +#use File::Slurper; +use Path::Tiny; +use File::Copy; + +use Data::Dumper; + +# +# Version number (Incremented by Vim) +# +our $VERSION = '0.0.1'; + +# +# Script and directory names +# +( my $PROG = $0 ) =~ s|.*/||mx; + +#------------------------------------------------------------------------------- +# Declarations +#------------------------------------------------------------------------------- +my ( $notes, $typename, $tree, $uri ); +my ( $fcount, $basename, $filename, $suffix, $fh ); +my ( $updates, $bsuffix, $new_basename, $newURL ); + +my $backupcount = 5; + +#------------------------------------------------------------------------------- +# Options and arguments +#------------------------------------------------------------------------------- +# {{{ +# +# Option defaults +# +my $DEFDEBUG = 0; +my $DEFPREFIX = 'image'; + +my %options; +Options( \%options ); + +# +# Default help shows minimal information +# +pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) + if ( $options{'help'} ); + +# +# The -documentation or -man option shows the full POD documentation through +# a pager for convenience +# +pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) + if ( $options{'documentation'} ); + +# +# Collect options +# +my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); +my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 ); +my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX ); +my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 ); +my $force = ( defined( $options{force} ) ? $options{force} : 0 ); + +# +# Check we have arguments +# +die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n" + unless ( scalar(@ARGV) > 0 ); + +# +# Clean up the prefix +# +$prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix); + +# +# Backup suffix (the dot gets added later) +# +$bsuffix = 'bck'; + +# +# Debug the options +# +_debug( + $DEBUG > 1, + '$silent = ' . $silent, + '$prefix = ' . $prefix, + '$backup = ' . $backup, + '$force = ' . $force, +); + +# }}} + +#------------------------------------------------------------------------------- +# Prepare items for later use: current working directory and a MIME::Types +# object +#------------------------------------------------------------------------------- +my $curdir = Path::Tiny->cwd; + +my $mt = MIME::Types->new; + +#------------------------------------------------------------------------------- +# Loop through the arguments +#------------------------------------------------------------------------------- +foreach my $notesfile (@ARGV) { + unless (-e $notesfile) { + warn "Unable to find $notesfile\n"; + next; + } + + # + # Force the MIME type of $notesfile to a string + # + $typename = "" . $mt->mimeTypeOf("$notesfile"); + + # + # Check the MIME type and reject non-HTML + # + unless ($typename eq 'text/html') { + warn "File $notesfile is not HTML\n"; + next + } + + say "Reading from $notesfile\n" unless $silent; + + # + # Get HTML file basename without the suffix for building filenames + # + $basename = path($notesfile)->basename('.html'); + + # + # Read the HTML + # + open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n"; + $notes = <$nfh>; + close($nfh); + + # + # Image files are to have an index + # + $fcount = 0; + + # + # Keep a note of HTML updates + # + $updates = 0; + + # + # Initialise the TreeBuilder + # + $tree = HTML::TreeBuilder->new; + $tree->ignore_unknown(0); + $tree->no_expand_entities(1); + $tree->p_strict(1); + $tree->store_comments(1); + $tree->warn(1); + + # + # Load this HTML file into the TreeBuilder + # + $tree->parse_content($notes) + or die "HTML::TreeBuilder failed to parse notes: $!\n"; + + # + # Loop through the tree looking for 'data' scheme images + # + for ( @{ $tree->extract_links( 'img' ) } ) { + my ( $link, $element, $attr, $tag ) = @$_; + + $uri = URI->new( $link ); + unless ($silent) { + say "Scheme: ",$uri->scheme; + say "Media type: ",$uri->media_type; + } + + # + # We only care about 'data' scheme stuff - for now anyway, and only + # images within this set + # + if ( $uri->scheme eq 'data' ) { + # + # Only images + # + if ( $uri->media_type =~ /^image/ ) { + # + # Extract the file name suffix from the MIME string, and give it + # a leading '.' + # + ( $suffix = $uri->media_type ) =~ s{^.*/}{.}; + + # + # Construct the filename for this image + # + $fcount++; + $filename + = "$curdir/${prefix}_${basename}_${fcount}${suffix}"; + say "Writing to: $filename" unless $silent; + + say '-' x 40 unless $silent; + + # + # Check if the file exists. Don't clobber it unless --force is active + # + if ( -e $filename ) { + unless ($force) { + warn "File $filename exists; not overwriting\n"; + } + } + else { + # + # Write the data to the file in binary format. The URI module + # does the conversion so it's already binary. + # + $fh = path($filename)->openw_raw; + print $fh $uri->data; + close($fh); + } + + # + # Update the HTML with a link to the file we created + # + $updates++; + $newURL = path($filename)->basename; + $element->attr( $attr, $newURL ); + } + } + } # extract_links loop + + # + # Output the changed HTML turning what became standalone back into + # a
fragment. + # + if ($updates > 0) { + my $body = $tree->look_down( _tag => 'body' ); + ( my $result = $body->as_HTML( undef, ' ', {} ) ) + =~ s{(^]*>|$)}{}gi; + + #$notesfile = path($notesfile)->basename; + if ($backup) { + _backup( $notesfile, $bsuffix, $backupcount ); + say "$notesfile backed up" unless $silent; + } + else { + say "$notesfile not backed up" unless $silent; + } + + $fh = path($notesfile)->openw; + say $fh $result; + close($fh); + + say "$updates images converted, $notesfile updated" unless $silent; + } + else { + say "No images found, no changes made to $notesfile" unless $silent; + } + +} # $notesfile loop + +exit; + +#=== FUNCTION ================================================================ +# NAME: _backup +# PURPOSE: Given a file, make a backup of it by appending $suffix, and +# handle previous backups +# PARAMETERS: $filename path of file to backup +# $suffix suffix to use, default 'bck' +# $limit number of backups to keep +# RETURNS: True or false depending on success +# DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't. +# Rationalises the $limit to avoid it being 0 or less. +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub _backup { + my ( $filename, $suffix, $limit ) = @_; + + my ( $backupname, $limitname ); + + # + # Check the target file exists + # + unless ( -e $filename ) { + warn "Unable to find $filename to backup\n"; + return 0; + } + + # + # Handle invalid $limit values + # + $limit = 1 unless ( $limit >= 1 ); + + # + # Defaults + # + $suffix = 'bck' unless $suffix; + $limitname = "$filename.$suffix.$limit"; + $backupname = "$filename.$suffix"; + + # + # Look for existing backups + # + if ( -e $backupname ) { + + # + # If maximum version exists delete it + # + if ( -e $limitname ) { + unlink($limitname); + } + + # + # Go backwards through the versions incrementing their version numbers + # + for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) { + if ( -e "$filename.$suffix.$vsn" ) { + move( "$filename.$suffix.$vsn", + sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) ); + } + } + + # + # Make $filename.bck into $filename.bck.1 + # + move( "$filename.$suffix", "$filename.$suffix.1" ); + + } + + # + # Finally save the $filename as $filename.bck + # + move( $filename, "$filename.$suffix" ); + + return 1; +} + +#=== FUNCTION ================================================================ +# NAME: _debug +# PURPOSE: Prints debug reports +# PARAMETERS: $active Boolean: 1 for print, 0 for no print +# $messages... Arbitrary list of messages to print +# RETURNS: Nothing +# DESCRIPTION: Outputs messages if $active is true. It removes any trailing +# newline from each one and then adds one in the 'print' to the +# caller doesn't have to bother. Prepends each message with 'D>' +# to show it's a debug message. +# THROWS: No exceptions +# COMMENTS: Differs from other functions of the same name +# SEE ALSO: N/A +#=============================================================================== +sub _debug { + my $active = shift; + + my $message; + return unless $active; + + while ($message = shift) { + chomp($message); + print STDERR "D> $message\n"; + } +} + +#=== FUNCTION ================================================================ +# NAME: Options +# PURPOSE: Processes command-line options +# PARAMETERS: $optref Hash reference to hold the options +# RETURNS: Undef +# DESCRIPTION: Process the options we want to offer. See the documentation +# for details +# THROWS: no exceptions +# COMMENTS: none +# SEE ALSO: n/a +#=============================================================================== +sub Options { + my ($optref) = @_; + + my @options = ( + "help", "documentation|man", "debug=i", "silent!", + "prefix=s", "backup!", "force!", + # "config=s", + ); + + if ( !GetOptions( $optref, @options ) ) { + pod2usage( + -msg => "$PROG version $VERSION\n", + -exitval => 1, + -verbose => 0 + ); + } + + return; +} + +__END__ + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Application Documentation +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#{{{ + +=head1 NAME + +extract_images - extract embedded images from HTML and save as files + +=head1 VERSION + +This documentation refers to extract_images version 0.0.1 + +=head1 USAGE + + extract_images + [--help] [--documentation|man] [-debug=N] + [--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent] + HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ] + + Examples: + + extract_images --prefix=picture_ --backup index.html + + extract_images --silent --nobackup shownotes.html + +=head1 REQUIRED ARGUMENTS + +The script requires the names of one or more HTML files which will be scanned +for embedded images. + +=head1 OPTIONS + +Note that all on/off options can be written as B<--silent>, B<--nosilent> or +B<--no-silent>. A single hyphen can be used at the start or a pair. + +=over 8 + +=item B<--help> + +Reports brief information about how to use the script and exits. To see the +full documentation use the option B<--documentation> or B<--man>. Alternatively, +to generate a PDF version use the I