From 2f350dd1dbf1b782bf340d35cf1a5f1756c5cd01 Mon Sep 17 00:00:00 2001 From: Dave Morriss Date: Tue, 31 Dec 2024 17:32:27 +0000 Subject: [PATCH] Correction to the component order of the generated file name --- 37567b_extract_images | 654 --------------------------------- Show_Submission/extract_images | 11 +- 2 files changed, 6 insertions(+), 659 deletions(-) delete mode 100644 37567b_extract_images diff --git a/37567b_extract_images b/37567b_extract_images deleted file mode 100644 index f7353fd..0000000 --- a/37567b_extract_images +++ /dev/null @@ -1,654 +0,0 @@ -#!/usr/bin/env perl -#=============================================================================== -# -# FILE: extract_images -# -# USAGE: ./extract_images [--help] [--documentation|man] -# [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent] -# HTML_file [ [HTML_file_2] [HTML_file_3] ... ] -# -# DESCRIPTION: Processes HTML files which may have 'data' URIs containing -# images, and extracts these images into files in the same -# directory. The 'data' scheme links are converted to 'https' -# and reference the extracted files. The modified HTML is -# output, and the original will be saved as a backup if -# requested. -# -# OPTIONS: --- -# REQUIREMENTS: --- -# BUGS: --- -# NOTES: --- -# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.1 -# CREATED: 2024-12-25 10:53:15 -# REVISION: 2024-12-29 15:19:35 -# -#=============================================================================== - -use v5.36; -use strict; -use warnings; -use feature qw{ postderef say signatures state try }; -no warnings - qw{ experimental::postderef experimental::signatures experimental::try }; - -use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 - -use Getopt::Long; -use Pod::Usage; - -use HTML::TreeBuilder 5 -weak; -use URI; -use MIME::Types; - -#use File::Slurper; -use Path::Tiny; -use File::Copy; - -use Data::Dumper; - -# -# Version number (Incremented by Vim) -# -our $VERSION = '0.0.1'; - -# -# Script and directory names -# -( my $PROG = $0 ) =~ s|.*/||mx; - -#------------------------------------------------------------------------------- -# Declarations -#------------------------------------------------------------------------------- -my ( $notes, $typename, $tree, $uri ); -my ( $fcount, $basename, $filename, $suffix, $fh ); -my ( $updates, $bsuffix, $new_basename, $newURL ); - -my $backupcount = 5; - -#------------------------------------------------------------------------------- -# Options and arguments -#------------------------------------------------------------------------------- -# {{{ -# -# Option defaults -# -my $DEFDEBUG = 0; -my $DEFPREFIX = 'image'; - -my %options; -Options( \%options ); - -# -# Default help shows minimal information -# -pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 ) - if ( $options{'help'} ); - -# -# The -documentation or -man option shows the full POD documentation through -# a pager for convenience -# -pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 ) - if ( $options{'documentation'} ); - -# -# Collect options -# -my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG ); -my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 ); -my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX ); -my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 ); -my $force = ( defined( $options{force} ) ? $options{force} : 0 ); - -# -# Check we have arguments -# -die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n" - unless ( scalar(@ARGV) > 0 ); - -# -# Clean up the prefix -# -$prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix); - -# -# Backup suffix (the dot gets added later) -# -$bsuffix = 'bck'; - -# -# Debug the options -# -_debug( - $DEBUG > 1, - '$silent = ' . $silent, - '$prefix = ' . $prefix, - '$backup = ' . $backup, - '$force = ' . $force, -); - -# }}} - -#------------------------------------------------------------------------------- -# Prepare items for later use: current working directory and a MIME::Types -# object -#------------------------------------------------------------------------------- -my $curdir = Path::Tiny->cwd; - -my $mt = MIME::Types->new; - -#------------------------------------------------------------------------------- -# Loop through the arguments -#------------------------------------------------------------------------------- -foreach my $notesfile (@ARGV) { - unless (-e $notesfile) { - warn "Unable to find $notesfile\n"; - next; - } - - # - # Force the MIME type of $notesfile to a string - # - $typename = "" . $mt->mimeTypeOf("$notesfile"); - - # - # Check the MIME type and reject non-HTML - # - unless ($typename eq 'text/html') { - warn "File $notesfile is not HTML\n"; - next - } - - say "Reading from $notesfile\n" unless $silent; - - # - # Get HTML file basename without the suffix for building filenames - # - $basename = path($notesfile)->basename('.html'); - - # - # Read the HTML - # - open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n"; - $notes = <$nfh>; - close($nfh); - - # - # Image files are to have an index - # - $fcount = 0; - - # - # Keep a note of HTML updates - # - $updates = 0; - - # - # Initialise the TreeBuilder - # - $tree = HTML::TreeBuilder->new; - $tree->ignore_unknown(0); - $tree->no_expand_entities(1); - $tree->p_strict(1); - $tree->store_comments(1); - $tree->warn(1); - - # - # Load this HTML file into the TreeBuilder - # - $tree->parse_content($notes) - or die "HTML::TreeBuilder failed to parse notes: $!\n"; - - # - # Loop through the tree looking for 'data' scheme images - # - for ( @{ $tree->extract_links( 'img' ) } ) { - my ( $link, $element, $attr, $tag ) = @$_; - - $uri = URI->new( $link ); - unless ($silent) { - say "Scheme: ",$uri->scheme; - say "Media type: ",$uri->media_type; - } - - # - # We only care about 'data' scheme stuff - for now anyway, and only - # images within this set - # - if ( $uri->scheme eq 'data' ) { - # - # Only images - # - if ( $uri->media_type =~ /^image/ ) { - # - # Extract the file name suffix from the MIME string, and give it - # a leading '.' - # - ( $suffix = $uri->media_type ) =~ s{^.*/}{.}; - - # - # Construct the filename for this image - # - $fcount++; - $filename - = "$curdir/${prefix}_${basename}_${fcount}${suffix}"; - say "Writing to: $filename" unless $silent; - - say '-' x 40 unless $silent; - - # - # Check if the file exists. Don't clobber it unless --force is active - # - if ( -e $filename ) { - unless ($force) { - warn "File $filename exists; not overwriting\n"; - } - } - else { - # - # Write the data to the file in binary format. The URI module - # does the conversion so it's already binary. - # - $fh = path($filename)->openw_raw; - print $fh $uri->data; - close($fh); - } - - # - # Update the HTML with a link to the file we created - # - $updates++; - $newURL = path($filename)->basename; - $element->attr( $attr, $newURL ); - } - } - } # extract_links loop - - # - # Output the changed HTML turning what became standalone back into - # a fragment. - # - if ($updates > 0) { - my $body = $tree->look_down( _tag => 'body' ); - ( my $result = $body->as_HTML( undef, ' ', {} ) ) - =~ s{(^]*>|$)}{}gi; - - #$notesfile = path($notesfile)->basename; - if ($backup) { - _backup( $notesfile, $bsuffix, $backupcount ); - say "$notesfile backed up" unless $silent; - } - else { - say "$notesfile not backed up" unless $silent; - } - - $fh = path($notesfile)->openw; - say $fh $result; - close($fh); - - say "$updates images converted, $notesfile updated" unless $silent; - } - else { - say "No images found, no changes made to $notesfile" unless $silent; - } - -} # $notesfile loop - -exit; - -#=== FUNCTION ================================================================ -# NAME: _backup -# PURPOSE: Given a file, make a backup of it by appending $suffix, and -# handle previous backups -# PARAMETERS: $filename path of file to backup -# $suffix suffix to use, default 'bck' -# $limit number of backups to keep -# RETURNS: True or false depending on success -# DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't. -# Rationalises the $limit to avoid it being 0 or less. -# THROWS: No exceptions -# COMMENTS: None -# SEE ALSO: N/A -#=============================================================================== -sub _backup { - my ( $filename, $suffix, $limit ) = @_; - - my ( $backupname, $limitname ); - - # - # Check the target file exists - # - unless ( -e $filename ) { - warn "Unable to find $filename to backup\n"; - return 0; - } - - # - # Handle invalid $limit values - # - $limit = 1 unless ( $limit >= 1 ); - - # - # Defaults - # - $suffix = 'bck' unless $suffix; - $limitname = "$filename.$suffix.$limit"; - $backupname = "$filename.$suffix"; - - # - # Look for existing backups - # - if ( -e $backupname ) { - - # - # If maximum version exists delete it - # - if ( -e $limitname ) { - unlink($limitname); - } - - # - # Go backwards through the versions incrementing their version numbers - # - for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) { - if ( -e "$filename.$suffix.$vsn" ) { - move( "$filename.$suffix.$vsn", - sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) ); - } - } - - # - # Make $filename.bck into $filename.bck.1 - # - move( "$filename.$suffix", "$filename.$suffix.1" ); - - } - - # - # Finally save the $filename as $filename.bck - # - move( $filename, "$filename.$suffix" ); - - return 1; -} - -#=== FUNCTION ================================================================ -# NAME: _debug -# PURPOSE: Prints debug reports -# PARAMETERS: $active Boolean: 1 for print, 0 for no print -# $messages... Arbitrary list of messages to print -# RETURNS: Nothing -# DESCRIPTION: Outputs messages if $active is true. It removes any trailing -# newline from each one and then adds one in the 'print' to the -# caller doesn't have to bother. Prepends each message with 'D>' -# to show it's a debug message. -# THROWS: No exceptions -# COMMENTS: Differs from other functions of the same name -# SEE ALSO: N/A -#=============================================================================== -sub _debug { - my $active = shift; - - my $message; - return unless $active; - - while ($message = shift) { - chomp($message); - print STDERR "D> $message\n"; - } -} - -#=== FUNCTION ================================================================ -# NAME: Options -# PURPOSE: Processes command-line options -# PARAMETERS: $optref Hash reference to hold the options -# RETURNS: Undef -# DESCRIPTION: Process the options we want to offer. See the documentation -# for details -# THROWS: no exceptions -# COMMENTS: none -# SEE ALSO: n/a -#=============================================================================== -sub Options { - my ($optref) = @_; - - my @options = ( - "help", "documentation|man", "debug=i", "silent!", - "prefix=s", "backup!", "force!", - # "config=s", - ); - - if ( !GetOptions( $optref, @options ) ) { - pod2usage( - -msg => "$PROG version $VERSION\n", - -exitval => 1, - -verbose => 0 - ); - } - - return; -} - -__END__ - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Application Documentation -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -#{{{ - -=head1 NAME - -extract_images - extract embedded images from HTML and save as files - -=head1 VERSION - -This documentation refers to extract_images version 0.0.1 - -=head1 USAGE - - extract_images - [--help] [--documentation|man] [-debug=N] - [--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent] - HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ] - - Examples: - - extract_images --prefix=picture_ --backup index.html - - extract_images --silent --nobackup shownotes.html - -=head1 REQUIRED ARGUMENTS - -The script requires the names of one or more HTML files which will be scanned -for embedded images. - -=head1 OPTIONS - -Note that all on/off options can be written as B<--silent>, B<--nosilent> or -B<--no-silent>. A single hyphen can be used at the start or a pair. - -=over 8 - -=item B<--help> - -Reports brief information about how to use the script and exits. To see the -full documentation use the option B<--documentation> or B<--man>. Alternatively, -to generate a PDF version use the I tool from -I. This can be -installed with the cpan tool as App::pod2pdf. - -=item B<--documentation> or B<--man> - -Reports full information about how to use the script and exits. Alternatively, -to generate a PDF version use the I tool from -I. This can be -installed with the cpan tool as App::pod2pdf. - -=item B<--debug=N> - -Run in debug mode at the level specified by I. Possible values are: - -=over 4 - -=item B<0> - -No debugging (the default). - -=item B<1> - -TBA - -=item B<2> - -TBA - -=item B<3> - -TBA - -=back - -=item B<--prefix=STRING> - -Since embedded images have no names, when they are written to image files -names are generated for them. These names are built from the following -components: - -=over 4 - -=item B - -The name of the HTML file without a suffix. So, if the name is 'index.html', -the 'index' part will be used. - -=item B - -The prefix string provided by this option, or 'image' if not specified. - -=item B - -The images found in the HTML are counted, starting from 1, and this number is -used here. - -=item B - -The image format extension, taken from the embedded image information. -Examples would be 'jpg', 'jpeg' and 'png'. - -=back - -The B, B, B and B are joined -together to make the file name as follows: - - __. - -So the following run of the script would generate the first file shown below -assuming the first embedded picture was in 'PNG' format: - - extract_images --prefix=picture hpr4283.html - - hpr4283_picture_1.png - -=item B<--[no]backup> - -The HTML is modified to leave placeholders referring to any embedded images -found as it it processed. The new HTML is written to the original file if -B<--nobackup> is specified, or a backup of the original file is made, before -writing the new HTML. Making a backup is the default behaviour. - -=item B<--[no]force> - -Images are written to the current directory. If an image already exists, and -B<--force> has not been specified, the script will stop processing and exit. -The default setting of this option is B<--noforce>. - -=item B<--[no]silent> - -By default the script reports its progress as it processes an HTML file, but -this can be turned off by using this option. - -=back - -=head1 DESCRIPTION - -This Perl script B parses HTML looking for embedded images. -Such images use a URI with the scheme 'data' followed by image details -including the encoded binary contents. See the Wikipedia article -https://en.wikipedia.org/wiki/Data_URI_scheme for details. - -When such images are found they are written as files (with generated names), -and the HTML is updated with these names replacing the original URI. Further -work is likely to be required to build full image links, but the majority of -work will have been done. - -The script will not overwrite image files unless the B<--force> option is -used, but will overwrite the original HTML file unless B<--backup> is -specified. - -By default details of progress are written to standard output, but this can be -preveneted by using the B<--silent> option. - -=head2 GENERATED IMAGE FILE NAMES - -These names are built from various elements: - - __. - -Where is the base name of the HTML input file, is the -string provided with the B<--prefix=STRING> option (or B by default), - is a count of image files found during the scan of the HTML, and - is the appropriate file extension for the type of image being -converted. - -See the option B<--prefix=STRING> for details of file name generation. - -=head1 DIAGNOSTICS - -TBA - -=head1 CONFIGURATION AND ENVIRONMENT - -TBA - -=head1 DEPENDENCIES - - Data::Dumper - File::Copy - Getopt::Long - HTML::TreeBuilder - MIME::Types - Path::Tiny - Pod::Usage - URI - -=head1 BUGS AND LIMITATIONS - -There are no known bugs in this module. -Please report problems to Dave Morriss (Dave.Morriss@gmail.com) -Patches are welcome. - -=head1 AUTHOR - -Dave Morriss (Dave.Morriss@gmail.com) - -=head1 LICENCE AND COPYRIGHT - -Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com). -All rights reserved. - -This module is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. See perldoc perlartistic. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - -=cut - -#}}} - -# [zo to open fold, zc to close] - -# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker - diff --git a/Show_Submission/extract_images b/Show_Submission/extract_images index 757e217..f542ba0 100755 --- a/Show_Submission/extract_images +++ b/Show_Submission/extract_images @@ -19,9 +19,9 @@ # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.2 +# VERSION: 0.0.3 # CREATED: 2024-12-25 10:53:15 -# REVISION: 2024-12-30 09:28:40 +# REVISION: 2024-12-30 11:52:59 # #=============================================================================== @@ -48,7 +48,7 @@ use Data::Dumper; # # Version number (Incremented by Vim) # -our $VERSION = '0.0.2'; +our $VERSION = '0.0.3'; # # Script and directory names @@ -242,8 +242,9 @@ foreach my $notesfile (@ARGV) { # the directory the HTML is in. # $fcount++; + # ${fileprefix}_${prefix}_${increment}.${extension} $filename - = "$dirname/${prefix}_${basename}_${fcount}${suffix}"; + = "$dirname/${basename}_${prefix}_${fcount}${suffix}"; say "Writing to: $filename" unless $silent; say '-' x 40 unless $silent; @@ -473,7 +474,7 @@ extract_images - extract embedded images from HTML and save as files =head1 VERSION -This documentation refers to extract_images version 0.0.2 +This documentation refers to extract_images version 0.0.3 =head1 USAGE