Tidied and enhanced 'extract_images'

Show_Submission/extract_images: removed unwanted modules, added POD
    documentation, added 'coalesce' routine, adjusted to use the
    absolute paths to the input files and generated image files. The
    latter are always written to the directory where the HTML resides.
    Corrected logic around overwriting image files and '--force'.

    Still an 'alpha' version subject to more testing.
This commit is contained in:
Dave Morriss
2024-12-30 12:15:04 +00:00
parent 37567bfd16
commit e0c4545295
2 changed files with 732 additions and 48 deletions

View File

@@ -19,18 +19,17 @@
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.1
# VERSION: 0.0.2
# CREATED: 2024-12-25 10:53:15
# REVISION: 2024-12-29 15:19:35
# REVISION: 2024-12-30 09:28:40
#
#===============================================================================
use v5.36;
use strict;
use warnings;
use feature qw{ postderef say signatures state try };
no warnings
qw{ experimental::postderef experimental::signatures experimental::try };
use feature qw{ say state try };
no warnings qw{ experimental::try };
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
@@ -41,7 +40,6 @@ use HTML::TreeBuilder 5 -weak;
use URI;
use MIME::Types;
#use File::Slurper;
use Path::Tiny;
use File::Copy;
@@ -50,7 +48,7 @@ use Data::Dumper;
#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.1';
our $VERSION = '0.0.2';
#
# Script and directory names
@@ -60,9 +58,9 @@ our $VERSION = '0.0.1';
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
my ( $notes, $typename, $tree, $uri );
my ( $fcount, $basename, $filename, $suffix, $fh );
my ( $updates, $bsuffix, $new_basename, $newURL );
my ( $notes, $typename, $uri );
my ( $fcount, $basename, $filename, $suffix, $fh );
my ( $updates, $bsuffix, $newURL );
my $backupcount = 5;
@@ -104,8 +102,12 @@ my $force = ( defined( $options{force} ) ? $options{force} : 0 );
#
# Check we have arguments
#
die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n"
unless ( scalar(@ARGV) > 0 );
pod2usage(
-msg => "Missing arguments. One or more HTML file names are needed\n",
-exitval => 1,
-verbose => 0
)
unless ( scalar(@ARGV) > 0 );
#
# Clean up the prefix
@@ -115,7 +117,7 @@ $prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix);
#
# Backup suffix (the dot gets added later)
#
$bsuffix = 'bck';
$bsuffix = 'bak';
#
# Debug the options
@@ -131,18 +133,26 @@ _debug(
# }}}
#-------------------------------------------------------------------------------
# Prepare items for later use: current working directory and a MIME::Types
# object
# Prepare a MIME::Types object
#-------------------------------------------------------------------------------
my $curdir = Path::Tiny->cwd;
my $mt = MIME::Types->new;
#-------------------------------------------------------------------------------
# Loop through the arguments
#-------------------------------------------------------------------------------
foreach my $notesfile (@ARGV) {
unless (-e $notesfile) {
#
# Get the absolute path of the argument
#
my $abs_nf = path($notesfile)->absolute;
#
# Get the 'dirname' from the absolute path
#
my $dirname = path($abs_nf)->parent->stringify;
unless (-e $abs_nf) {
warn "Unable to find $notesfile\n";
next;
}
@@ -150,22 +160,22 @@ foreach my $notesfile (@ARGV) {
#
# Force the MIME type of $notesfile to a string
#
$typename = "" . $mt->mimeTypeOf("$notesfile");
$typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' );
#
# Check the MIME type and reject non-HTML
#
unless ($typename eq 'text/html') {
warn "File $notesfile is not HTML\n";
warn "File $abs_nf is not HTML\n";
next
}
say "Reading from $notesfile\n" unless $silent;
say "Reading from $abs_nf\n" unless $silent;
#
# Get HTML file basename without the suffix for building filenames
# Get HTML file basename (no path) without the suffix for building filenames
#
$basename = path($notesfile)->basename('.html');
$basename = path($abs_nf)->basename('.html');
#
# Read the HTML
@@ -175,7 +185,7 @@ foreach my $notesfile (@ARGV) {
close($nfh);
#
# Image files are to have an index
# Image files are to have an index/sequence number
#
$fcount = 0;
@@ -187,7 +197,7 @@ foreach my $notesfile (@ARGV) {
#
# Initialise the TreeBuilder
#
$tree = HTML::TreeBuilder->new;
my $tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->no_expand_entities(1);
$tree->p_strict(1);
@@ -195,7 +205,7 @@ foreach my $notesfile (@ARGV) {
$tree->warn(1);
#
# Load this HTML file into the TreeBuilder
# Load the HTML obtained from the file into the TreeBuilder
#
$tree->parse_content($notes)
or die "HTML::TreeBuilder failed to parse notes: $!\n";
@@ -203,13 +213,13 @@ foreach my $notesfile (@ARGV) {
#
# Loop through the tree looking for 'data' scheme images
#
for ( @{ $tree->extract_links( 'img' ) } ) {
for ( @{ $tree->extract_links('img') } ) {
my ( $link, $element, $attr, $tag ) = @$_;
$uri = URI->new( $link );
$uri = URI->new($link);
unless ($silent) {
say "Scheme: ",$uri->scheme;
say "Media type: ",$uri->media_type;
say "Scheme: ", $uri->scheme;
say "Media type: ", $uri->media_type;
}
#
@@ -221,34 +231,34 @@ foreach my $notesfile (@ARGV) {
# Only images
#
if ( $uri->media_type =~ /^image/ ) {
#
# Extract the file name suffix from the MIME string, and give it
# a leading '.'
#
#
# Extract the file name suffix from the MIME string, and give it
# a leading '.'
#
( $suffix = $uri->media_type ) =~ s{^.*/}{.};
#
# Construct the filename for this image
# Construct the filename for this image making sure it's in
# the directory the HTML is in.
#
$fcount++;
$filename
= "$curdir/${prefix}_${basename}_${fcount}${suffix}";
= "$dirname/${prefix}_${basename}_${fcount}${suffix}";
say "Writing to: $filename" unless $silent;
say '-' x 40 unless $silent;
#
# Check if the file exists. Don't clobber it unless --force is active
# If the file exists and --force is not active, warn and skip.
# Otherwise write the file.
#
if ( -e $filename ) {
unless ($force) {
warn "File $filename exists; not overwriting\n";
}
if ( -e $filename && !$force ) {
warn "File $filename exists; not overwriting\n";
}
else {
#
# Write the data to the file in binary format. The URI module
# does the conversion so it's already binary.
# Write the data to the file in binary format. The URI
# module does the conversion so it's already binary.
#
$fh = path($filename)->openw_raw;
print $fh $uri->data;
@@ -256,14 +266,16 @@ foreach my $notesfile (@ARGV) {
}
#
# Update the HTML with a link to the file we created
# Update the HTML with a link to the file we created (or that
# was already there from an earlier time).
#
$updates++;
$newURL = path($filename)->basename;
$newURL = path($filename)->basename;
$element->attr( $attr, $newURL );
}
}
} # extract_links loop
} # extract_links loop
#
# Output the changed HTML turning what became standalone back into
@@ -276,7 +288,7 @@ foreach my $notesfile (@ARGV) {
#$notesfile = path($notesfile)->basename;
if ($backup) {
_backup( $notesfile, $bsuffix, $backupcount );
_backup( $abs_nf, $bsuffix, $backupcount );
say "$notesfile backed up" unless $silent;
}
else {
@@ -373,6 +385,24 @@ sub _backup {
return 1;
}
#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the first defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument or undef if there are none
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
# returned when doing so might be a problem.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub coalesce {
foreach (@_) {
return $_ if defined($_);
}
return undef; ## no critic
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
@@ -443,7 +473,7 @@ extract_images - extract embedded images from HTML and save as files
=head1 VERSION
This documentation refers to extract_images version 0.0.1
This documentation refers to extract_images version 0.0.2
=head1 USAGE