Tidied and enhanced 'extract_images'
Show_Submission/extract_images: removed unwanted modules, added POD documentation, added 'coalesce' routine, adjusted to use the absolute paths to the input files and generated image files. The latter are always written to the directory where the HTML resides. Corrected logic around overwriting image files and '--force'. Still an 'alpha' version subject to more testing.
This commit is contained in:
parent
37567bfd16
commit
e0c4545295
654
37567b_extract_images
Normal file
654
37567b_extract_images
Normal file
@ -0,0 +1,654 @@
|
|||||||
|
#!/usr/bin/env perl
|
||||||
|
#===============================================================================
|
||||||
|
#
|
||||||
|
# FILE: extract_images
|
||||||
|
#
|
||||||
|
# USAGE: ./extract_images [--help] [--documentation|man]
|
||||||
|
# [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent]
|
||||||
|
# HTML_file [ [HTML_file_2] [HTML_file_3] ... ]
|
||||||
|
#
|
||||||
|
# DESCRIPTION: Processes HTML files which may have 'data' URIs containing
|
||||||
|
# images, and extracts these images into files in the same
|
||||||
|
# directory. The 'data' scheme links are converted to 'https'
|
||||||
|
# and reference the extracted files. The modified HTML is
|
||||||
|
# output, and the original will be saved as a backup if
|
||||||
|
# requested.
|
||||||
|
#
|
||||||
|
# OPTIONS: ---
|
||||||
|
# REQUIREMENTS: ---
|
||||||
|
# BUGS: ---
|
||||||
|
# NOTES: ---
|
||||||
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||||||
|
# VERSION: 0.0.1
|
||||||
|
# CREATED: 2024-12-25 10:53:15
|
||||||
|
# REVISION: 2024-12-29 15:19:35
|
||||||
|
#
|
||||||
|
#===============================================================================
|
||||||
|
|
||||||
|
use v5.36;
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
use feature qw{ postderef say signatures state try };
|
||||||
|
no warnings
|
||||||
|
qw{ experimental::postderef experimental::signatures experimental::try };
|
||||||
|
|
||||||
|
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
|
||||||
|
|
||||||
|
use Getopt::Long;
|
||||||
|
use Pod::Usage;
|
||||||
|
|
||||||
|
use HTML::TreeBuilder 5 -weak;
|
||||||
|
use URI;
|
||||||
|
use MIME::Types;
|
||||||
|
|
||||||
|
#use File::Slurper;
|
||||||
|
use Path::Tiny;
|
||||||
|
use File::Copy;
|
||||||
|
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
#
|
||||||
|
# Version number (Incremented by Vim)
|
||||||
|
#
|
||||||
|
our $VERSION = '0.0.1';
|
||||||
|
|
||||||
|
#
|
||||||
|
# Script and directory names
|
||||||
|
#
|
||||||
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
# Declarations
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
my ( $notes, $typename, $tree, $uri );
|
||||||
|
my ( $fcount, $basename, $filename, $suffix, $fh );
|
||||||
|
my ( $updates, $bsuffix, $new_basename, $newURL );
|
||||||
|
|
||||||
|
my $backupcount = 5;
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
# Options and arguments
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
# {{{
|
||||||
|
#
|
||||||
|
# Option defaults
|
||||||
|
#
|
||||||
|
my $DEFDEBUG = 0;
|
||||||
|
my $DEFPREFIX = 'image';
|
||||||
|
|
||||||
|
my %options;
|
||||||
|
Options( \%options );
|
||||||
|
|
||||||
|
#
|
||||||
|
# Default help shows minimal information
|
||||||
|
#
|
||||||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
|
||||||
|
if ( $options{'help'} );
|
||||||
|
|
||||||
|
#
|
||||||
|
# The -documentation or -man option shows the full POD documentation through
|
||||||
|
# a pager for convenience
|
||||||
|
#
|
||||||
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
|
||||||
|
if ( $options{'documentation'} );
|
||||||
|
|
||||||
|
#
|
||||||
|
# Collect options
|
||||||
|
#
|
||||||
|
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
|
||||||
|
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
|
||||||
|
my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX );
|
||||||
|
my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 );
|
||||||
|
my $force = ( defined( $options{force} ) ? $options{force} : 0 );
|
||||||
|
|
||||||
|
#
|
||||||
|
# Check we have arguments
|
||||||
|
#
|
||||||
|
die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n"
|
||||||
|
unless ( scalar(@ARGV) > 0 );
|
||||||
|
|
||||||
|
#
|
||||||
|
# Clean up the prefix
|
||||||
|
#
|
||||||
|
$prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix);
|
||||||
|
|
||||||
|
#
|
||||||
|
# Backup suffix (the dot gets added later)
|
||||||
|
#
|
||||||
|
$bsuffix = 'bck';
|
||||||
|
|
||||||
|
#
|
||||||
|
# Debug the options
|
||||||
|
#
|
||||||
|
_debug(
|
||||||
|
$DEBUG > 1,
|
||||||
|
'$silent = ' . $silent,
|
||||||
|
'$prefix = ' . $prefix,
|
||||||
|
'$backup = ' . $backup,
|
||||||
|
'$force = ' . $force,
|
||||||
|
);
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
# Prepare items for later use: current working directory and a MIME::Types
|
||||||
|
# object
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
my $curdir = Path::Tiny->cwd;
|
||||||
|
|
||||||
|
my $mt = MIME::Types->new;
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
# Loop through the arguments
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
foreach my $notesfile (@ARGV) {
|
||||||
|
unless (-e $notesfile) {
|
||||||
|
warn "Unable to find $notesfile\n";
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# Force the MIME type of $notesfile to a string
|
||||||
|
#
|
||||||
|
$typename = "" . $mt->mimeTypeOf("$notesfile");
|
||||||
|
|
||||||
|
#
|
||||||
|
# Check the MIME type and reject non-HTML
|
||||||
|
#
|
||||||
|
unless ($typename eq 'text/html') {
|
||||||
|
warn "File $notesfile is not HTML\n";
|
||||||
|
next
|
||||||
|
}
|
||||||
|
|
||||||
|
say "Reading from $notesfile\n" unless $silent;
|
||||||
|
|
||||||
|
#
|
||||||
|
# Get HTML file basename without the suffix for building filenames
|
||||||
|
#
|
||||||
|
$basename = path($notesfile)->basename('.html');
|
||||||
|
|
||||||
|
#
|
||||||
|
# Read the HTML
|
||||||
|
#
|
||||||
|
open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n";
|
||||||
|
$notes = <$nfh>;
|
||||||
|
close($nfh);
|
||||||
|
|
||||||
|
#
|
||||||
|
# Image files are to have an index
|
||||||
|
#
|
||||||
|
$fcount = 0;
|
||||||
|
|
||||||
|
#
|
||||||
|
# Keep a note of HTML updates
|
||||||
|
#
|
||||||
|
$updates = 0;
|
||||||
|
|
||||||
|
#
|
||||||
|
# Initialise the TreeBuilder
|
||||||
|
#
|
||||||
|
$tree = HTML::TreeBuilder->new;
|
||||||
|
$tree->ignore_unknown(0);
|
||||||
|
$tree->no_expand_entities(1);
|
||||||
|
$tree->p_strict(1);
|
||||||
|
$tree->store_comments(1);
|
||||||
|
$tree->warn(1);
|
||||||
|
|
||||||
|
#
|
||||||
|
# Load this HTML file into the TreeBuilder
|
||||||
|
#
|
||||||
|
$tree->parse_content($notes)
|
||||||
|
or die "HTML::TreeBuilder failed to parse notes: $!\n";
|
||||||
|
|
||||||
|
#
|
||||||
|
# Loop through the tree looking for 'data' scheme images
|
||||||
|
#
|
||||||
|
for ( @{ $tree->extract_links( 'img' ) } ) {
|
||||||
|
my ( $link, $element, $attr, $tag ) = @$_;
|
||||||
|
|
||||||
|
$uri = URI->new( $link );
|
||||||
|
unless ($silent) {
|
||||||
|
say "Scheme: ",$uri->scheme;
|
||||||
|
say "Media type: ",$uri->media_type;
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# We only care about 'data' scheme stuff - for now anyway, and only
|
||||||
|
# images within this set
|
||||||
|
#
|
||||||
|
if ( $uri->scheme eq 'data' ) {
|
||||||
|
#
|
||||||
|
# Only images
|
||||||
|
#
|
||||||
|
if ( $uri->media_type =~ /^image/ ) {
|
||||||
|
#
|
||||||
|
# Extract the file name suffix from the MIME string, and give it
|
||||||
|
# a leading '.'
|
||||||
|
#
|
||||||
|
( $suffix = $uri->media_type ) =~ s{^.*/}{.};
|
||||||
|
|
||||||
|
#
|
||||||
|
# Construct the filename for this image
|
||||||
|
#
|
||||||
|
$fcount++;
|
||||||
|
$filename
|
||||||
|
= "$curdir/${prefix}_${basename}_${fcount}${suffix}";
|
||||||
|
say "Writing to: $filename" unless $silent;
|
||||||
|
|
||||||
|
say '-' x 40 unless $silent;
|
||||||
|
|
||||||
|
#
|
||||||
|
# Check if the file exists. Don't clobber it unless --force is active
|
||||||
|
#
|
||||||
|
if ( -e $filename ) {
|
||||||
|
unless ($force) {
|
||||||
|
warn "File $filename exists; not overwriting\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
#
|
||||||
|
# Write the data to the file in binary format. The URI module
|
||||||
|
# does the conversion so it's already binary.
|
||||||
|
#
|
||||||
|
$fh = path($filename)->openw_raw;
|
||||||
|
print $fh $uri->data;
|
||||||
|
close($fh);
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# Update the HTML with a link to the file we created
|
||||||
|
#
|
||||||
|
$updates++;
|
||||||
|
$newURL = path($filename)->basename;
|
||||||
|
$element->attr( $attr, $newURL );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} # extract_links loop
|
||||||
|
|
||||||
|
#
|
||||||
|
# Output the changed HTML turning what became standalone back into
|
||||||
|
# a <body> fragment.
|
||||||
|
#
|
||||||
|
if ($updates > 0) {
|
||||||
|
my $body = $tree->look_down( _tag => 'body' );
|
||||||
|
( my $result = $body->as_HTML( undef, ' ', {} ) )
|
||||||
|
=~ s{(^<body[^>]*>|</body>$)}{}gi;
|
||||||
|
|
||||||
|
#$notesfile = path($notesfile)->basename;
|
||||||
|
if ($backup) {
|
||||||
|
_backup( $notesfile, $bsuffix, $backupcount );
|
||||||
|
say "$notesfile backed up" unless $silent;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
say "$notesfile not backed up" unless $silent;
|
||||||
|
}
|
||||||
|
|
||||||
|
$fh = path($notesfile)->openw;
|
||||||
|
say $fh $result;
|
||||||
|
close($fh);
|
||||||
|
|
||||||
|
say "$updates images converted, $notesfile updated" unless $silent;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
say "No images found, no changes made to $notesfile" unless $silent;
|
||||||
|
}
|
||||||
|
|
||||||
|
} # $notesfile loop
|
||||||
|
|
||||||
|
exit;
|
||||||
|
|
||||||
|
#=== FUNCTION ================================================================
|
||||||
|
# NAME: _backup
|
||||||
|
# PURPOSE: Given a file, make a backup of it by appending $suffix, and
|
||||||
|
# handle previous backups
|
||||||
|
# PARAMETERS: $filename path of file to backup
|
||||||
|
# $suffix suffix to use, default 'bck'
|
||||||
|
# $limit number of backups to keep
|
||||||
|
# RETURNS: True or false depending on success
|
||||||
|
# DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't.
|
||||||
|
# Rationalises the $limit to avoid it being 0 or less.
|
||||||
|
# THROWS: No exceptions
|
||||||
|
# COMMENTS: None
|
||||||
|
# SEE ALSO: N/A
|
||||||
|
#===============================================================================
|
||||||
|
sub _backup {
|
||||||
|
my ( $filename, $suffix, $limit ) = @_;
|
||||||
|
|
||||||
|
my ( $backupname, $limitname );
|
||||||
|
|
||||||
|
#
|
||||||
|
# Check the target file exists
|
||||||
|
#
|
||||||
|
unless ( -e $filename ) {
|
||||||
|
warn "Unable to find $filename to backup\n";
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# Handle invalid $limit values
|
||||||
|
#
|
||||||
|
$limit = 1 unless ( $limit >= 1 );
|
||||||
|
|
||||||
|
#
|
||||||
|
# Defaults
|
||||||
|
#
|
||||||
|
$suffix = 'bck' unless $suffix;
|
||||||
|
$limitname = "$filename.$suffix.$limit";
|
||||||
|
$backupname = "$filename.$suffix";
|
||||||
|
|
||||||
|
#
|
||||||
|
# Look for existing backups
|
||||||
|
#
|
||||||
|
if ( -e $backupname ) {
|
||||||
|
|
||||||
|
#
|
||||||
|
# If maximum version exists delete it
|
||||||
|
#
|
||||||
|
if ( -e $limitname ) {
|
||||||
|
unlink($limitname);
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# Go backwards through the versions incrementing their version numbers
|
||||||
|
#
|
||||||
|
for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) {
|
||||||
|
if ( -e "$filename.$suffix.$vsn" ) {
|
||||||
|
move( "$filename.$suffix.$vsn",
|
||||||
|
sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# Make $filename.bck into $filename.bck.1
|
||||||
|
#
|
||||||
|
move( "$filename.$suffix", "$filename.$suffix.1" );
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# Finally save the $filename as $filename.bck
|
||||||
|
#
|
||||||
|
move( $filename, "$filename.$suffix" );
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
#=== FUNCTION ================================================================
|
||||||
|
# NAME: _debug
|
||||||
|
# PURPOSE: Prints debug reports
|
||||||
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
||||||
|
# $messages... Arbitrary list of messages to print
|
||||||
|
# RETURNS: Nothing
|
||||||
|
# DESCRIPTION: Outputs messages if $active is true. It removes any trailing
|
||||||
|
# newline from each one and then adds one in the 'print' to the
|
||||||
|
# caller doesn't have to bother. Prepends each message with 'D>'
|
||||||
|
# to show it's a debug message.
|
||||||
|
# THROWS: No exceptions
|
||||||
|
# COMMENTS: Differs from other functions of the same name
|
||||||
|
# SEE ALSO: N/A
|
||||||
|
#===============================================================================
|
||||||
|
sub _debug {
|
||||||
|
my $active = shift;
|
||||||
|
|
||||||
|
my $message;
|
||||||
|
return unless $active;
|
||||||
|
|
||||||
|
while ($message = shift) {
|
||||||
|
chomp($message);
|
||||||
|
print STDERR "D> $message\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#=== FUNCTION ================================================================
|
||||||
|
# NAME: Options
|
||||||
|
# PURPOSE: Processes command-line options
|
||||||
|
# PARAMETERS: $optref Hash reference to hold the options
|
||||||
|
# RETURNS: Undef
|
||||||
|
# DESCRIPTION: Process the options we want to offer. See the documentation
|
||||||
|
# for details
|
||||||
|
# THROWS: no exceptions
|
||||||
|
# COMMENTS: none
|
||||||
|
# SEE ALSO: n/a
|
||||||
|
#===============================================================================
|
||||||
|
sub Options {
|
||||||
|
my ($optref) = @_;
|
||||||
|
|
||||||
|
my @options = (
|
||||||
|
"help", "documentation|man", "debug=i", "silent!",
|
||||||
|
"prefix=s", "backup!", "force!",
|
||||||
|
# "config=s",
|
||||||
|
);
|
||||||
|
|
||||||
|
if ( !GetOptions( $optref, @options ) ) {
|
||||||
|
pod2usage(
|
||||||
|
-msg => "$PROG version $VERSION\n",
|
||||||
|
-exitval => 1,
|
||||||
|
-verbose => 0
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
__END__
|
||||||
|
|
||||||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
# Application Documentation
|
||||||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
#{{{
|
||||||
|
|
||||||
|
=head1 NAME
|
||||||
|
|
||||||
|
extract_images - extract embedded images from HTML and save as files
|
||||||
|
|
||||||
|
=head1 VERSION
|
||||||
|
|
||||||
|
This documentation refers to extract_images version 0.0.1
|
||||||
|
|
||||||
|
=head1 USAGE
|
||||||
|
|
||||||
|
extract_images
|
||||||
|
[--help] [--documentation|man] [-debug=N]
|
||||||
|
[--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent]
|
||||||
|
HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
extract_images --prefix=picture_ --backup index.html
|
||||||
|
|
||||||
|
extract_images --silent --nobackup shownotes.html
|
||||||
|
|
||||||
|
=head1 REQUIRED ARGUMENTS
|
||||||
|
|
||||||
|
The script requires the names of one or more HTML files which will be scanned
|
||||||
|
for embedded images.
|
||||||
|
|
||||||
|
=head1 OPTIONS
|
||||||
|
|
||||||
|
Note that all on/off options can be written as B<--silent>, B<--nosilent> or
|
||||||
|
B<--no-silent>. A single hyphen can be used at the start or a pair.
|
||||||
|
|
||||||
|
=over 8
|
||||||
|
|
||||||
|
=item B<--help>
|
||||||
|
|
||||||
|
Reports brief information about how to use the script and exits. To see the
|
||||||
|
full documentation use the option B<--documentation> or B<--man>. Alternatively,
|
||||||
|
to generate a PDF version use the I<pod2pdf> tool from
|
||||||
|
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
|
||||||
|
installed with the cpan tool as App::pod2pdf.
|
||||||
|
|
||||||
|
=item B<--documentation> or B<--man>
|
||||||
|
|
||||||
|
Reports full information about how to use the script and exits. Alternatively,
|
||||||
|
to generate a PDF version use the I<pod2pdf> tool from
|
||||||
|
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
|
||||||
|
installed with the cpan tool as App::pod2pdf.
|
||||||
|
|
||||||
|
=item B<--debug=N>
|
||||||
|
|
||||||
|
Run in debug mode at the level specified by I<N>. Possible values are:
|
||||||
|
|
||||||
|
=over 4
|
||||||
|
|
||||||
|
=item B<0>
|
||||||
|
|
||||||
|
No debugging (the default).
|
||||||
|
|
||||||
|
=item B<1>
|
||||||
|
|
||||||
|
TBA
|
||||||
|
|
||||||
|
=item B<2>
|
||||||
|
|
||||||
|
TBA
|
||||||
|
|
||||||
|
=item B<3>
|
||||||
|
|
||||||
|
TBA
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
=item B<--prefix=STRING>
|
||||||
|
|
||||||
|
Since embedded images have no names, when they are written to image files
|
||||||
|
names are generated for them. These names are built from the following
|
||||||
|
components:
|
||||||
|
|
||||||
|
=over 4
|
||||||
|
|
||||||
|
=item B<HTML filename>
|
||||||
|
|
||||||
|
The name of the HTML file without a suffix. So, if the name is 'index.html',
|
||||||
|
the 'index' part will be used.
|
||||||
|
|
||||||
|
=item B<prefix>
|
||||||
|
|
||||||
|
The prefix string provided by this option, or 'image' if not specified.
|
||||||
|
|
||||||
|
=item B<counter>
|
||||||
|
|
||||||
|
The images found in the HTML are counted, starting from 1, and this number is
|
||||||
|
used here.
|
||||||
|
|
||||||
|
=item B<extension>
|
||||||
|
|
||||||
|
The image format extension, taken from the embedded image information.
|
||||||
|
Examples would be 'jpg', 'jpeg' and 'png'.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
The B<HTML filename>, B<prefix>, B<counter> and B<extension> are joined
|
||||||
|
together to make the file name as follows:
|
||||||
|
|
||||||
|
<HTML filename>_<prefix>_<counter>.<extension>
|
||||||
|
|
||||||
|
So the following run of the script would generate the first file shown below
|
||||||
|
assuming the first embedded picture was in 'PNG' format:
|
||||||
|
|
||||||
|
extract_images --prefix=picture hpr4283.html
|
||||||
|
|
||||||
|
hpr4283_picture_1.png
|
||||||
|
|
||||||
|
=item B<--[no]backup>
|
||||||
|
|
||||||
|
The HTML is modified to leave placeholders referring to any embedded images
|
||||||
|
found as it it processed. The new HTML is written to the original file if
|
||||||
|
B<--nobackup> is specified, or a backup of the original file is made, before
|
||||||
|
writing the new HTML. Making a backup is the default behaviour.
|
||||||
|
|
||||||
|
=item B<--[no]force>
|
||||||
|
|
||||||
|
Images are written to the current directory. If an image already exists, and
|
||||||
|
B<--force> has not been specified, the script will stop processing and exit.
|
||||||
|
The default setting of this option is B<--noforce>.
|
||||||
|
|
||||||
|
=item B<--[no]silent>
|
||||||
|
|
||||||
|
By default the script reports its progress as it processes an HTML file, but
|
||||||
|
this can be turned off by using this option.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
|
This Perl script B<extract_images> parses HTML looking for embedded images.
|
||||||
|
Such images use a URI with the scheme 'data' followed by image details
|
||||||
|
including the encoded binary contents. See the Wikipedia article
|
||||||
|
https://en.wikipedia.org/wiki/Data_URI_scheme for details.
|
||||||
|
|
||||||
|
When such images are found they are written as files (with generated names),
|
||||||
|
and the HTML is updated with these names replacing the original URI. Further
|
||||||
|
work is likely to be required to build full image links, but the majority of
|
||||||
|
work will have been done.
|
||||||
|
|
||||||
|
The script will not overwrite image files unless the B<--force> option is
|
||||||
|
used, but will overwrite the original HTML file unless B<--backup> is
|
||||||
|
specified.
|
||||||
|
|
||||||
|
By default details of progress are written to standard output, but this can be
|
||||||
|
preveneted by using the B<--silent> option.
|
||||||
|
|
||||||
|
=head2 GENERATED IMAGE FILE NAMES
|
||||||
|
|
||||||
|
These names are built from various elements:
|
||||||
|
|
||||||
|
<HTML filename>_<prefix>_<counter>.<extension>
|
||||||
|
|
||||||
|
Where <HTML filename> is the base name of the HTML input file, <prefix> is the
|
||||||
|
string provided with the B<--prefix=STRING> option (or B<image> by default),
|
||||||
|
<counter> is a count of image files found during the scan of the HTML, and
|
||||||
|
<extension> is the appropriate file extension for the type of image being
|
||||||
|
converted.
|
||||||
|
|
||||||
|
See the option B<--prefix=STRING> for details of file name generation.
|
||||||
|
|
||||||
|
=head1 DIAGNOSTICS
|
||||||
|
|
||||||
|
TBA
|
||||||
|
|
||||||
|
=head1 CONFIGURATION AND ENVIRONMENT
|
||||||
|
|
||||||
|
TBA
|
||||||
|
|
||||||
|
=head1 DEPENDENCIES
|
||||||
|
|
||||||
|
Data::Dumper
|
||||||
|
File::Copy
|
||||||
|
Getopt::Long
|
||||||
|
HTML::TreeBuilder
|
||||||
|
MIME::Types
|
||||||
|
Path::Tiny
|
||||||
|
Pod::Usage
|
||||||
|
URI
|
||||||
|
|
||||||
|
=head1 BUGS AND LIMITATIONS
|
||||||
|
|
||||||
|
There are no known bugs in this module.
|
||||||
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
|
||||||
|
Patches are welcome.
|
||||||
|
|
||||||
|
=head1 AUTHOR
|
||||||
|
|
||||||
|
Dave Morriss (Dave.Morriss@gmail.com)
|
||||||
|
|
||||||
|
=head1 LICENCE AND COPYRIGHT
|
||||||
|
|
||||||
|
Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com).
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
This module is free software; you can redistribute it and/or
|
||||||
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
|
||||||
|
=cut
|
||||||
|
|
||||||
|
#}}}
|
||||||
|
|
||||||
|
# [zo to open fold, zc to close]
|
||||||
|
|
||||||
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
||||||
|
|
@ -19,18 +19,17 @@
|
|||||||
# BUGS: ---
|
# BUGS: ---
|
||||||
# NOTES: ---
|
# NOTES: ---
|
||||||
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||||||
# VERSION: 0.0.1
|
# VERSION: 0.0.2
|
||||||
# CREATED: 2024-12-25 10:53:15
|
# CREATED: 2024-12-25 10:53:15
|
||||||
# REVISION: 2024-12-29 15:19:35
|
# REVISION: 2024-12-30 09:28:40
|
||||||
#
|
#
|
||||||
#===============================================================================
|
#===============================================================================
|
||||||
|
|
||||||
use v5.36;
|
use v5.36;
|
||||||
use strict;
|
use strict;
|
||||||
use warnings;
|
use warnings;
|
||||||
use feature qw{ postderef say signatures state try };
|
use feature qw{ say state try };
|
||||||
no warnings
|
no warnings qw{ experimental::try };
|
||||||
qw{ experimental::postderef experimental::signatures experimental::try };
|
|
||||||
|
|
||||||
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
|
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
|
||||||
|
|
||||||
@ -41,7 +40,6 @@ use HTML::TreeBuilder 5 -weak;
|
|||||||
use URI;
|
use URI;
|
||||||
use MIME::Types;
|
use MIME::Types;
|
||||||
|
|
||||||
#use File::Slurper;
|
|
||||||
use Path::Tiny;
|
use Path::Tiny;
|
||||||
use File::Copy;
|
use File::Copy;
|
||||||
|
|
||||||
@ -50,7 +48,7 @@ use Data::Dumper;
|
|||||||
#
|
#
|
||||||
# Version number (Incremented by Vim)
|
# Version number (Incremented by Vim)
|
||||||
#
|
#
|
||||||
our $VERSION = '0.0.1';
|
our $VERSION = '0.0.2';
|
||||||
|
|
||||||
#
|
#
|
||||||
# Script and directory names
|
# Script and directory names
|
||||||
@ -60,9 +58,9 @@ our $VERSION = '0.0.1';
|
|||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
# Declarations
|
# Declarations
|
||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
my ( $notes, $typename, $tree, $uri );
|
my ( $notes, $typename, $uri );
|
||||||
my ( $fcount, $basename, $filename, $suffix, $fh );
|
my ( $fcount, $basename, $filename, $suffix, $fh );
|
||||||
my ( $updates, $bsuffix, $new_basename, $newURL );
|
my ( $updates, $bsuffix, $newURL );
|
||||||
|
|
||||||
my $backupcount = 5;
|
my $backupcount = 5;
|
||||||
|
|
||||||
@ -104,8 +102,12 @@ my $force = ( defined( $options{force} ) ? $options{force} : 0 );
|
|||||||
#
|
#
|
||||||
# Check we have arguments
|
# Check we have arguments
|
||||||
#
|
#
|
||||||
die "Usage: $PROG [options] file1 [file2 [file3 [fileN]]]\n"
|
pod2usage(
|
||||||
unless ( scalar(@ARGV) > 0 );
|
-msg => "Missing arguments. One or more HTML file names are needed\n",
|
||||||
|
-exitval => 1,
|
||||||
|
-verbose => 0
|
||||||
|
)
|
||||||
|
unless ( scalar(@ARGV) > 0 );
|
||||||
|
|
||||||
#
|
#
|
||||||
# Clean up the prefix
|
# Clean up the prefix
|
||||||
@ -115,7 +117,7 @@ $prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix);
|
|||||||
#
|
#
|
||||||
# Backup suffix (the dot gets added later)
|
# Backup suffix (the dot gets added later)
|
||||||
#
|
#
|
||||||
$bsuffix = 'bck';
|
$bsuffix = 'bak';
|
||||||
|
|
||||||
#
|
#
|
||||||
# Debug the options
|
# Debug the options
|
||||||
@ -131,18 +133,26 @@ _debug(
|
|||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
# Prepare items for later use: current working directory and a MIME::Types
|
# Prepare a MIME::Types object
|
||||||
# object
|
|
||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
my $curdir = Path::Tiny->cwd;
|
|
||||||
|
|
||||||
my $mt = MIME::Types->new;
|
my $mt = MIME::Types->new;
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
# Loop through the arguments
|
# Loop through the arguments
|
||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
foreach my $notesfile (@ARGV) {
|
foreach my $notesfile (@ARGV) {
|
||||||
unless (-e $notesfile) {
|
|
||||||
|
#
|
||||||
|
# Get the absolute path of the argument
|
||||||
|
#
|
||||||
|
my $abs_nf = path($notesfile)->absolute;
|
||||||
|
|
||||||
|
#
|
||||||
|
# Get the 'dirname' from the absolute path
|
||||||
|
#
|
||||||
|
my $dirname = path($abs_nf)->parent->stringify;
|
||||||
|
|
||||||
|
unless (-e $abs_nf) {
|
||||||
warn "Unable to find $notesfile\n";
|
warn "Unable to find $notesfile\n";
|
||||||
next;
|
next;
|
||||||
}
|
}
|
||||||
@ -150,22 +160,22 @@ foreach my $notesfile (@ARGV) {
|
|||||||
#
|
#
|
||||||
# Force the MIME type of $notesfile to a string
|
# Force the MIME type of $notesfile to a string
|
||||||
#
|
#
|
||||||
$typename = "" . $mt->mimeTypeOf("$notesfile");
|
$typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' );
|
||||||
|
|
||||||
#
|
#
|
||||||
# Check the MIME type and reject non-HTML
|
# Check the MIME type and reject non-HTML
|
||||||
#
|
#
|
||||||
unless ($typename eq 'text/html') {
|
unless ($typename eq 'text/html') {
|
||||||
warn "File $notesfile is not HTML\n";
|
warn "File $abs_nf is not HTML\n";
|
||||||
next
|
next
|
||||||
}
|
}
|
||||||
|
|
||||||
say "Reading from $notesfile\n" unless $silent;
|
say "Reading from $abs_nf\n" unless $silent;
|
||||||
|
|
||||||
#
|
#
|
||||||
# Get HTML file basename without the suffix for building filenames
|
# Get HTML file basename (no path) without the suffix for building filenames
|
||||||
#
|
#
|
||||||
$basename = path($notesfile)->basename('.html');
|
$basename = path($abs_nf)->basename('.html');
|
||||||
|
|
||||||
#
|
#
|
||||||
# Read the HTML
|
# Read the HTML
|
||||||
@ -175,7 +185,7 @@ foreach my $notesfile (@ARGV) {
|
|||||||
close($nfh);
|
close($nfh);
|
||||||
|
|
||||||
#
|
#
|
||||||
# Image files are to have an index
|
# Image files are to have an index/sequence number
|
||||||
#
|
#
|
||||||
$fcount = 0;
|
$fcount = 0;
|
||||||
|
|
||||||
@ -187,7 +197,7 @@ foreach my $notesfile (@ARGV) {
|
|||||||
#
|
#
|
||||||
# Initialise the TreeBuilder
|
# Initialise the TreeBuilder
|
||||||
#
|
#
|
||||||
$tree = HTML::TreeBuilder->new;
|
my $tree = HTML::TreeBuilder->new;
|
||||||
$tree->ignore_unknown(0);
|
$tree->ignore_unknown(0);
|
||||||
$tree->no_expand_entities(1);
|
$tree->no_expand_entities(1);
|
||||||
$tree->p_strict(1);
|
$tree->p_strict(1);
|
||||||
@ -195,7 +205,7 @@ foreach my $notesfile (@ARGV) {
|
|||||||
$tree->warn(1);
|
$tree->warn(1);
|
||||||
|
|
||||||
#
|
#
|
||||||
# Load this HTML file into the TreeBuilder
|
# Load the HTML obtained from the file into the TreeBuilder
|
||||||
#
|
#
|
||||||
$tree->parse_content($notes)
|
$tree->parse_content($notes)
|
||||||
or die "HTML::TreeBuilder failed to parse notes: $!\n";
|
or die "HTML::TreeBuilder failed to parse notes: $!\n";
|
||||||
@ -203,13 +213,13 @@ foreach my $notesfile (@ARGV) {
|
|||||||
#
|
#
|
||||||
# Loop through the tree looking for 'data' scheme images
|
# Loop through the tree looking for 'data' scheme images
|
||||||
#
|
#
|
||||||
for ( @{ $tree->extract_links( 'img' ) } ) {
|
for ( @{ $tree->extract_links('img') } ) {
|
||||||
my ( $link, $element, $attr, $tag ) = @$_;
|
my ( $link, $element, $attr, $tag ) = @$_;
|
||||||
|
|
||||||
$uri = URI->new( $link );
|
$uri = URI->new($link);
|
||||||
unless ($silent) {
|
unless ($silent) {
|
||||||
say "Scheme: ",$uri->scheme;
|
say "Scheme: ", $uri->scheme;
|
||||||
say "Media type: ",$uri->media_type;
|
say "Media type: ", $uri->media_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -221,34 +231,34 @@ foreach my $notesfile (@ARGV) {
|
|||||||
# Only images
|
# Only images
|
||||||
#
|
#
|
||||||
if ( $uri->media_type =~ /^image/ ) {
|
if ( $uri->media_type =~ /^image/ ) {
|
||||||
#
|
#
|
||||||
# Extract the file name suffix from the MIME string, and give it
|
# Extract the file name suffix from the MIME string, and give it
|
||||||
# a leading '.'
|
# a leading '.'
|
||||||
#
|
#
|
||||||
( $suffix = $uri->media_type ) =~ s{^.*/}{.};
|
( $suffix = $uri->media_type ) =~ s{^.*/}{.};
|
||||||
|
|
||||||
#
|
#
|
||||||
# Construct the filename for this image
|
# Construct the filename for this image making sure it's in
|
||||||
|
# the directory the HTML is in.
|
||||||
#
|
#
|
||||||
$fcount++;
|
$fcount++;
|
||||||
$filename
|
$filename
|
||||||
= "$curdir/${prefix}_${basename}_${fcount}${suffix}";
|
= "$dirname/${prefix}_${basename}_${fcount}${suffix}";
|
||||||
say "Writing to: $filename" unless $silent;
|
say "Writing to: $filename" unless $silent;
|
||||||
|
|
||||||
say '-' x 40 unless $silent;
|
say '-' x 40 unless $silent;
|
||||||
|
|
||||||
#
|
#
|
||||||
# Check if the file exists. Don't clobber it unless --force is active
|
# If the file exists and --force is not active, warn and skip.
|
||||||
|
# Otherwise write the file.
|
||||||
#
|
#
|
||||||
if ( -e $filename ) {
|
if ( -e $filename && !$force ) {
|
||||||
unless ($force) {
|
warn "File $filename exists; not overwriting\n";
|
||||||
warn "File $filename exists; not overwriting\n";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
#
|
#
|
||||||
# Write the data to the file in binary format. The URI module
|
# Write the data to the file in binary format. The URI
|
||||||
# does the conversion so it's already binary.
|
# module does the conversion so it's already binary.
|
||||||
#
|
#
|
||||||
$fh = path($filename)->openw_raw;
|
$fh = path($filename)->openw_raw;
|
||||||
print $fh $uri->data;
|
print $fh $uri->data;
|
||||||
@ -256,14 +266,16 @@ foreach my $notesfile (@ARGV) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
# Update the HTML with a link to the file we created
|
# Update the HTML with a link to the file we created (or that
|
||||||
|
# was already there from an earlier time).
|
||||||
#
|
#
|
||||||
$updates++;
|
$updates++;
|
||||||
$newURL = path($filename)->basename;
|
$newURL = path($filename)->basename;
|
||||||
$element->attr( $attr, $newURL );
|
$element->attr( $attr, $newURL );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} # extract_links loop
|
|
||||||
|
} # extract_links loop
|
||||||
|
|
||||||
#
|
#
|
||||||
# Output the changed HTML turning what became standalone back into
|
# Output the changed HTML turning what became standalone back into
|
||||||
@ -276,7 +288,7 @@ foreach my $notesfile (@ARGV) {
|
|||||||
|
|
||||||
#$notesfile = path($notesfile)->basename;
|
#$notesfile = path($notesfile)->basename;
|
||||||
if ($backup) {
|
if ($backup) {
|
||||||
_backup( $notesfile, $bsuffix, $backupcount );
|
_backup( $abs_nf, $bsuffix, $backupcount );
|
||||||
say "$notesfile backed up" unless $silent;
|
say "$notesfile backed up" unless $silent;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -373,6 +385,24 @@ sub _backup {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#=== FUNCTION ================================================================
|
||||||
|
# NAME: coalesce
|
||||||
|
# PURPOSE: To find the first defined argument and return it
|
||||||
|
# PARAMETERS: Arbitrary number of arguments
|
||||||
|
# RETURNS: The first defined argument or undef if there are none
|
||||||
|
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
|
||||||
|
# returned when doing so might be a problem.
|
||||||
|
# THROWS: No exceptions
|
||||||
|
# COMMENTS: None
|
||||||
|
# SEE ALSO: N/A
|
||||||
|
#===============================================================================
|
||||||
|
sub coalesce {
|
||||||
|
foreach (@_) {
|
||||||
|
return $_ if defined($_);
|
||||||
|
}
|
||||||
|
return undef; ## no critic
|
||||||
|
}
|
||||||
|
|
||||||
#=== FUNCTION ================================================================
|
#=== FUNCTION ================================================================
|
||||||
# NAME: _debug
|
# NAME: _debug
|
||||||
# PURPOSE: Prints debug reports
|
# PURPOSE: Prints debug reports
|
||||||
@ -443,7 +473,7 @@ extract_images - extract embedded images from HTML and save as files
|
|||||||
|
|
||||||
=head1 VERSION
|
=head1 VERSION
|
||||||
|
|
||||||
This documentation refers to extract_images version 0.0.1
|
This documentation refers to extract_images version 0.0.2
|
||||||
|
|
||||||
=head1 USAGE
|
=head1 USAGE
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user