2024-12-29 16:33:52 +00:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
#===============================================================================
|
|
|
|
#
|
|
|
|
# FILE: extract_images
|
|
|
|
#
|
|
|
|
# USAGE: ./extract_images [--help] [--documentation|man]
|
|
|
|
# [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent]
|
|
|
|
# HTML_file [ [HTML_file_2] [HTML_file_3] ... ]
|
|
|
|
#
|
|
|
|
# DESCRIPTION: Processes HTML files which may have 'data' URIs containing
|
|
|
|
# images, and extracts these images into files in the same
|
|
|
|
# directory. The 'data' scheme links are converted to 'https'
|
|
|
|
# and reference the extracted files. The modified HTML is
|
|
|
|
# output, and the original will be saved as a backup if
|
|
|
|
# requested.
|
|
|
|
#
|
|
|
|
# OPTIONS: ---
|
|
|
|
# REQUIREMENTS: ---
|
|
|
|
# BUGS: ---
|
|
|
|
# NOTES: ---
|
|
|
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
2024-12-31 17:32:27 +00:00
|
|
|
# VERSION: 0.0.3
|
2024-12-29 16:33:52 +00:00
|
|
|
# CREATED: 2024-12-25 10:53:15
|
2024-12-31 17:32:27 +00:00
|
|
|
# REVISION: 2024-12-30 11:52:59
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
|
|
|
#===============================================================================
|
|
|
|
|
|
|
|
use v5.36;
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
2024-12-30 12:15:04 +00:00
|
|
|
use feature qw{ say state try };
|
|
|
|
no warnings qw{ experimental::try };
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
|
|
|
|
|
|
|
|
use Getopt::Long;
|
|
|
|
use Pod::Usage;
|
|
|
|
|
|
|
|
use HTML::TreeBuilder 5 -weak;
|
|
|
|
use URI;
|
|
|
|
use MIME::Types;
|
|
|
|
|
|
|
|
use Path::Tiny;
|
|
|
|
use File::Copy;
|
|
|
|
|
|
|
|
use Data::Dumper;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Version number (Incremented by Vim)
|
|
|
|
#
|
2024-12-31 17:32:27 +00:00
|
|
|
our $VERSION = '0.0.3';
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Script and directory names
|
|
|
|
#
|
|
|
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Declarations
|
|
|
|
#-------------------------------------------------------------------------------
|
2024-12-30 12:15:04 +00:00
|
|
|
my ( $notes, $typename, $uri );
|
|
|
|
my ( $fcount, $basename, $filename, $suffix, $fh );
|
|
|
|
my ( $updates, $bsuffix, $newURL );
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
my $backupcount = 5;
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Options and arguments
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# {{{
|
|
|
|
#
|
|
|
|
# Option defaults
|
|
|
|
#
|
|
|
|
my $DEFDEBUG = 0;
|
|
|
|
my $DEFPREFIX = 'image';
|
|
|
|
|
|
|
|
my %options;
|
|
|
|
Options( \%options );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Default help shows minimal information
|
|
|
|
#
|
|
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
|
|
|
|
if ( $options{'help'} );
|
|
|
|
|
|
|
|
#
|
|
|
|
# The -documentation or -man option shows the full POD documentation through
|
|
|
|
# a pager for convenience
|
|
|
|
#
|
|
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
|
|
|
|
if ( $options{'documentation'} );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Collect options
|
|
|
|
#
|
|
|
|
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEFDEBUG );
|
|
|
|
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
|
|
|
|
my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX );
|
|
|
|
my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 );
|
|
|
|
my $force = ( defined( $options{force} ) ? $options{force} : 0 );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check we have arguments
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
pod2usage(
|
|
|
|
-msg => "Missing arguments. One or more HTML file names are needed\n",
|
|
|
|
-exitval => 1,
|
|
|
|
-verbose => 0
|
|
|
|
)
|
|
|
|
unless ( scalar(@ARGV) > 0 );
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Clean up the prefix
|
|
|
|
#
|
|
|
|
$prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Backup suffix (the dot gets added later)
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
$bsuffix = 'bak';
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Debug the options
|
|
|
|
#
|
|
|
|
_debug(
|
|
|
|
$DEBUG > 1,
|
|
|
|
'$silent = ' . $silent,
|
|
|
|
'$prefix = ' . $prefix,
|
|
|
|
'$backup = ' . $backup,
|
|
|
|
'$force = ' . $force,
|
|
|
|
);
|
|
|
|
|
|
|
|
# }}}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
2024-12-30 12:15:04 +00:00
|
|
|
# Prepare a MIME::Types object
|
2024-12-29 16:33:52 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
my $mt = MIME::Types->new;
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Loop through the arguments
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
foreach my $notesfile (@ARGV) {
|
2024-12-30 12:15:04 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Get the absolute path of the argument
|
|
|
|
#
|
|
|
|
my $abs_nf = path($notesfile)->absolute;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Get the 'dirname' from the absolute path
|
|
|
|
#
|
|
|
|
my $dirname = path($abs_nf)->parent->stringify;
|
|
|
|
|
|
|
|
unless (-e $abs_nf) {
|
2024-12-29 16:33:52 +00:00
|
|
|
warn "Unable to find $notesfile\n";
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Force the MIME type of $notesfile to a string
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
$typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' );
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Check the MIME type and reject non-HTML
|
|
|
|
#
|
|
|
|
unless ($typename eq 'text/html') {
|
2024-12-30 12:15:04 +00:00
|
|
|
warn "File $abs_nf is not HTML\n";
|
2024-12-29 16:33:52 +00:00
|
|
|
next
|
|
|
|
}
|
|
|
|
|
2024-12-30 12:15:04 +00:00
|
|
|
say "Reading from $abs_nf\n" unless $silent;
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
# Get HTML file basename (no path) without the suffix for building filenames
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
$basename = path($abs_nf)->basename('.html');
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Read the HTML
|
|
|
|
#
|
|
|
|
open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n";
|
|
|
|
$notes = <$nfh>;
|
|
|
|
close($nfh);
|
|
|
|
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
# Image files are to have an index/sequence number
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
|
|
|
$fcount = 0;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Keep a note of HTML updates
|
|
|
|
#
|
|
|
|
$updates = 0;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Initialise the TreeBuilder
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
my $tree = HTML::TreeBuilder->new;
|
2024-12-29 16:33:52 +00:00
|
|
|
$tree->ignore_unknown(0);
|
|
|
|
$tree->no_expand_entities(1);
|
|
|
|
$tree->p_strict(1);
|
|
|
|
$tree->store_comments(1);
|
|
|
|
$tree->warn(1);
|
|
|
|
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
# Load the HTML obtained from the file into the TreeBuilder
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
|
|
|
$tree->parse_content($notes)
|
|
|
|
or die "HTML::TreeBuilder failed to parse notes: $!\n";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Loop through the tree looking for 'data' scheme images
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
for ( @{ $tree->extract_links('img') } ) {
|
2024-12-29 16:33:52 +00:00
|
|
|
my ( $link, $element, $attr, $tag ) = @$_;
|
|
|
|
|
2024-12-30 12:15:04 +00:00
|
|
|
$uri = URI->new($link);
|
2024-12-29 16:33:52 +00:00
|
|
|
unless ($silent) {
|
2024-12-30 12:15:04 +00:00
|
|
|
say "Scheme: ", $uri->scheme;
|
|
|
|
say "Media type: ", $uri->media_type;
|
2024-12-29 16:33:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# We only care about 'data' scheme stuff - for now anyway, and only
|
|
|
|
# images within this set
|
|
|
|
#
|
|
|
|
if ( $uri->scheme eq 'data' ) {
|
|
|
|
#
|
|
|
|
# Only images
|
|
|
|
#
|
|
|
|
if ( $uri->media_type =~ /^image/ ) {
|
2024-12-30 12:15:04 +00:00
|
|
|
#
|
|
|
|
# Extract the file name suffix from the MIME string, and give it
|
|
|
|
# a leading '.'
|
|
|
|
#
|
2024-12-29 16:33:52 +00:00
|
|
|
( $suffix = $uri->media_type ) =~ s{^.*/}{.};
|
|
|
|
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
# Construct the filename for this image making sure it's in
|
|
|
|
# the directory the HTML is in.
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
|
|
|
$fcount++;
|
2024-12-31 17:32:27 +00:00
|
|
|
# ${fileprefix}_${prefix}_${increment}.${extension}
|
2024-12-29 16:33:52 +00:00
|
|
|
$filename
|
2024-12-31 17:32:27 +00:00
|
|
|
= "$dirname/${basename}_${prefix}_${fcount}${suffix}";
|
2024-12-29 16:33:52 +00:00
|
|
|
say "Writing to: $filename" unless $silent;
|
|
|
|
|
|
|
|
say '-' x 40 unless $silent;
|
|
|
|
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
# If the file exists and --force is not active, warn and skip.
|
|
|
|
# Otherwise write the file.
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
if ( -e $filename && !$force ) {
|
|
|
|
warn "File $filename exists; not overwriting\n";
|
2024-12-29 16:33:52 +00:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
# Write the data to the file in binary format. The URI
|
|
|
|
# module does the conversion so it's already binary.
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
|
|
|
$fh = path($filename)->openw_raw;
|
|
|
|
print $fh $uri->data;
|
|
|
|
close($fh);
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
2024-12-30 12:15:04 +00:00
|
|
|
# Update the HTML with a link to the file we created (or that
|
|
|
|
# was already there from an earlier time).
|
2024-12-29 16:33:52 +00:00
|
|
|
#
|
|
|
|
$updates++;
|
2024-12-30 12:15:04 +00:00
|
|
|
$newURL = path($filename)->basename;
|
2024-12-29 16:33:52 +00:00
|
|
|
$element->attr( $attr, $newURL );
|
|
|
|
}
|
|
|
|
}
|
2024-12-30 12:15:04 +00:00
|
|
|
|
|
|
|
} # extract_links loop
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Output the changed HTML turning what became standalone back into
|
|
|
|
# a <body> fragment.
|
|
|
|
#
|
|
|
|
if ($updates > 0) {
|
|
|
|
my $body = $tree->look_down( _tag => 'body' );
|
|
|
|
( my $result = $body->as_HTML( undef, ' ', {} ) )
|
|
|
|
=~ s{(^<body[^>]*>|</body>$)}{}gi;
|
|
|
|
|
|
|
|
#$notesfile = path($notesfile)->basename;
|
|
|
|
if ($backup) {
|
2024-12-30 12:15:04 +00:00
|
|
|
_backup( $abs_nf, $bsuffix, $backupcount );
|
2024-12-29 16:33:52 +00:00
|
|
|
say "$notesfile backed up" unless $silent;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
say "$notesfile not backed up" unless $silent;
|
|
|
|
}
|
|
|
|
|
|
|
|
$fh = path($notesfile)->openw;
|
|
|
|
say $fh $result;
|
|
|
|
close($fh);
|
|
|
|
|
|
|
|
say "$updates images converted, $notesfile updated" unless $silent;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
say "No images found, no changes made to $notesfile" unless $silent;
|
|
|
|
}
|
|
|
|
|
|
|
|
} # $notesfile loop
|
|
|
|
|
|
|
|
exit;
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: _backup
|
|
|
|
# PURPOSE: Given a file, make a backup of it by appending $suffix, and
|
|
|
|
# handle previous backups
|
|
|
|
# PARAMETERS: $filename path of file to backup
|
|
|
|
# $suffix suffix to use, default 'bck'
|
|
|
|
# $limit number of backups to keep
|
|
|
|
# RETURNS: True or false depending on success
|
|
|
|
# DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't.
|
|
|
|
# Rationalises the $limit to avoid it being 0 or less.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub _backup {
|
|
|
|
my ( $filename, $suffix, $limit ) = @_;
|
|
|
|
|
|
|
|
my ( $backupname, $limitname );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the target file exists
|
|
|
|
#
|
|
|
|
unless ( -e $filename ) {
|
|
|
|
warn "Unable to find $filename to backup\n";
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Handle invalid $limit values
|
|
|
|
#
|
|
|
|
$limit = 1 unless ( $limit >= 1 );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Defaults
|
|
|
|
#
|
|
|
|
$suffix = 'bck' unless $suffix;
|
|
|
|
$limitname = "$filename.$suffix.$limit";
|
|
|
|
$backupname = "$filename.$suffix";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Look for existing backups
|
|
|
|
#
|
|
|
|
if ( -e $backupname ) {
|
|
|
|
|
|
|
|
#
|
|
|
|
# If maximum version exists delete it
|
|
|
|
#
|
|
|
|
if ( -e $limitname ) {
|
|
|
|
unlink($limitname);
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Go backwards through the versions incrementing their version numbers
|
|
|
|
#
|
|
|
|
for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) {
|
|
|
|
if ( -e "$filename.$suffix.$vsn" ) {
|
|
|
|
move( "$filename.$suffix.$vsn",
|
|
|
|
sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Make $filename.bck into $filename.bck.1
|
|
|
|
#
|
|
|
|
move( "$filename.$suffix", "$filename.$suffix.1" );
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Finally save the $filename as $filename.bck
|
|
|
|
#
|
|
|
|
move( $filename, "$filename.$suffix" );
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2024-12-30 12:15:04 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: coalesce
|
|
|
|
# PURPOSE: To find the first defined argument and return it
|
|
|
|
# PARAMETERS: Arbitrary number of arguments
|
|
|
|
# RETURNS: The first defined argument or undef if there are none
|
|
|
|
# DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
|
|
|
|
# returned when doing so might be a problem.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub coalesce {
|
|
|
|
foreach (@_) {
|
|
|
|
return $_ if defined($_);
|
|
|
|
}
|
|
|
|
return undef; ## no critic
|
|
|
|
}
|
|
|
|
|
2024-12-29 16:33:52 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: _debug
|
|
|
|
# PURPOSE: Prints debug reports
|
|
|
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
|
|
|
# $messages... Arbitrary list of messages to print
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION: Outputs messages if $active is true. It removes any trailing
|
|
|
|
# newline from each one and then adds one in the 'print' to the
|
|
|
|
# caller doesn't have to bother. Prepends each message with 'D>'
|
|
|
|
# to show it's a debug message.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: Differs from other functions of the same name
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub _debug {
|
|
|
|
my $active = shift;
|
|
|
|
|
|
|
|
my $message;
|
|
|
|
return unless $active;
|
|
|
|
|
|
|
|
while ($message = shift) {
|
|
|
|
chomp($message);
|
|
|
|
print STDERR "D> $message\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: Options
|
|
|
|
# PURPOSE: Processes command-line options
|
|
|
|
# PARAMETERS: $optref Hash reference to hold the options
|
|
|
|
# RETURNS: Undef
|
|
|
|
# DESCRIPTION: Process the options we want to offer. See the documentation
|
|
|
|
# for details
|
|
|
|
# THROWS: no exceptions
|
|
|
|
# COMMENTS: none
|
|
|
|
# SEE ALSO: n/a
|
|
|
|
#===============================================================================
|
|
|
|
sub Options {
|
|
|
|
my ($optref) = @_;
|
|
|
|
|
|
|
|
my @options = (
|
|
|
|
"help", "documentation|man", "debug=i", "silent!",
|
|
|
|
"prefix=s", "backup!", "force!",
|
|
|
|
# "config=s",
|
|
|
|
);
|
|
|
|
|
|
|
|
if ( !GetOptions( $optref, @options ) ) {
|
|
|
|
pod2usage(
|
|
|
|
-msg => "$PROG version $VERSION\n",
|
|
|
|
-exitval => 1,
|
|
|
|
-verbose => 0
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
__END__
|
|
|
|
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
# Application Documentation
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
#{{{
|
|
|
|
|
|
|
|
=head1 NAME
|
|
|
|
|
|
|
|
extract_images - extract embedded images from HTML and save as files
|
|
|
|
|
|
|
|
=head1 VERSION
|
|
|
|
|
2024-12-31 17:32:27 +00:00
|
|
|
This documentation refers to extract_images version 0.0.3
|
2024-12-29 16:33:52 +00:00
|
|
|
|
|
|
|
=head1 USAGE
|
|
|
|
|
|
|
|
extract_images
|
|
|
|
[--help] [--documentation|man] [-debug=N]
|
|
|
|
[--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent]
|
|
|
|
HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ]
|
|
|
|
|
|
|
|
Examples:
|
|
|
|
|
|
|
|
extract_images --prefix=picture_ --backup index.html
|
|
|
|
|
|
|
|
extract_images --silent --nobackup shownotes.html
|
|
|
|
|
|
|
|
=head1 REQUIRED ARGUMENTS
|
|
|
|
|
|
|
|
The script requires the names of one or more HTML files which will be scanned
|
|
|
|
for embedded images.
|
|
|
|
|
|
|
|
=head1 OPTIONS
|
|
|
|
|
|
|
|
Note that all on/off options can be written as B<--silent>, B<--nosilent> or
|
|
|
|
B<--no-silent>. A single hyphen can be used at the start or a pair.
|
|
|
|
|
|
|
|
=over 8
|
|
|
|
|
|
|
|
=item B<--help>
|
|
|
|
|
|
|
|
Reports brief information about how to use the script and exits. To see the
|
|
|
|
full documentation use the option B<--documentation> or B<--man>. Alternatively,
|
|
|
|
to generate a PDF version use the I<pod2pdf> tool from
|
|
|
|
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
|
|
|
|
installed with the cpan tool as App::pod2pdf.
|
|
|
|
|
|
|
|
=item B<--documentation> or B<--man>
|
|
|
|
|
|
|
|
Reports full information about how to use the script and exits. Alternatively,
|
|
|
|
to generate a PDF version use the I<pod2pdf> tool from
|
|
|
|
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
|
|
|
|
installed with the cpan tool as App::pod2pdf.
|
|
|
|
|
|
|
|
=item B<--debug=N>
|
|
|
|
|
|
|
|
Run in debug mode at the level specified by I<N>. Possible values are:
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
|
|
|
|
=item B<0>
|
|
|
|
|
|
|
|
No debugging (the default).
|
|
|
|
|
|
|
|
=item B<1>
|
|
|
|
|
|
|
|
TBA
|
|
|
|
|
|
|
|
=item B<2>
|
|
|
|
|
|
|
|
TBA
|
|
|
|
|
|
|
|
=item B<3>
|
|
|
|
|
|
|
|
TBA
|
|
|
|
|
|
|
|
=back
|
|
|
|
|
|
|
|
=item B<--prefix=STRING>
|
|
|
|
|
|
|
|
Since embedded images have no names, when they are written to image files
|
|
|
|
names are generated for them. These names are built from the following
|
|
|
|
components:
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
|
|
|
|
=item B<HTML filename>
|
|
|
|
|
|
|
|
The name of the HTML file without a suffix. So, if the name is 'index.html',
|
|
|
|
the 'index' part will be used.
|
|
|
|
|
|
|
|
=item B<prefix>
|
|
|
|
|
|
|
|
The prefix string provided by this option, or 'image' if not specified.
|
|
|
|
|
|
|
|
=item B<counter>
|
|
|
|
|
|
|
|
The images found in the HTML are counted, starting from 1, and this number is
|
|
|
|
used here.
|
|
|
|
|
|
|
|
=item B<extension>
|
|
|
|
|
|
|
|
The image format extension, taken from the embedded image information.
|
|
|
|
Examples would be 'jpg', 'jpeg' and 'png'.
|
|
|
|
|
|
|
|
=back
|
|
|
|
|
|
|
|
The B<HTML filename>, B<prefix>, B<counter> and B<extension> are joined
|
|
|
|
together to make the file name as follows:
|
|
|
|
|
|
|
|
<HTML filename>_<prefix>_<counter>.<extension>
|
|
|
|
|
|
|
|
So the following run of the script would generate the first file shown below
|
|
|
|
assuming the first embedded picture was in 'PNG' format:
|
|
|
|
|
|
|
|
extract_images --prefix=picture hpr4283.html
|
|
|
|
|
|
|
|
hpr4283_picture_1.png
|
|
|
|
|
|
|
|
=item B<--[no]backup>
|
|
|
|
|
|
|
|
The HTML is modified to leave placeholders referring to any embedded images
|
|
|
|
found as it it processed. The new HTML is written to the original file if
|
|
|
|
B<--nobackup> is specified, or a backup of the original file is made, before
|
|
|
|
writing the new HTML. Making a backup is the default behaviour.
|
|
|
|
|
|
|
|
=item B<--[no]force>
|
|
|
|
|
|
|
|
Images are written to the current directory. If an image already exists, and
|
|
|
|
B<--force> has not been specified, the script will stop processing and exit.
|
|
|
|
The default setting of this option is B<--noforce>.
|
|
|
|
|
|
|
|
=item B<--[no]silent>
|
|
|
|
|
|
|
|
By default the script reports its progress as it processes an HTML file, but
|
|
|
|
this can be turned off by using this option.
|
|
|
|
|
|
|
|
=back
|
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
|
|
|
|
This Perl script B<extract_images> parses HTML looking for embedded images.
|
|
|
|
Such images use a URI with the scheme 'data' followed by image details
|
|
|
|
including the encoded binary contents. See the Wikipedia article
|
|
|
|
https://en.wikipedia.org/wiki/Data_URI_scheme for details.
|
|
|
|
|
|
|
|
When such images are found they are written as files (with generated names),
|
|
|
|
and the HTML is updated with these names replacing the original URI. Further
|
|
|
|
work is likely to be required to build full image links, but the majority of
|
|
|
|
work will have been done.
|
|
|
|
|
|
|
|
The script will not overwrite image files unless the B<--force> option is
|
|
|
|
used, but will overwrite the original HTML file unless B<--backup> is
|
|
|
|
specified.
|
|
|
|
|
|
|
|
By default details of progress are written to standard output, but this can be
|
|
|
|
preveneted by using the B<--silent> option.
|
|
|
|
|
|
|
|
=head2 GENERATED IMAGE FILE NAMES
|
|
|
|
|
|
|
|
These names are built from various elements:
|
|
|
|
|
|
|
|
<HTML filename>_<prefix>_<counter>.<extension>
|
|
|
|
|
|
|
|
Where <HTML filename> is the base name of the HTML input file, <prefix> is the
|
|
|
|
string provided with the B<--prefix=STRING> option (or B<image> by default),
|
|
|
|
<counter> is a count of image files found during the scan of the HTML, and
|
|
|
|
<extension> is the appropriate file extension for the type of image being
|
|
|
|
converted.
|
|
|
|
|
|
|
|
See the option B<--prefix=STRING> for details of file name generation.
|
|
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
|
|
|
|
|
|
|
TBA
|
|
|
|
|
|
|
|
=head1 CONFIGURATION AND ENVIRONMENT
|
|
|
|
|
|
|
|
TBA
|
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
|
|
|
|
Data::Dumper
|
|
|
|
File::Copy
|
|
|
|
Getopt::Long
|
|
|
|
HTML::TreeBuilder
|
|
|
|
MIME::Types
|
|
|
|
Path::Tiny
|
|
|
|
Pod::Usage
|
|
|
|
URI
|
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
|
|
|
|
|
|
|
There are no known bugs in this module.
|
|
|
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
|
|
|
|
Patches are welcome.
|
|
|
|
|
|
|
|
=head1 AUTHOR
|
|
|
|
|
|
|
|
Dave Morriss (Dave.Morriss@gmail.com)
|
|
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT
|
|
|
|
|
|
|
|
Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com).
|
|
|
|
All rights reserved.
|
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or
|
|
|
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
|
|
|
#}}}
|
|
|
|
|
|
|
|
# [zo to open fold, zc to close]
|
|
|
|
|
|
|
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
|
|
|
|