#!/usr/bin/env perl
#===============================================================================
#
#         FILE: extract_images
#
#        USAGE: ./extract_images [--help] [--documentation|man]
#               [--prefix=STRING] [--[no-]backup] [--force] [--[no]silent]
#               HTML_file [ [HTML_file_2] [HTML_file_3] ... ]
#
#  DESCRIPTION: Processes HTML files which may have 'data' URIs containing
#               images, and extracts these images into files in the same
#               directory.  The 'data' scheme links are converted to 'https'
#               and reference the extracted files. The modified HTML is
#               output, and the original will be saved as a backup if
#               requested.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.3
#      CREATED: 2024-12-25 10:53:15
#     REVISION: 2024-12-30 11:52:59
#
#===============================================================================

use v5.36;
use strict;
use warnings;
use feature qw{ say state try };
no warnings qw{ experimental::try };

use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8

use Getopt::Long;
use Pod::Usage;

use HTML::TreeBuilder 5 -weak;
use URI;
use MIME::Types;

use Path::Tiny;
use File::Copy;

use Data::Dumper;

#
# Version number (Incremented by Vim)
#
our $VERSION = '0.0.3';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
my ( $notes,   $typename, $uri );
my ( $fcount,  $basename, $filename, $suffix, $fh );
my ( $updates, $bsuffix,  $newURL );

my $backupcount = 5;

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
# {{{
#
# Option defaults
#
my $DEFDEBUG = 0;
my $DEFPREFIX = 'image';

my %options;
Options( \%options );

#
# Default help shows minimal information
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# The -documentation or -man option shows the full POD documentation through
# a pager for convenience
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1, -verbose => 2 )
    if ( $options{'documentation'} );

#
# Collect options
#
my $DEBUG  = ( defined( $options{debug} )  ? $options{debug}  : $DEFDEBUG );
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
my $prefix = ( defined( $options{prefix} ) ? $options{prefix} : $DEFPREFIX );
my $backup = ( defined( $options{backup} ) ? $options{backup} : 1 );
my $force  = ( defined( $options{force} )  ? $options{force}  : 0 );

#
# Check we have arguments
#
pod2usage(
    -msg     => "Missing arguments. One or more HTML file names are needed\n",
    -exitval => 1,
    -verbose => 0
    )
unless ( scalar(@ARGV) > 0 );

#
# Clean up the prefix
#
$prefix = ($prefix =~ /(.*)_$/ ? $1 : $prefix);

#
# Backup suffix (the dot gets added later)
#
$bsuffix = 'bak';

#
# Debug the options
#
_debug(
    $DEBUG > 1,
    '$silent = ' . $silent,
    '$prefix = ' . $prefix,
    '$backup = ' . $backup,
    '$force = ' . $force,
);

# }}}

#-------------------------------------------------------------------------------
# Prepare a MIME::Types object
#-------------------------------------------------------------------------------
my $mt = MIME::Types->new;

#-------------------------------------------------------------------------------
# Loop through the arguments
#-------------------------------------------------------------------------------
foreach my $notesfile (@ARGV) {

    #
    # Get the absolute path of the argument
    #
    my $abs_nf = path($notesfile)->absolute;

    #
    # Get the 'dirname' from the absolute path
    #
    my $dirname = path($abs_nf)->parent->stringify;

    unless (-e $abs_nf) {
        warn "Unable to find $notesfile\n";
        next;
    }

    #
    # Force the MIME type of $notesfile to a string
    #
    $typename = "" . coalesce( $mt->mimeTypeOf("$abs_nf"), '' );

    #
    # Check the MIME type and reject non-HTML
    #
    unless ($typename eq 'text/html') {
        warn "File $abs_nf is not HTML\n";
        next
    }

    say "Reading from $abs_nf\n" unless $silent;

    #
    # Get HTML file basename (no path) without the suffix for building filenames
    #
    $basename = path($abs_nf)->basename('.html');

    #
    # Read the HTML
    #
    open (my $nfh, '<', $notesfile) or die "Unable to open $notesfile\n";
    $notes = <$nfh>;
    close($nfh);

    #
    # Image files are to have an index/sequence number
    #
    $fcount = 0;

    #
    # Keep a note of HTML updates
    #
    $updates = 0;

    #
    # Initialise the TreeBuilder
    #
    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->no_expand_entities(1);
    $tree->p_strict(1);
    $tree->store_comments(1);
    $tree->warn(1);

    #
    # Load the HTML obtained from the file into the TreeBuilder
    #
    $tree->parse_content($notes)
        or die "HTML::TreeBuilder failed to parse notes: $!\n";

    #
    # Loop through the tree looking for 'data' scheme images
    #
    for ( @{ $tree->extract_links('img') } ) {
        my ( $link, $element, $attr, $tag ) = @$_;

        $uri = URI->new($link);
        unless ($silent) {
            say "Scheme: ",     $uri->scheme;
            say "Media type: ", $uri->media_type;
        }

        #
        # We only care about 'data' scheme stuff - for now anyway, and only
        # images within this set
        #
        if ( $uri->scheme eq 'data' ) {
            #
            # Only images
            #
            if ( $uri->media_type =~ /^image/ ) {
              #
              # Extract the file name suffix from the MIME string, and give it
              # a leading '.'
              #
                ( $suffix = $uri->media_type ) =~ s{^.*/}{.};

                #
                # Construct the filename for this image making sure it's in
                # the directory the HTML is in.
                #
                $fcount++;
                # ${fileprefix}_${prefix}_${increment}.${extension}
                $filename
                    = "$dirname/${basename}_${prefix}_${fcount}${suffix}";
                say "Writing to: $filename" unless $silent;

                say '-' x 40 unless $silent;

                #
                # If the file exists and --force is not active, warn and skip.
                # Otherwise write the file.
                #
                if ( -e $filename && !$force ) {
                    warn "File $filename exists; not overwriting\n";
                }
                else {
                    #
                    # Write the data to the file in binary format. The URI
                    # module does the conversion so it's already binary.
                    #
                    $fh = path($filename)->openw_raw;
                    print $fh $uri->data;
                    close($fh);
                }

                #
                # Update the HTML with a link to the file we created (or that
                # was already there from an earlier time).
                #
                $updates++;
                $newURL = path($filename)->basename;
                $element->attr( $attr, $newURL );
            }
        }

    }    # extract_links loop

    #
    # Output the changed HTML turning what became standalone back into
    # a <body> fragment.
    #
    if ($updates > 0) {
        my $body = $tree->look_down( _tag => 'body' );
        ( my $result = $body->as_HTML( undef, ' ', {} ) )
            =~ s{(^<body[^>]*>|</body>$)}{}gi;

        #$notesfile = path($notesfile)->basename;
        if ($backup) {
            _backup( $abs_nf, $bsuffix, $backupcount );
            say "$notesfile backed up" unless $silent;
        }
        else {
            say "$notesfile not backed up" unless $silent;
        }

        $fh = path($notesfile)->openw;
        say $fh $result;
        close($fh);

        say "$updates images converted, $notesfile updated" unless $silent;
    }
    else {
        say "No images found, no changes made to $notesfile" unless $silent;
    }

} # $notesfile loop

exit;

#===  FUNCTION  ================================================================
#         NAME: _backup
#      PURPOSE: Given a file, make a backup of it by appending $suffix, and
#               handle previous backups
#   PARAMETERS: $filename               path of file to backup
#               $suffix                 suffix to use, default 'bck'
#               $limit                  number of backups to keep
#      RETURNS: True or false depending on success
#  DESCRIPTION: Checks that the file exists and returns FALSE if it doesn't.
#               Rationalises the $limit to avoid it being 0 or less.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub _backup {
    my ( $filename, $suffix, $limit ) = @_;

    my ( $backupname, $limitname );

    #
    # Check the target file exists
    #
    unless ( -e $filename ) {
        warn "Unable to find $filename to backup\n";
        return 0;
    }

    #
    # Handle invalid $limit values
    #
    $limit = 1 unless ( $limit >= 1 );

    #
    # Defaults
    #
    $suffix     = 'bck' unless $suffix;
    $limitname  = "$filename.$suffix.$limit";
    $backupname = "$filename.$suffix";

    #
    # Look for existing backups
    #
    if ( -e $backupname ) {

        #
        # If maximum version exists delete it
        #
        if ( -e $limitname ) {
            unlink($limitname);
        }

        #
        # Go backwards through the versions incrementing their version numbers
        #
        for ( my $vsn = $limit - 1; $vsn > 0; $vsn-- ) {
            if ( -e "$filename.$suffix.$vsn" ) {
                move( "$filename.$suffix.$vsn",
                    sprintf( '%s.%s.%s', $filename, $suffix, $vsn + 1 ) );
            }
        }

        #
        # Make $filename.bck into $filename.bck.1
        #
        move( "$filename.$suffix", "$filename.$suffix.1" );

    }

    #
    # Finally save the $filename as $filename.bck
    #
    move( $filename, "$filename.$suffix" );

    return 1;
}

#===  FUNCTION  ================================================================
#         NAME: coalesce
#      PURPOSE: To find the first defined argument and return it
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument or undef if there are none
#  DESCRIPTION: Just a simple way of ensuring an 'undef' value is never
#               returned when doing so might be a problem.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    foreach (@_) {
        return $_ if defined($_);
    }
    return undef;    ## no critic
}

#===  FUNCTION  ================================================================
#         NAME: _debug
#      PURPOSE: Prints debug reports
#   PARAMETERS: $active         Boolean: 1 for print, 0 for no print
#               $messages...    Arbitrary list of messages to print
#      RETURNS: Nothing
#  DESCRIPTION: Outputs messages if $active is true. It removes any trailing
#               newline from each one and then adds one in the 'print' to the
#               caller doesn't have to bother. Prepends each message with 'D>'
#               to show it's a debug message.
#       THROWS: No exceptions
#     COMMENTS: Differs from other functions of the same name
#     SEE ALSO: N/A
#===============================================================================
sub _debug {
    my $active = shift;

    my $message;
    return unless $active;

    while ($message = shift) {
        chomp($message);
        print STDERR "D> $message\n";
    }
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Process the options we want to offer. See the documentation
#               for details
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",     "documentation|man", "debug=i", "silent!",
        "prefix=s", "backup!",           "force!",
        # "config=s",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage(
            -msg     => "$PROG version $VERSION\n",
            -exitval => 1,
            -verbose => 0
        );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

extract_images - extract embedded images from HTML and save as files

=head1 VERSION

This documentation refers to extract_images version 0.0.3

=head1 USAGE

    extract_images
        [--help] [--documentation|man] [-debug=N]
        [--prefix=STRING] [--[no]backup] [--[no]force] [--[no]silent]
        HTML_file [ [HTML_file_2] [ [HTML_file_3] ... ] ]

    Examples:

    extract_images --prefix=picture_ --backup index.html

    extract_images --silent --nobackup shownotes.html

=head1 REQUIRED ARGUMENTS

The script requires the names of one or more HTML files which will be scanned
for embedded images.

=head1 OPTIONS

Note that all on/off options can be written as B<--silent>, B<--nosilent> or
B<--no-silent>. A single hyphen can be used at the start or a pair.

=over 8

=item B<--help>

Reports brief information about how to use the script and exits. To see the
full documentation use the option B<--documentation> or B<--man>. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.

=item B<--documentation> or B<--man>

Reports full information about how to use the script and exits. Alternatively,
to generate a PDF version use the I<pod2pdf> tool from
I<http://search.cpan.org/~jonallen/pod2pdf-0.42/bin/pod2pdf>. This can be
installed with the cpan tool as App::pod2pdf.

=item B<--debug=N>

Run in debug mode at the level specified by I<N>. Possible values are:

=over 4

=item B<0>

No debugging (the default).

=item B<1>

TBA

=item B<2>

TBA

=item B<3>

TBA

=back

=item B<--prefix=STRING>

Since embedded images have no names, when they are written to image files
names are generated for them. These names are built from the following
components:

=over 4

=item B<HTML filename>

The name of the HTML file without a suffix. So, if the name is 'index.html',
the 'index' part will be used.

=item B<prefix>

The prefix string provided by this option, or 'image' if not specified.

=item B<counter>

The images found in the HTML are counted, starting from 1, and this number is
used here.

=item B<extension>

The image format extension, taken from the embedded image information.
Examples would be 'jpg', 'jpeg' and 'png'.

=back

The B<HTML filename>, B<prefix>, B<counter> and B<extension> are joined
together to make the file name as follows:

    <HTML filename>_<prefix>_<counter>.<extension>

So the following run of the script would generate the first file shown below
assuming the first embedded picture was in 'PNG' format:

    extract_images --prefix=picture hpr4283.html

    hpr4283_picture_1.png

=item B<--[no]backup>

The HTML is modified to leave placeholders referring to any embedded images
found as it it processed. The new HTML is written to the original file if
B<--nobackup> is specified, or a backup of the original file is made, before
writing the new HTML. Making a backup is the default behaviour.

=item B<--[no]force>

Images are written to the current directory. If an image already exists, and
B<--force> has not been specified, the script will stop processing and exit.
The default setting of this option is B<--noforce>.

=item B<--[no]silent>

By default the script reports its progress as it processes an HTML file, but
this can be turned off by using this option.

=back

=head1 DESCRIPTION

This Perl script B<extract_images> parses HTML looking for embedded images.
Such images use a URI with the scheme 'data' followed by image details
including the encoded binary contents. See the Wikipedia article
https://en.wikipedia.org/wiki/Data_URI_scheme for details.

When such images are found they are written as files (with generated names),
and the HTML is updated with these names replacing the original URI. Further
work is likely to be required to build full image links, but the majority of
work will have been done.

The script will not overwrite image files unless the B<--force> option is
used, but will overwrite the original HTML file unless B<--backup> is
specified.

By default details of progress are written to standard output, but this can be
preveneted by using the B<--silent> option.

=head2 GENERATED IMAGE FILE NAMES

These names are built from various elements:

    <HTML filename>_<prefix>_<counter>.<extension>

Where <HTML filename> is the base name of the HTML input file, <prefix> is the
string provided with the B<--prefix=STRING> option (or B<image> by default),
<counter> is a count of image files found during the scan of the HTML, and
<extension> is the appropriate file extension for the type of image being
converted.

See the option B<--prefix=STRING> for details of file name generation.

=head1 DIAGNOSTICS

TBA

=head1 CONFIGURATION AND ENVIRONMENT

TBA

=head1 DEPENDENCIES

    Data::Dumper
    File::Copy
    Getopt::Long
    HTML::TreeBuilder
    MIME::Types
    Path::Tiny
    Pod::Usage
    URI

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.

=head1 AUTHOR

Dave Morriss  (Dave.Morriss@gmail.com)

=head1 LICENCE AND COPYRIGHT

Copyright (c) 2024 Dave Morriss (Dave.Morriss@gmail.com).
All rights reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut

#}}}

# [zo to open fold, zc to close]

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker