#!/usr/bin/env perl
#===============================================================================
#
#         FILE: process_mail_tags
#
#        USAGE: ./process_mail_tags [-help] [-[no]dry-run] [-[no]silent]
#               [-threshold=N] [-config=FILE] [-json=FILE] [-debug=N]
#
#  DESCRIPTION: Parses mail files found in a directory, finding show, summary
#               and tag information. Optionally writes this in JSON format to
#               a file and, if requested, updates the shows in the HPR database.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: The project (which got called TSU) is finished. We have
#               processed all the shows in the HPR database missing tags and
#               summaries thanks to many helpers.
#               The script has just been updated to fix a problem spotted in
#               its last usage. See the Journal for October 2021 for the
#               details and for another problem noticed but not fixed.
#               We're keeping this version around for reference in case we
#               build another project of a similar sort - and we may!
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.1.8
#      CREATED: 2015-07-26 15:17:16
#     REVISION: 2021-10-16 21:55:15
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use utf8;

use Getopt::Long;
use Pod::Usage;

use Config::General;
use File::Find::Rule;
use File::Copy;
use Text::CSV::Encoded;
use DBI;

use List::Util qw{any};
use JSON;
use SQL::Abstract;

use Mail::Address;
use Mail::Field;
use Mail::Internet;
use MIME::Parser;
use MIME::QuotedPrint;
use Encode qw(decode encode);

#use Lingua::EN::Inflexion qw/ inflect /;

use Log::Handler;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.1.8';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my $logfile    = "$basedir/${PROG}.log";

my $maildrop  = "$ENV{HOME}/HPR/MailSpool";
my $processed = "$maildrop/processed";
my $rejected  = "$maildrop/rejected";

my ( $dbh, $sth1, $h1, $rc );
#my ( $minshow, $maxshow, $key, $lastkey, $value, $show );
my ( $minshow, $maxshow );
my ( %updates, %final,   @errors );

#
# RE for pre-processing the mail message body where we don't want to be too
# specific. This one doesn't care what the key is and doesn't capture
# anything.
#
my $keyval_re1 = qr{^\s*.+\s*:\s*.*\s*$};

#
# RE for matching lines in the mail message. Expected to be in the format
# 'key:value' where the 'key' is one of a set of three.
#
my $knownkeys = qr{(?i)show|summary|tags};
my $keyval_re2 = qr{^\s*($knownkeys)\s*:\s*(.*)\s*$};

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#
# File names to ignore
#
my @ignore = ( '*~', '.*.swp', 'processed', 'rejected' );

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
my $DEF_DEBUG = 0;
my $DEF_THRESHOLD = 30;

#
# Process options
#
my %options;
Options( \%options );

#
# Default help
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 0, -exitval => 1 )
    if ( $options{'help'} );

#
# Full documentation if requested with -doc
#
pod2usage(
    -msg => "$PROG version $VERSION\n",
    -verbose => 2,
    -exitval => 1,
    -noperldoc => 0,
) if ( $options{'doc'} );

#
# Collect options
#
my $DEBUG  = ( defined( $options{debug} )  ? $options{debug}  : $DEF_DEBUG );

my $cfgfile
    = ( defined( $options{config} ) ? $options{config} : $configfile );

my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
my $threshold = (
    defined( $options{threshold} ) ? $options{threshold} : $DEF_THRESHOLD );
my $json = $options{'json'};

#
# Check values
#
$threshold = $DEF_THRESHOLD if ($threshold < 0 || $threshold > 100);

#
# Sanity check
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );

#
# Load configuration data
#
my $conf = new Config::General(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
$dbh = DBI->connect( "DBI:MariaDB:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1, RaiseError => 1 } )
    or die $DBI::errstr;

#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

$log->add(
    file => {
        timeformat => "%Y/%m/%d %H:%M:%S",
        filename   => $logfile,
        maxlevel   => 7,
        minlevel   => 0
    }
);

#-------------------------------------------------------------------------------
# Look for work in the maildrop
#-------------------------------------------------------------------------------
my $mailfileRE = qr{(\.(?i:eml)|[^.]+)$};
                                                        #<<<
my @files = File::Find::Rule
    ->file()
    ->name($mailfileRE)
    ->not( File::Find::Rule->new->name(@ignore) )
    ->maxdepth(1)
    ->in($maildrop);
                                                        #>>>

if ( $DEBUG > 2 ) {
    print "D> Files found in $maildrop\n";
    print 'D> ', Dumper( \@files ), "\n";
}

#
# There may be nothing there
#
unless (@files) {
    warn "No mail found; nothing to do\n";
    exit;
}

#-------------------------------------------------------------------------------
#  Find today's show number
#-------------------------------------------------------------------------------
#$sth1 = $dbh->prepare('SELECT id FROM eps WHERE date = curdate()')
$sth1 = $dbh->prepare(
    q{SELECT id FROM eps
        WHERE DATEDIFF(date,CURDATE()) <= 0
        ORDER BY id DESC LIMIT 1}
) or die $DBI::errstr;
if ( $dbh->err ) {
    warn $dbh->errstr;
}

$sth1->execute;
if ( $dbh->err ) {
    warn $dbh->errstr;
}

$h1 = $sth1->fetchrow_hashref();
( $minshow, $maxshow ) = ( 1, $h1->{id}+$threshold );

if ( $DEBUG > 0 ) {
    print "D> Show number limits\n";
    print "D> Show min/max: $minshow/$maxshow\n";
}

#-------------------------------------------------------------------------------
# Prepare to process MIME messages
#-------------------------------------------------------------------------------
my $parser = new MIME::Parser;
$parser->output_under("/tmp");

#-------------------------------------------------------------------------------
# Process the files we found
#-------------------------------------------------------------------------------
foreach my $file ( sort(@files) ) {
    print ">> $file\n" unless $silent;

    #
    # Open the current file and load its contents into a Mail::Internet object
    #
    open( my $mfh, '<', $file )
        or die "$PROG : failed to open input file '$file' : $!\n";

    my $mi_obj = new Mail::Internet($mfh);

    close($mfh)
        or warn "$PROG : failed to close input file '$file' : $!\n";

    #
    # This should be to tags@hackerpublicradio.org in one of these headers
    #
    my @addresses = (
        Mail::Address->parse( $mi_obj->head->get('Resent-to') ),
        Mail::Address->parse( $mi_obj->head->get('Resent-cc') )
    );
    if ( !@addresses ) {
        @addresses = (
            Mail::Address->parse( $mi_obj->head->get('To') ),
            Mail::Address->parse( $mi_obj->head->get('Cc') )
        );
    }

    #
    # Report the headers we're interested in
    #
    foreach my $addr (@addresses) {
        my $dest = lc( $addr->address );
        print "To: $dest\n" unless $silent;
    }
    print '~ ' x 40,"\n" unless $silent;

    #
    # TODO Check the message is actually for us
    #

    my ( $ct, $cte, @body, $results );

    #
    # Parse the message in the assumption it's MIME (with MIME::Parser,
    # returning MIME::Entity objects).  Report any errors and warnings.
    #
    my $entity = $parser->parse_data( $mi_obj->as_string() );
    unless ($entity) {
        $results = $parser->results;
        foreach my $pmsg ( $results->msgs ) {
            print STDERR "$pmsg\n";
        }
    }

    #
    # If we got multiple MIME parts then look for the first 'text/plain'
    # element and save it as the body we'll work on. Otherwise the whole
    # Mail::Internet body is what we want.
    # ----
    # NOTE: 2021-09-27 as an experiment we'll collect all the text/plain
    # attachments into @body and then process them. It's dangerous in that
    # junk in the first entity (the standard message body in a MIME message)
    # will screw up a valid following attachment. We strip signatures so they
    # will not be an issue.
    # ----
    #
    if ( scalar( $entity->parts ) > 0 ) {
        #
        # MIME message
        #
        if ( $DEBUG > 2 ) {
            print "D> Message is MIME with multiple parts\n";
            foreach my $ent ( $entity->parts ) {
                print "D> Type: ", $ent->mime_type, "\n";
            }
        }

        foreach my $ent ( $entity->parts ) {
            if ( $ent->mime_type eq 'text/plain' ) {
                print "D> Processing text/plain part\n" if ( $DEBUG > 2 );

                #$ct = $ent->mime_type;
                $ct = $ent->effective_type;

                $ent->remove_sig();

                #
                # FIXME: Using 'bodyhandle' is supposed to decode the entity
                # body, but this is untested. Also, we have to use 'as_string'
                # then split on "\n" and stick the newlines back on the array
                # elements, which seems a bit weird. Have to monitor this to
                # see how it behaves.
                # Note: 2021-09-27 collecting the body in @ebody and appending
                # it to @body as an experiment.
                #
                #@body = @{ $ent->body() };
                my @ebody
                    = map {"$_\n"} split( "\n", $ent->bodyhandle->as_string );
                print "D> Length of body=", scalar(@ebody), "\n"
                    if ( $DEBUG > 2 );

                #
                # If we got a non-empty body then exit the loop
                #
                # last if @body;

                #
                # Experimentally add the body contents to the collected
                # contents
                #
                push(@body,@ebody) if @ebody;
            }
        }

        #
        # We found no suitable part so there's nothing to process here. We'll
        # let the later phases detect this though
        #
        unless (@body) {
            warn "MIME message has no valid text/plain elements\n";
        }
    }
    else {
        #
        # Simple (non-MIME) message
        #
        if ( $DEBUG > 2 ) {
            print "D> Message is simple with a single part\n";
        }

        #
        # Look to see if we have quoted-printable data in the body
        # TODO: Detect content types we can't deal with.
        #
        $ct  = $mi_obj->head->get('Content-type');
        $cte = $mi_obj->head->get('Content-transfer-encoding');

        #
        # Grab the body after removing any signature
        #
        $mi_obj->remove_sig();
        @body = @{ $mi_obj->body() };

        #
        # Decode any QP we found. Note that if we went the other route because
        # it's a multipart MIME message, the use of 'bodyhandle' should have
        # decoded any QP.
        #
        if ( defined($cte) && $cte =~ /^quoted-printable/ ) {
            print "D> Decoding QP\n" if ( $DEBUG > 2 );
            @body = @{ process_qp( \@body ) };
        }
    }

    #
    # Display the body unless we're being silent
    #
    unless ($silent) {
        print '-' x 80, "\n";
        print join( "", @body ), "\n";
        print '-' x 80, "\n";
    }

    #
    # Initialise for this message
    #
    $updates{$file} = {};
    $updates{$file}->{valid} = 0;

    #
    # Rationalise the body, removing unnecessary stuff and dealing with
    # continuation lines
    #
    my @buffer = processBody(\@body, $keyval_re1);
    print "D> \@buffer contents after 'processBody'\n", Dumper( \@buffer ), "\n"
        if ( $DEBUG > 2 );

    #--------------------------------------------------------------------------
    # Search the message body for updates
    #--------------------------------------------------------------------------
    parseBuffer( $file, \@buffer, $keyval_re2, \%updates, \@errors );

}

#
# Print the captured data if requested
#
if ( $DEBUG > 1 ) {
    print "D> Data captured from the mail file(s), first pass\n";
    print "D> ", Dumper( \%updates ), "\n";
}

#
# The queries we'll use next
#
$sth1 = $dbh->prepare(
    q{SELECT id,length(tags) l_tags,length(summary) l_summary
        FROM eps WHERE id = ?}
) or die $DBI::errstr;
if ( $dbh->err ) {
    warn $dbh->errstr;
}

#-------------------------------------------------------------------------------
# Walk the captured data doing some more checks
#-------------------------------------------------------------------------------
foreach my $file ( sort( keys(%updates) ) ) {
    #
    # Check the shows we got against (our copy of) the database
    #
    if ( $updates{$file}->{valid} > 0 ) {
        #
        # Look at numeric keys (ignoring the 'valid' key)
        #
        foreach my $sh ( grep {/^\d+$/} keys( %{ $updates{$file} } ) ) {
            #
            # Look up the show number and get the tag and summary details
            #
            $sth1->execute($sh);
            if ( $dbh->err ) {
                warn $dbh->errstr;
            }
            $h1 = $sth1->fetchrow_hashref();

            #
            # If we have tags to apply do we have tags already?
            #
            if ( exists( $updates{$file}->{$sh}->{tags} ) ) {
                #print "   Found tags\n";
                if ( $h1->{l_tags} > 0 ) {
                    printf "** Warning ** File %s, show %s - trying to add "
                        . "tags that already exist!\n", $file, $sh
                        unless $silent;
                    $log->warn("$sh: Trying to add duplicate tags")
                        unless $dry_run;
                    delete( $updates{$file}->{$sh}->{tags} );
                }
            }

            #
            # If we have a summary to apply do we have a summary already?
            #
            if ( exists( $updates{$file}->{$sh}->{summary} ) ) {
                #print "   Found summary\n";
                if ( $h1->{l_summary} > 0 ) {
                    printf
                        "** Warning ** File %s, show %s - trying to add "
                        . "a summary that already exists!\n", $file, $sh
                        unless $silent;
                    $log->warn("$sh: Trying to add a duplicate summary")
                        unless $dry_run;
                    delete( $updates{$file}->{$sh}->{summary} );
                }
            }

            #
            # If there are (now) no tags or keys for this show now delete the
            # show
            #
            unless ( scalar( keys( %{ $updates{$file}->{$sh} } ) ) > 0 ) {
                delete( $updates{$file}->{$sh} );
            }

            #
            # If there are (now) no shows relating to the file, mark the file
            # as invalid
            #
            unless ( scalar( keys( %{ $updates{$file} } ) ) > 1 ) {
                $updates{$file}->{valid} = 0;
            }

        }
    }

    #
    # Deal with each file depending on whether it contained any work
    #
    if ( $updates{$file}->{valid} > 0 ) {
        unless ($dry_run) {
            print "Moving $file to 'processed'\n" unless $silent;
            $log->info("Moving $file to 'processed'");
            warn "Unable to move $file\n"
                unless ( moveFile( $file, $processed ) );
        }
        else {
            print "Would move $file to 'processed' (dry run)\n"
                unless $silent;
        }
        delete( $updates{$file}->{valid} );
    }
    else {
        unless ($dry_run) {
            print "Moving $file to 'rejected'\n" unless $silent;
            $log->info("Moving $file to 'rejected'");
            warn "Unable to move $file\n"
                unless ( moveFile( $file, $rejected ) );
        }
        else {
            print "Would move $file to 'rejected' (dry run)\n" unless $silent;
        }
        delete( $updates{$file} );
    }
}

#$sth1->finish();

#
# Print the collected data after validation, if requested
#
if ( $DEBUG > 1 ) {
    print "D> Captured data after being validated against the database\n";
    print "D> ", Dumper( \%updates ), "\n";
}

#-------------------------------------------------------------------------------
# Pick out just the show details for the final stage, de-duplicating if
# necessary
#-------------------------------------------------------------------------------
foreach my $file ( sort( keys(%updates) ) ) {
    foreach my $sh ( sort( keys( %{ $updates{$file} } ) ) ) {
        if ( exists( $final{$sh} ) ) {
            foreach my $key ( keys( %{ $updates{$file}->{$sh} } ) ) {
                if ( exists( $final{$sh}->{$key} ) ) {
                    print "** Duplicates on show $sh key '$key'\n"
                        unless $silent;
                    $log->warn("Duplicates on show $sh key '$key'")
                        unless $dry_run;
                }
                else {
                    $final{$sh}->{$key} = $updates{$file}->{$sh}->{$key};
                }
            }
        }
        else {
            $final{$sh} = $updates{$file}->{$sh};
        }
    }
}

#
# Print the final data
#
if ( $DEBUG > 1 ) {
    print "D> Final data form\n";
    print "D> ", Dumper( \%final ), "\n";
}

#
# Output JSON if requested
#
if ( defined($json) ) {
    DumpJSON( $json, \%final );
}

#-------------------------------------------------------------------------------
# Make changes to the database if so requested
#-------------------------------------------------------------------------------
unless ($dry_run) {
    my ( $stmt, @bind );

    #
    # We will dynamically build SQL as we go
    #
    my $sql = SQL::Abstract->new;

    #
    # Loop though the collected show updates making the changes to the
    # database
    #
    foreach my $sh ( sort( keys(%final) ) ) {
        #
        # For these updates the use of 'begin_work' will turn off Autocommit up to
        # the next commit or rollback, so it needs to be in the loop
        #
        $rc = $dbh->begin_work or die $dbh->errstr;

        ( $stmt, @bind ) = $sql->update( 'eps', $final{$sh}, { id => $sh } );

        #
        # Display the statement we constructed if requested
        #
        print "D> $stmt\n'", join( "','", @bind ), "'\n\n" if ( $DEBUG > 2 );

        #
        # Apply the SQL. Any failure results in a rollback and skipping to the
        # next show, otherwise we commit
        #
        $dbh->do( $stmt, undef, @bind );
        if ( $dbh->err ) {
            warn $dbh->errstr;
            eval{ $dbh->rollback };
            next;
        }
        else {
            $dbh->commit;
        }

        $log->info( "$sh:" . join( ":", keys( %{ $final{$sh} } ) ) );

        print "Updated show $sh\n" unless $silent;

    }

}

#
# We need these if we're not in AutoCommit mode it seems
#
#$sth1->finish();
#$dbh->disconnect();

exit;

#===  FUNCTION  ================================================================
#         NAME: processBody
#      PURPOSE: Processes the lines of the body of the email (or an
#               attachment)
#   PARAMETERS: $body           Reference to the body array
#               $regex          Regex to detect 'key:value' lines
#      RETURNS: A cleaned up array for further processing
#  DESCRIPTION: Simply removes comments and blank lines, then performs
#               a 'chomp' on the result. Searches for continuation lines and
#               attaches them to the previous 'key:value' line. Then removes
#               all the continuation lines and returns the result.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub processBody {
    my ( $body, $regex ) = @_;

    my ( @buffer, $line, $save );

    #
    # Strip comments and blank lines while copying the body
    #
    @buffer = grep { !/^(#|\s*$)/ } @$body;

    #
    # Chomp everything in our local copy, catering for MSDOS line endings if
    # found
    #
    local $/ = "\r\n";
    chomp(@buffer);

    #
    # Find what look like continuation lines and join them to the previous
    # line that is a 'key:value' line.
    #
    $save = 0;
    for ( my $i = 1; $i <= $#buffer; $i++ ) {
        $line = $buffer[$i];
        if ( $line =~ $regex ) {
            $save = $i;
        }
        else {
            $buffer[$save] .= " $line";
        }
    }

    #
    # Strip away the continuation lines
    #
    @buffer = grep { /$regex/ } @buffer;

    #
    # Return the processed array
    #
    return @buffer;
}

#===  FUNCTION  ================================================================
#         NAME: parseBuffer
#      PURPOSE: Parses the buffer built from the email body collecting the
#               valid 'key:value' data info the "stash".
#   PARAMETERS: $file           The name of the mail file being processed
#               $buffer         Reference to the processed body array
#               $regex          Regular expression for parsing 'key:value'
#               $stash          Reference to the hash containing the
#                               accumulating data parsed from the email
#               $errors         Reference to the error buffer
#      RETURNS: Nothing
#  DESCRIPTION: Replaces the old in-line code.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub parseBuffer {
    my ( $file, $buffer, $regex, $stash, $errors ) = @_;

    my ( $key, $value, $show );

    for my $line (@$buffer) {
        #
        # Deal with DOS-style line terminators
        #
        $line =~ s/\r$//g;
        $line =~ s/\s*$//g;

        #
        # Parse out the key and value from a 'key:value' line
        #
        if ( ( $key, $value ) = ( $line =~ $regex ) ) {
            #
            # Make the key lower case (so people can use 'Tags', for example,
            # and not be rejected)
            #
            $key = lc($key);

            if ( $key eq 'show' ) {
                #
                # It's a show. If it's not just numbers ignore it and signal
                # that subsequent keys are to be ignored.
                #
                unless ( $value =~ /\d+/ ) {
                    $show = $value = undef;
                    push( @$errors,
                        "Invalid show specification: '$line'; skipped" );
                    next;
                }

                #
                # Spot a show transition. Validate it and print any
                # accumulated errors
                #
                if ( defined($show) && $show != int($value) ) {
                    $stash->{$file}->{valid} = 1;
                    if (@$errors) {
                        printErrors( $show, $errors, !$silent );
                    }
                }

                #
                # Store the integer value rather than a string with leading
                # zeroes
                #
                $show = int($value);

                #
                # Save the show if it's in range
                #
                unless ( $show < $minshow || $show > $maxshow ) {
                    putStash( $stash, $file, $show );
                }
                else {
                    $show = $value = undef;
                    push( @$errors, "Show number $show out of range" );
                    next;
                }
            }
            elsif ( $key eq 'summary' ) {
                next unless $show;
                $value = formatSummary($value, $errors);
                if ($value) {
                    unless (
                        putStash( $stash, $file, $show, $key, $value, $errors ) )
                    {
                        print "** Error saving '$key:$value' for show $show\n"
                        unless $silent;
                    }
                }
            }
            elsif ( $key eq 'tags' ) {
                next unless $show;
                $value = formatTags($value, $errors);
                if ($value) {
                    unless (
                        putStash( $stash, $file, $show, $key, $value, $errors ) )
                    {
                        print "** Error saving '$key:$value' for show $show\n"
                        unless $silent;
                    }
                }
            }
        }
        else {
            print "** Warning ** Unrecognised line:\n$line\n";
        }

        #
        # If the show is defined by this stage then the file is valid
        #
        $stash->{$file}->{valid} = 1 if defined($show);

    }
    continue {
        if (@$errors) {
            printErrors( $show, $errors, !$silent );
        }
    }
}

#===  FUNCTION  ================================================================
#         NAME: printErrors
#      PURPOSE: Prints the contents of an array of error messages then clears
#               it
#   PARAMETERS: $show           The show number (for information)
#               $errors         Reference to the array of errors
#               $print          Boolean: 1 = print, 0 don't print, just flush
#      RETURNS: Nothing
#  DESCRIPTION: Error messages accumulated elsewhere are printed if requested
#               and the error buffer flushed.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub printErrors {
    my ( $show, $errors, $print ) = @_;

    $print = 1 unless defined($print);
    if ($print) {
        @$errors = map {'** Error ** ' . $_} @$errors;
        print "Show: $show\n", join( "\n", @$errors ), "\n";
    }
    @$errors = ();

    return;
}

#===  FUNCTION  ================================================================
#         NAME: putStash
#      PURPOSE: Puts values into the 'stash' structure for a given file and
#               for a given show within it
#   PARAMETERS: $stash          Reference to the hash containing the
#                               accumulating data
#               $file           Name of the file being processed - used as
#                               a hash key
#               $show           Number of the show being processed (converted
#                               to an integer) - used as a hash key
#               $key            The key to be stashed below the show
#               $value          The value to be stored with the key
#               $errors         Reference to an array for errors
#      RETURNS: True (1) if the storage action succeeded, false (0) otherwise.
#  DESCRIPTION: Hides a lot of the complexity of storing items in the 'stash'
#               structure. Note that the way in which this function is used is
#               a little tricky. If called without values for $key, $value and
#               $errors it simply creates an empty hashref under
#               $stash->{$file}->{$show} ready to be filled with keys and
#               values. It is called in this way once in the main code.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub putStash {
    my ( $stash, $file, $show, $key, $value, $errors ) = @_;

    if ( exists( $stash->{$file}->{$show} ) ) {
        if ( exists( $stash->{$file}->{$show}->{$key} ) ) {
            push(
                @$errors,
                "The $key already exist" . ( $key eq 'tags' ? "" : "s")
                    . " for show $show"
            ) if $errors;
            return 0;
        }
        else {
            $stash->{$file}->{$show}->{$key} = $value;
            return 1;
        }
    }
    else {
        $stash->{$file}->{$show} = {};
        return 1;
    }
}

#===  FUNCTION  ================================================================
#         NAME: formatSummary
#      PURPOSE: Trims and cleans up a summary string
#   PARAMETERS: $summary        String containing the summary
#               $errors         Reference to an array for errors
#      RETURNS: The reformatted summary
#  DESCRIPTION: The summary length must not exceed 100 characters, and we want
#               to reduce multiple whitespace down to the minimum.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub formatSummary {
    my ($summary, $errors) = @_;

    $summary =~ s/(^\s*|\s*$)//g;
    $summary =~ s/\s+/ /g;
    if ( length($summary) > 100 ) {
        push( @$errors,"Summary is too long; truncated" );
        $summary = substr( $summary, 0, 100 );
    }

    return $summary;
}

#===  FUNCTION  ================================================================
#         NAME: formatTags
#      PURPOSE: Normalises a series of tags presented in CSV format
#   PARAMETERS: $tags           String containing tags in CSV form
#               $errors         Reference to an array for errors
#      RETURNS: The formatted tags as a string
#  DESCRIPTION: Parses the incoming CSV, breaks it down into fields then
#               recombines the result. This seems to be the only way to turn
#               quoted fields into the unquoted ones we want. On the way in
#               the need for good quality of CSV is relaxed a little, and on
#               the way out the need to quote is enforced.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub formatTags {
    my ( $tags, $errors ) = @_;

    my $result;
    my $csv = Text::CSV::Encoded->new(
        {   always_quote     => 0,      # Don't quote everything
            quote_char       => '"',    # Use double quotes
            allow_whitespace => 1,      # Parse and strip spaces around
                                        # delimiters
            quote_space      => 0,      # Prevent internal spaces triggering
                                        # quotes
            encoding_in      => "utf8",
            encoding_out     => "utf8",
        }
    );

    #
    # Parse the CSV. Return nothing if it wouldn't validate. Drop all null
    # items and rebuild as a string.
    #
    if ( $csv->parse($tags) ) {
        my @flds = $csv->fields();
        @flds = grep { !/^$/ } @flds;

        $csv->combine(@flds);
        $result = $csv->string();

        if ( length($result) > 200 ) {
            push( @$errors, "Tags are too long after quoting; removed" );
            return;
        }
        else {
            return $result;
        }
    }
    else {
        #
        # Save the error details from the CSV parse. In list context
        # 'error_diag' returns the error number, the error message and (in
        # some circumstances?) the byte index in the current record being
        # parsed (1-based), the index of the record parsed by the CSV
        # instance and the field number of what the parser thinks it's
        # currently parsing.
        # TODO: make this more resilient.
        #
        push( @$errors,
            "Invalid CSV data found in tags: "
                . sprintf( "%d: %s [%d,%d,%d]", ( $csv->error_diag() ) ) );
        #        warn "Invalid CSV data found\n";
        #        $csv->error_diag();
        return;
    }
}

#===  FUNCTION  ================================================================
#         NAME: moveFile
#      PURPOSE: Moves a file (renames it) taking account of whether the name
#               exists at the destination
#   PARAMETERS: $file           file to move
#               $dest           destination of the file
#      RETURNS: True (1) if the move succeeded, otherwise false (0)
#  DESCRIPTION: A wrapper around the File::Copy 'move' function. If the target
#               file exists then a version number is appended to the name so
#               that the original is not clobbered.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub moveFile {
    my ( $file, $dest ) = @_;

    my ( $target, $basename, $prefix, $vsn );

    return 0 unless -e $file;

    #
    # Handle a destination directory
    #
    if ( -d $dest ) {
        ( $basename = $file ) =~ s|.*/||mx;
        $target = "$dest/$basename";
    }
    else {
        $target = $dest;
    }

    #
    # If the target exists we'll have to do some work, otherwise just do the
    # move
    #
    if ( -e $target ) {
        if ( ( $prefix, $vsn ) = ( $target =~ /(\.)(\d+)$/ ) ) {
            while ( -e "$prefix$vsn" ) {
                $vsn++;
                $target = "$prefix$vsn";
            }
            return move( $file, $target );
        }
        else {
            $vsn    = 1;
            $prefix = $target;
            $target = "$prefix.$vsn";
            while ( -e "$prefix.$vsn" ) {
                $vsn++;
                $target = "$prefix.$vsn";
            }
            return move( $file, $target );
        }
    }
    else {
        return move( $file, $dest );
    }
}

#===  FUNCTION  ================================================================
#         NAME: process_qp
#      PURPOSE: Process quoted-printable encoded text in the mail message body
#   PARAMETERS: $body           Arrayref containing the lines of the body
#      RETURNS: Reference to an array of processed lines
#  DESCRIPTION: Removes newlines from the body array and any `\r` sequences.
#               It then loops through the lines looking for eny ending in '='
#               and joins them with the next (removing the '=' first). It then
#               deletes the second part of the line. Then it feeds the array
#               through the `decode_qp` routine and adds the newline back on
#               the end. TODO: this works for the present case but may blow up
#               with more challenging cases.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub process_qp {
    my ($body) = @_;

    #
    # Trim trailing newlines
    #
    chomp( my @body = @$body );

    #
    # Find split lines and join them
    #
    my $i   = 0;
    my $max = $#body;
    while ( $i <= $max ) {
        $body[$i] =~ s/\r//;
        if ( $body[$i] =~ /=$/ ) {
            $body[$i] =~ s/=$//;
            $body[$i] .= $body[ $i + 1 ];
            splice( @body, $i + 1, 1 );
            $max--;
        }
        else {
            $i++;
        }
    }

    #
    # Decode all quoted-printable stuff from QP and then from UTF-8
    #
    @body = map { decode( 'UTF-8', decode_qp($_) ) . "\n" } @body;

    return \@body;
}

#===  FUNCTION  ================================================================
#         NAME: LoadJSON
#      PURPOSE: Load a JSON file
#   PARAMETERS: $file           Name of the file containing JSON
#      RETURNS: The resulting JSON as a scalar or a reference (as determined
#               by the JSON module)
#  DESCRIPTION: Opens the nominated file and collects its contents into an
#               array. Joins all of the rows of the array into a string and
#               then converts the result to JSON.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO:
#===============================================================================
sub LoadJSON {
    my ($file) = @_;

    open( my $fh, '<', $file )
        or die "Unable to open $file for reading: $!\n";
    my @jtxt = <$fh>;
    close($fh);

    my $jtxt = join( '', @jtxt );
    my $json = JSON->new->utf8;

    return $json->decode($jtxt);
}

#===  FUNCTION  ================================================================
#         NAME: DumpJSON
#      PURPOSE: Dump JSON to a file
#   PARAMETERS: $file           Name of file to contain JSON
#               $data           Data to be encoded into JSON
#      RETURNS: Nothing
#  DESCRIPTION: Opens the nominated file and writes the JSON-encoded data to
#               it
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub DumpJSON {
    my ( $file, $data ) = @_;

    my $json = JSON->new->utf8->pretty;
    open( my $fh, '>', $file )
        or die "Unable to open $file for writing: $!\n";
    print $fh $json->encode($data);
    close($fh);

    return;
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION:
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",    "doc",    "debug=i",     "dry-run!",
        "silent!", "json=s", "threshold=i", "config=s",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{


=head1 NAME

process_mail_tags - parse email for updates to the HPR database

=head1 VERSION

This documentation refers to B<process_mail_tags> version 0.1.8

=head1 USAGE

    ./process_mail_tags [-help] [-doc] [-debug=N] [-[no]dry-run] [-[no]silent]
        [-threshold=N] [-config=FILE] [-json=FILE]

    Examples:

        ./process_mail_tags -dry-run
        ./process_mail_tags -silent -json=temp.json

=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message describing the usage of the program, and then exits.

=item B<-doc>

Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use:

    pod2pdf process_mail_tags --out=process_mail_tags.pdf

=item B<-debug=N>

Selects a level of debugging. Debug information consists of a line or series
of lines prefixed with the characters 'D>':

=over 4

=item B<0>

No debug output is generated: this is the default

=item B<1>

Displays the minimum and maximum show numbers which will be used to validate
incoming data.

=item B<2>

Dumps the structure containing the information parsed from the mail files
found by the script. Also prints this structure after verification against the
database.

As well as this the script displays the information for the lower debug
levels.

=item B<3>

Dumps the array of files found by the script and displays the SQL statements
built to update the database. Note that the SQL is only displayed in
B<-nodry-run> mode.

As well as this the script displays the information for the lower debug levels

=back

=item B<-[no]dry-run>

Controls whether the program runs in a mode where it performs all the steps,
omitting the stage where the mail files are moved and the database is updated.
The default B<-nodry-run> allows the program to perform the changes.

=item B<-[no]silent>

Controls how much output the program produces. The default, B<-nosilent>,
results in the program writing details of the email it is processing and what
it is doing with the files it is reading.

=item B<-threshold=N>

The program checks that each show number it is given is in range. The bottom
of the range is always 1. The top of the range is computed by looking in the
database for the last show released and adding a 'threshold' value to it. The
default value is 30 to allow looking forward into future shows when adding
summaries or tags.

This threshold can be changed through this option. It is limited to a range of
0 to 100.

=item B<-config=FILE>

TBA

=item B<-json=FILE>

If specified causes the program to write JSON data to the nominated file
containing the show number, summary (if given) and tag list (if given) which
has been parsed from the mail messages that have been processed successfully.

JSON output is generated even if the B<-dry-run> option has been selected.
This is a useful way of determining what will be changed when the program is
allowed to alter the database.

=back

=head1 DESCRIPTION

This script processes email which has been saved into files, searching for
updates to episodes in the HPR database. These updates are for episodes which
are missing summaries and/or tags, and are being requested from volunteer
members of the HPR community, who have the option of sending them in via
email. Other routes for making these changes are planned.

The shows which need summaries and/or tags are listed in an HTML page at
I<http://hackerpublicradio.org/missing_summaries_and_tags.php>. This page is
generated by another script called B<report_missing_tags>.

The email messages which drive the update process must be simple ASCII
messages (no attachments, HTML, quoted-printable, etc, though see below) and
must contain the following elements for each show:

    show:<show number>
    summary:<summary of up to 100 characters>
    tags:<tag strings separated by commas>

This format may include comment lines beginning with a '#' as well as blank
lines (for legibility). The 'show' line is mandatory but either of the other
two lines may be omitted if the show already has tags or a summary.

The script is now capable of handling very simple MIME messages. It needs an
empty body element with one text/plain attachment. If more attachments are
added then they are ignored. The script searches for the first text/plain part
amongst all of the parts. This aspect may be developed in a later release to
allow multiple text/plain attachments.

=head1 DIAGNOSTICS

=over 4

=item B<[DBI error messages]>

Type: fatal or warning

Generated when a database interface error has been detected, such as failure
to connect to the database or failure to prepare or execute a query.

=item B<process_mail_tags : failed to open input file ...>

Type: fatal

The script has failed to open one of the saved mail files it has found.

=item B<No mail found; nothing to do>

Type: warning

The script found no saved mail files and exited.

=item B<process_mail_tags : failed to close input file ...>

Type: warning

The script has failed to close a mail file.

=item B<MIME message has no valid text/plain element>

Type: warning

While processing a saved mail file in MIME format the script failed to find
any I<text/plain> part to work on. This message will not be processed further.

=item B<Unable to move ...>

Type: warning

While attempting to move a saved mail file from the main spool area to either
the I<processed> or I<rejecterd> directory the move command failed. The script
will have left the file where it was.

=item B<Invalid CSV data found>

Type: warning

While processing tags in a saved mail message the script found invalid CSV
data. The tags in question will have been ignored.

=item B<Unable to open ... for writing: ...>

Type: fatal

The script has failed to open a JSON output file for writing. Further
information about the fault should be in the error message.

=back

=head1 CONFIGURATION AND ENVIRONMENT

A full explanation of any configuration system(s) used by the application,
including the names and locations of any configuration files, and the
meaning of any environment variables or properties that can be set. These
descriptions must also include details of any configuration language used


=head1 DEPENDENCIES

    Config::General
    DBI
    Data::Dumper
    Encode
    File::Copy
    File::Find::Rule
    Getopt::Long
    JSON
    List::Util
    Log::Handler
    MIME::Parser
    MIME::QuotedPrint
    Mail::Address
    Mail::Field
    Mail::Internet
    Pod::Usage
    SQL::Abstract
    Text::CSV::Encoded

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.

=head1 AUTHOR

Dave Morriss (Dave.Morriss@gmail.com)

=head1 LICENCE AND COPYRIGHT

Copyright (c) 2015-2021 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.

=cut

#}}}

# [zo to open fold, zc to close]

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker

