#!/usr/bin/env perl
#===============================================================================
#
#         FILE: clean_csv_tags
#
#        USAGE: ./clean_csv_tags
#
#  DESCRIPTION: Make sure tags in the eps.tags field of the HPR database
#               conform to CSV format.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.2
#      CREATED: 2017-01-30 15:32:04
#     REVISION: 2019-10-06 21:50:52
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use utf8;

use Carp;
use Getopt::Long;
use Config::General;
use Text::CSV_XS;
use List::MoreUtils qw{uniq};
use SQL::Abstract;
use DBI;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.0.2';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";

my ( $dbh, $sth1, $h1, $rv );
my ( %eps_tags, %diffs );
my $status;

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

Usage() if ( $options{'help'} );

#
# Collect options
#
my $cfgfile
    = ( defined( $options{config} ) ? $options{config} : $configfile );

my $aq      = ( defined( $options{aq} )        ? $options{aq}        : 0 );
my $verbose = ( defined( $options{verbose} )   ? $options{verbose}   : 0 );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 1 );

#
# Load configuration data
#
my $conf = new Config::General(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or croak $DBI::errstr;

#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;

#-------------------------------------------------------------------------------
# Collect and process the id numbers and tags from the 'eps' table
#-------------------------------------------------------------------------------
%eps_tags = %{ collect_eps_tags( $dbh ) };

#
# Dump all id numbers and tags if the verbose level is high enough
#
if ( $verbose >= 2 ) {
    my $csv = Text::CSV_XS->new( { always_quote => $aq } );

    print "\nTags collected from the 'eps' table\n\n";
    foreach my $id ( sort { $a <=> $b } keys(%eps_tags) ) {
        $status = $csv->combine( @{ $eps_tags{$id} } );
        printf "%04d: %s\n", $id, $csv->string();
    }
}

if ($dry_run) {
    print "\nNo changes made, dry-run mode\n";
    exit;
}

#-------------------------------------------------------------------------------
# Turn all the saved and cleaned tags into CSV strings again and save them
# back to the database. TODO: find differences and only write those back
#-------------------------------------------------------------------------------
#
# Force quoting everywhere
#
my $csv = Text::CSV_XS->new( { always_quote => $aq } );

#
# Loop through the hash in order of show number
#
for my $id ( sort keys %eps_tags ) {
    #
    # Put the array fields back together
    #
    $status = $csv->combine( @{ $eps_tags{$id} } );

    #
    # Write them to the database
    #
    $dbh->do( q{UPDATE eps SET tags = ? WHERE id = ?},
        undef, $csv->string(), $id );
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

}

exit;

#===  FUNCTION  ================================================================
#         NAME: collect_eps_tags
#      PURPOSE: Collects the tags from the eps.tags field
#   PARAMETERS: $dbh            Database handle
#      RETURNS: A reference to the hash created by collecting all the tags
#  DESCRIPTION: Read the 'id' and tags' fields from the database. Parse the
#               tags as CSV data, flagging any errors. Trim each one and store
#               them in a hash keyed on the id number. The list of tags is
#               stored as an array in sorted order after ensuring there are
#               no duplicates.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub collect_eps_tags {
    my ( $dbh ) = @_;

    my ( $status, @fields, %hash );
    my ( $sth,    $h );

    #
    # For parsing the field as CSV
    # NOTE: Unexplained error in [E. E. "Doc" Smith] (show 2462). Works with
    # double replaced by single quote, but doesn't work if quotes escaped (by
    # doubling) whether all tags are quoted or not. With 'auto_diag' enabled
    # get the error:
    # CSV_XS ERROR: 2034 - EIF - Loose unescaped quote @ rec 1632 pos 40 field 3
    #
    # NOTE: Adding 'allow_loose_quotes' avoids the issue
    #
    my $csv = Text::CSV_XS->new(
        { binary => 1, auto_diag => 1, allow_loose_quotes => 1 } );

    #
    # Query the eps table for all the id and tags
    #
    $sth = $dbh->prepare(
        q{SELECT id,tags FROM eps
            WHERE length(tags) > 0
            ORDER BY id}
    ) or die $DBI::errstr;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    $sth->execute;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Loop through what we got
    #
    while ( $h = $sth->fetchrow_hashref ) {
        #
        # Parse the tag list
        #
        $status = $csv->parse( $h->{tags} );
        unless ($status) {
            #
            # Report and skip any errors
            #
            print "Parse error on episode ", $h->{id}, "\n";
            print $csv->error_input(), "\n";
            next;
        }
        @fields = $csv->fields();

        next unless (@fields);

        #
        # Trim all tags (don't alter $_ when doing it)
        #
        @fields = map {
            my $t = $_;
            $t =~ s/(^\s+|\s+$)//g;
            $t;
        } @fields;

        #
        # De-duplicate
        #
        @fields = uniq(@fields);

        #print "$h->{id}: ",join(",",@fields),"\n";

        #
        # Save the id and its tags, sorted for comparison, with empty elements
        # removed too
        #
        $hash{ $h->{id} } = [ sort grep {!/^$/} @fields ];

    }

    #print Dumper(\%hash),"\n";

    return \%hash;

}

#===  FUNCTION  ================================================================
#         NAME: Usage
#      PURPOSE: Display a usage message and exit
#   PARAMETERS: None
#      RETURNS: To command line level with exit value 1
#  DESCRIPTION: Builds the usage message using global values
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Usage {
    print STDERR <<EOD;
Usage: $PROG [options] project

$PROG v$VERSION

    -help               Display this information
    -config=FILE        Define an alternative configuration file (e.g. that
                        for the live database)
    -[no]aq             Turn on/off the 'always_quote' setting in Text::CSV_XS,
                        Default is off (0)
    -[no]dry-run        Display what would have been done but make no changes.
                        Default is -dry-run.
    -verbose            A repeatable option which turns up the verbosity from
                        0 (silent) to 2 (lots of stuff). Default is 0.

EOD
    exit(1);
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION:
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options
        = ( "help", "verbose+", "dry-run!", "config=s", "aq!", );

    if ( !GetOptions( $optref, @options ) ) {
        Usage();
    }

    return;
}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker

