#!/usr/bin/env perl
#===============================================================================
#
#         FILE: refresh_tags_2
#
#        USAGE: ./refresh_tags_2
#
#  DESCRIPTION: Parse tags from the eps.tags field and use them to populate
#               the eps_tags2_xref and tags2 tables. The eps tag list is
#               definitive (though it's quite limited since it's only 200
#               characters long), and so the junction table eps_tags2_xref and
#               the normalised tags table tags2 are kept in step by adding
#               and deleting.
#               This script is for demonstration purposes. It is not the
#               definitive answer to the tag management problem in the HPR
#               database, though it's close :-)
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.3
#      CREATED: 2016-07-22 16:48:49
#     REVISION: 2017-03-14 21:11:33
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use utf8;

use Carp;
use Getopt::Long;
use Config::General;
use Text::CSV;
use SQL::Abstract;
use DBI;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.0.3';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";

my ( $dbh, $sth1, $h1, $rv );
my ( %eps_tags, %tags_tags, %diffs );

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#
# Load configuration data
#
my $conf = Config::General->new(
    -ConfigFile      => $configfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

Usage() if ( $options{'help'} );

#
# Collect options
#
my $verbose = ( defined( $options{verbose} )   ? $options{verbose}   : 0 );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 1 );

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or croak $DBI::errstr;

#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;

#-------------------------------------------------------------------------------
#  Collect and process the id numbers and tags from the 'eps' table
#-------------------------------------------------------------------------------
%eps_tags = %{ collect_eps_tags( $dbh, $verbose ) };

#-------------------------------------------------------------------------------
# Collect any tags we've already stashed in the database.
#-------------------------------------------------------------------------------
%tags_tags = %{ collect_db_tags( $dbh, $verbose ) };

#-------------------------------------------------------------------------------
# Now compare the two sources to look for differences
#-------------------------------------------------------------------------------
%diffs = %{ find_differences(\%eps_tags,\%tags_tags) };

#-------------------------------------------------------------------------------
# Perform the updates if there are any
#-------------------------------------------------------------------------------
if (%diffs) {
    print "Differences found\n\n";
    unless ($dry_run) {
        #
        # Scan for all deletions in the %diffs hash by traversing it by sorted
        # episode number. If deletions are found for an episode they are
        # performed.
        #
        foreach my $id ( sort { $a <=> $b } keys(%diffs) ) {
            if ( exists( $diffs{$id}->{deletions} ) ) {
                do_deletions( $dbh, $verbose, $id, $diffs{$id}->{deletions} );
            }
        }

        #
        # Prepare to search for tags
        #
        $sth1 = $dbh->prepare(q{SELECT * FROM tags2 WHERE tag = ?})
            or die $DBI::errstr;
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }

        #
        # Scan for all additions in the %diffs hash
        #
        foreach my $id ( sort { $a <=> $b } keys(%diffs) ) {
            if ( exists( $diffs{$id}->{additions} ) ) {
                do_additions( $dbh, $sth1, $verbose, $id,
                    $diffs{$id}->{additions} );
            }
        }

        #
        # Having deleted all the requested rows from the junction table remove
        # any tags that are "orphaned" as a consequence. If we were using
        # foreign keys we could let the database do this.
        #
        $sth1 = $dbh->prepare(
            q{DELETE FROM tags2
              WHERE id NOT IN (SELECT DISTINCT tags2_id FROM eps_tags2_xref)}
        ) or die $DBI::errstr;
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }

        $rv = $sth1->execute;
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }
        $rv = 0 if ( $rv eq '0E0' );

        #
        # Report the action
        #
        if ($rv) {
            print "Deleted ", $rv, " orphan tag", ( $rv != 1 ? 's' : '' ),
                "\n";
        }

    }
    else {
        print "No changes made - dry run\n";
    }
}
else {
    print "No differences found\n";
}

exit;

#===  FUNCTION  ================================================================
#         NAME: collect_eps_tags
#      PURPOSE: Collects the tags from the eps.tags field
#   PARAMETERS: $dbh            Database handle
#               $verbose        Verbosity level
#      RETURNS: A reference to the hash created by collecting all the tags
#  DESCRIPTION: FIXME
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub collect_eps_tags {
    my ( $dbh, $verbose ) = @_;

    my ( $status, @fields, %hash );
    my ( $sth,    $h );

    #
    # For parsing the field as CSV
    #
    my $csv = Text::CSV_XS->new(
        {   binary             => 1,
            auto_diag          => 1,
            allow_loose_quotes => 1
        }
    );

    #
    # Query the eps table for all the id and tags
    #
    $sth = $dbh->prepare(
        q{SELECT id,tags FROM eps
            WHERE length(tags) > 0
            ORDER BY id}
    ) or die $DBI::errstr;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    $sth->execute;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Loop through what we got
    #
    while ( $h = $sth->fetchrow_hashref ) {
       #
       # Parse the tag list
       #
        $status = $csv->parse( $h->{tags} );
        unless ($status) {
            #
            # Report any errors
            #
            print "Parse error on episode ", $h->{id}, "\n";
            print $csv->error_input(), "\n";
            next;
        }
        @fields = $csv->fields();

        next unless (@fields);

        #
        # Trim all tags (don't alter $_ when doing it)
        #
        @fields = map {
            my $t = $_;
            $t =~ s/(^\s+|\s+$)//g;
            $t;
        } @fields;

        #print "$h->{id}: ",join(",",@fields),"\n";

        #
        # Save the id and its tags, sorted for comparison, with empty elements
        # removed too
        #
        $hash{ $h->{id} } = [ sort grep {!/^$/} @fields ];

    }

    #print Dumper(\%hash),"\n";

    #
    # Dump all id numbers and tags if the verbose level is high enough
    #
    if ( $verbose >= 3 ) {
        print "\nTags collected from the 'eps' table\n\n";
        foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
            printf "%04d: %s\n", $id, join( ",", @{ $hash{$id} } );
        }
    }

    return \%hash;

}

#===  FUNCTION  ================================================================
#         NAME: collect_db_tags
#      PURPOSE: Collects the tags already stored in the database
#   PARAMETERS: $dbh            Database handle
#               $verbose        Verbosity level
#      RETURNS: A reference to the hash created by collecting all the tags
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub collect_db_tags {
    my ( $dbh, $verbose ) = @_;

    my %hash;
    my ( $sth, $h );

    #
    # Query the database for tag data
    #
    # We use the junction table (eps_tags2_xref), traversing it by episode number
    # and linking the table of tags (tags2). This results in a list of the tags
    # relating to an episode, which should be similar to (if not the same as) the
    # 'tags' field in the 'eps' table.
    #
    $sth = $dbh->prepare(
        q{SELECT et.eps_id AS id,t.tag,t.lctag
            FROM eps_tags2_xref et
            JOIN tags2 t ON et.tags2_id = t.id
            ORDER BY et.eps_id}
    ) or die $DBI::errstr;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    $sth->execute;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Loop through what we got, building an array of tags per episode number
    #
    while ( $h = $sth->fetchrow_hashref ) {
        if ( defined( $hash{ $h->{id} } ) ) {
            push( @{ $hash{ $h->{id} } }, $h->{tag} );
        }
        else {
            $hash{ $h->{id} } = [ $h->{tag} ];
        }
    }

    #
    # Sort all the tag arrays for comparison
    #
    foreach my $id ( keys(%hash) ) {
        $hash{$id} = [ sort @{ $hash{$id} } ];
    }

    #
    # Dump all id numbers and tags if the verbose level is high enough
    #
    if ( $verbose >= 3 ) {
        print "\nTags collected from the 'tags2' table\n\n";
        foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
            printf "%04d: %s\n", $id, join( ",", @{ $hash{$id} } );
        }
        print '=-' x 40,"\n";
    }

    return \%hash;

}

#===  FUNCTION  ================================================================
#         NAME: find_differences
#      PURPOSE: Find the differences between two hashes containing tags
#   PARAMETERS: $master         Reference to the master hash
#               $slave          Reference to the slave hash
#      RETURNS: A reference to the hash created checking for differences
#  DESCRIPTION: The function is presented with two hashes. The 'master' hash
#               has come from the CSV string in the 'eps' table. The 'slave'
#               hash has come from the table of tags 'tags2'. These hashes are
#               keyed by episode number and each element contains a reference
#               to a sorted array of tags.
#               This function compares two tag arrays for an episode using
#               function 'array_compare' and receives back a hash of additions
#               and deletions:
#               {
#                   additions => [ tag1, tag2 .. tagn ],
#                   deletions => [ tag1, tag2 .. tagn ],
#               }
#               These are stored in a result hash keyed by episode number, and
#               a reference to this hash is returned to the caller.
#               This function can report a lot of details about what has been
#               found if the level of verbosity is high enough.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub find_differences {
    my ($master,$slave) = @_;

    my %hash;

    foreach my $id ( sort { $a <=> $b } keys(%$master) ) {
        my %iddiffs = array_compare( $master->{$id}, $slave->{$id} );
        if (%iddiffs) {
            if ( $verbose >= 1 ) {
                #
                # Report what was found if asked to
                #
                print "Episode: $id\n";
                print "Update:\n\teps:  ", join( ",", @{ $master->{$id} } ), "\n";
                print "\ttags: ",
                (
                    defined( $slave->{$id} )
                    ? join( ",", @{ $slave->{$id} } )
                    : '--None--' ), "\n";
                print '-' x 80,"\n";
            }
            $hash{$id} = {%iddiffs};
        }
    }

    #
    # Report differences and actions if the verbose level is high enough
    #
    if ( $verbose >= 2 ) {
        print "\nDifferences and actions\n\n";
        foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
            print "Episode: $id\n";
            if ( exists( $hash{$id}->{deletions} ) ) {
                print "Deletions: ";
                print join( ",", @{ $hash{$id}->{deletions} } ), "\n";
            }
            if ( exists( $hash{$id}->{additions} ) ) {
                print "Additions: ";
                print join( ",", @{ $hash{$id}->{additions} } ), "\n";
            }
            print '-' x 80, "\n";
        }
    }

    return \%hash;
}

#===  FUNCTION  ================================================================
#         NAME: do_deletions
#      PURPOSE: Perform any deletions indicated in an array for a given
#               episode
#   PARAMETERS: $dbh            Database handle
#               $verbose        Verbosity level
#               $id             Episode number
#               $tags           Reference to an array of tags for this episode
#      RETURNS: Nothing
#  DESCRIPTION: A tag deletion consists of its removal from the joining table.
#               Only when there are no more references to the actual tag can
#               it then be deleted. If the tables were in a database with
#               foreign keys then we could leave the database itself to handle
#               this (MariaDB could do it but we'd need to redefine the tables
#               to use InnoDB rather than MyISAM. The latter is the legacy
#               table structure from the days when MySQL didn't have foreign
#               keys).
#               This function does not perform the tag deletion since this
#               easier to leave until all deletions have finished.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub do_deletions {
    my ( $dbh, $verbose, $id, $tags ) = @_;

    my ( $stmt, @bind, %data, %where );

    #
    # We will dynamically build SQL as we go
    #
    my $sql = SQL::Abstract->new;

    #
    # Process the list of tags we have been given
    #
    for my $i ( 0 .. $#$tags ) {
        #
        # Set up a deletion '... where eps_id = ? and
        # tags2 = (select id from tags2 where tag = ?)'
        #
        my ( $sub_stmt, @sub_bind )
            = ( "SELECT id FROM tags2 WHERE tag = ?", $tags->[$i] );

        %where = (
            eps_id      => $id,
            tags2_id => \[ "= ($sub_stmt)" => @sub_bind ]
        );

        ( $stmt, @bind ) = $sql->delete( 'eps_tags2_xref', \%where );
        if ( $verbose >= 2 ) {
            print "Statement: $stmt\n";
            print "Bind: ", join( ",", @bind ), "\n";
        }

        #
        # Do the deletion
        #
        my $sth = $dbh->prepare($stmt);
        my $rv = $sth->execute(@bind);
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }
        $rv = 0 if ( $rv eq '0E0' );

        #
        # Report the action
        #
        if ($rv) {
            print "Deleted tag for show $id ($tags->[$i])\n";
        }

    }

    print "Deleted ", scalar(@$tags), " row",
        ( scalar(@$tags) != 1 ? 's' : '' ), "\n";

}

#===  FUNCTION  ================================================================
#         NAME: do_additions
#      PURPOSE: Perform any additions indicated in an array for a given
#               episode
#   PARAMETERS: $dbh            Database handle
#               $sth            A prepared database handle with a query to
#                               search for the target tag
#               $verbose        Verbosity level
#               $id             Episode number
#               $tags           Reference to an array of tags for this episode
#      RETURNS: Nothing
#  DESCRIPTION: The addition of a tag for an episode consists of creating the
#               tag in the 'tags2' table (unless it already exists) and
#               making a joining table entry for it. This what this function
#               does.
#               FIXME: Not very resilient to failure.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub do_additions {
    my ( $dbh, $sth, $verbose, $id, $tags ) = @_;

    my ( $sth1, $rv, $h, $tid, $stmt, @bind, %data );

    #
    # We will dynamically build SQL as we go
    #
    my $sql = SQL::Abstract->new;

    my @lctags = map { lc($_) } @$tags;

    #
    # Loop through the array of tags (using an integer so we can index the
    # current tag)
    #
    for my $i ( 0 .. $#$tags ) {
        #
        # Look to see if this tag exists
        #
        $sth->execute( $tags->[$i] );
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }

        #
        # If it's already in the table just store the id otherwise
        # add a new entry
        #
        if ( $h = $sth->fetchrow_hashref ) {
            $tid = $h->{id};
        }
        else {
            #
            # Build the row we're going to add
            #
            %data = (
                tag   => $tags->[$i],
                lctag => $lctags[$i]
            );

            #
            # Build the SQL, reporting the result if asked
            #
            ( $stmt, @bind ) = $sql->insert( 'tags2', \%data );
            if ( $verbose >= 2 ) {
                print "Statement: $stmt\n";
                print "Bind: ", join( ",", @bind ), "\n";
            }

            #
            # Add the tag to 'tags2'
            #
            $sth1 = $dbh->prepare($stmt);
            $rv   = $sth1->execute(@bind);
            if ( $dbh->err ) {
                warn $dbh->errstr;
            }
            $rv = 0 if ( $rv eq '0E0' );

            #
            # Ask the database for the id we just added
            # FIXME: what if it failed?
            #
            $tid = $sth1->{mysql_insertid};

            #
            # Report the action
            #
            if ($rv) {
                print "Added new tag '$tags->[$i]' ($tid)\n";
            }
        }

        #
        # Now we know we have a tag in the tags2 table so now we can create
        # the eps_tags2_xref entry
        #
        %data = (
            eps_id   => $id,
            tags2_id => $tid
        );

        #
        # Build the SQL, reporting the result if asked
        #
        ( $stmt, @bind ) = $sql->insert( 'eps_tags2_xref', \%data );
        if ( $verbose >= 2 ) {
            print "Statement: $stmt\n";
            print "Bind: ", join( ",", @bind ), "\n";
        }

        #
        # Add the row
        #
        $sth1 = $dbh->prepare($stmt);
        $rv   = $sth1->execute(@bind);
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }
        $rv = 0 if ( $rv eq '0E0' );

        #
        # Report the action
        #
        if ($rv) {
            printf "Added new junction row (eps_id=%s,tags2_id=%s -> %s)\n",
                $id, $tid, $tags->[$i];
        }

    }

    print "Added ", scalar(@$tags), " row",
        ( scalar(@$tags) != 1 ? 's' : '' ), "\n";

}

#===  FUNCTION  ================================================================
#         NAME: array_compare
#      PURPOSE: Compares the elements of two arrays to see if an element
#               present in the master is also present in the slave
#   PARAMETERS: $arr1   A reference to the first array; the MASTER
#               $arr2   A reference to the second array; the SLAVE
#      RETURNS: A hash containing arrays of additions and deletions of the
#               elements that are different. The structure is:
#               {
#                   additions => [ tag1, tag2 .. tagn ],
#                   deletions => [ tag1, tag2 .. tagn ],
#               }
#               The returned hash will be empty if there are no differences.
#  DESCRIPTION: The requirement is to find if there are differences, then to
#               find what they are so that other code can make the slave array
#               match the master. The two arrays come from a database, so
#               we're trying to make a second source (slave) equal the first
#               (master).
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub array_compare {
    my ( $arr1, $arr2 ) = @_;

    my %res;
    my ( @additions, @deletions );

    #
    # Use hashes to make it easier to find existence of stuff
    #
    my %h1 = map { lc($_) => 1 } @$arr1;
    my %h2 = map { lc($_) => 1 } @$arr2;

    #
    # Find additions
    #
    for my $key ( keys(%h1) ) {
        unless ( exists( $h2{$key} ) ) {
            push( @additions, $key );
        }
    }

    #
    # Find deletions
    #
    for my $key ( keys(%h2) ) {
        unless ( exists( $h1{$key} ) ) {
            push( @deletions, $key );
        }
    }

    $res{additions} = [@additions] if @additions;
    $res{deletions} = [@deletions] if @deletions;

    return %res;
}

#===  FUNCTION  ================================================================
#         NAME: Usage
#      PURPOSE: Display a usage message and exit
#   PARAMETERS: None
#      RETURNS: To command line level with exit value 1
#  DESCRIPTION: Builds the usage message using global values
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Usage {
    print STDERR <<EOD;
Usage: $PROG [options] project

$PROG v$VERSION

    -help               Display this information
    -[no]dry-run        Display what would have been done but make no changes.
                        Default is -dry-run.
    -verbose            A repeatable option which turns up the verbosity from
                        0 (silent) to 3 (lots and lots of stuff). Default is 0.

EOD
    exit(1);
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION:
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = ( "help", "verbose+", "dry-run!", );

    if ( !GetOptions( $optref, @options ) ) {
        Usage();
    }

    return;
}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker

