hpr-tools/Database/refresh_tags

#!/usr/bin/env perl
#===============================================================================
#
#         FILE: refresh_tags
#
#        USAGE: ./refresh_tags
#
#  DESCRIPTION: Parse tags from the eps.tags field and use them to populate
#               the tags table. The eps tag list is definitive (though it's
#               quite limited since it's only 200 characters long), and so the
#               tags table is kept in step by adding and deleting.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.3
#      CREATED: 2016-07-17 15:59:24
#     REVISION: 2017-01-30 17:13:28
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use utf8;

use Carp;
use Getopt::Long;
use Config::General;
use Text::CSV;
use SQL::Abstract;
use DBI;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.0.3';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";

my ( $dbh, $sth1, $h1 );
my ( $status, @fields );
my ( %eps_tags, %tags_tags, %diffs );

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#
# Load configuration data
#
my $conf = Config::General->new(
    -ConfigFile      => $configfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1,
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

Usage() if ( $options{'help'} );

#
# Collect options
#
my $verbose = ( defined( $options{verbose} )   ? $options{verbose}   : 0 );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 1 );

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or croak $DBI::errstr;

#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;

my $csv = Text::CSV_XS->new;

#-------------------------------------------------------------------------------
#  Collect and process the id numbers and tags from the 'eps' table
#-------------------------------------------------------------------------------
%eps_tags = %{ collect_eps_tags( $dbh, $verbose ) };

#-------------------------------------------------------------------------------
# Collect any tags we've already stashed in the database
#-------------------------------------------------------------------------------
%tags_tags = %{ collect_db_tags( $dbh, $verbose ) };

#-------------------------------------------------------------------------------
# Now compare the two sources to look for differences
#-------------------------------------------------------------------------------
%diffs = %{ find_differences(\%eps_tags,\%tags_tags) };

#-------------------------------------------------------------------------------
# Perform the updates if there are any
#-------------------------------------------------------------------------------
if (%diffs) {
    print "Differences found\n";
    unless ($dry_run) {
        #
        # Loop through all of the actions by episode number
        #
        foreach my $id ( sort { $a <=> $b } keys(%diffs) ) {

            #
            # Do deletions before additions
            #
            if ( exists( $diffs{$id}->{deletions} ) ) {
                do_deletions( $dbh, $verbose, $id, $diffs{$id}->{deletions} );
            }

            #
            # Do additions after deletions
            #
            if ( exists( $diffs{$id}->{additions} ) ) {
                do_additions( $dbh, $sth1, $verbose, $id,
                    $diffs{$id}->{additions} );
            }

        }
    }
    else {
        print "No changes made - dry run\n";
    }
}
else {
    print "No differences found\n";
}

exit;

#===  FUNCTION  ================================================================
#         NAME: collect_eps_tags
#      PURPOSE: Collects the tags from the eps.tags field
#   PARAMETERS: $dbh            Database handle
#               $verbose        Verbosity level
#      RETURNS: A reference to the hash created by collecting all the tags
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub collect_eps_tags {
    my ( $dbh, $verbose ) = @_;

    my ( $status, @fields, %hash );
    my ( $sth,    $h );

    #
    # For parsing the field as CSV
    # NOTE: Unexplained error in [E. E. "Doc" Smith] (show 2462). Works with
    # double replaced by single quote, but doesn't work if quotes escaped (by
    # doubling) whether all tags are quoted or not. With 'auto_diag' enabled
    # get the error:
    # CSV_XS ERROR: 2034 - EIF - Loose unescaped quote @ rec 1632 pos 40 field 3
    #
    # NOTE: Adding 'allow_loose_quotes' avoids the issue
    #
    my $csv = Text::CSV_XS->new(
        { binary => 1, auto_diag => 1, allow_loose_quotes => 1 } );

    #
    # Query the eps table for all the id and tags
    #
    $sth = $dbh->prepare(
        q{SELECT id,tags FROM eps
            WHERE length(tags) > 0
            ORDER BY id}
    ) or die $DBI::errstr;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    $sth->execute;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Loop through what we got
    #
    while ( $h = $sth->fetchrow_hashref ) {
       #
       # Parse the tag list
       #
        $status = $csv->parse( $h->{tags} );
        unless ($status) {
            #
            # Report any errors
            #
            print "Parse error on episode ", $h->{id}, "\n";
            print $csv->error_input(), "\n";
            next;
        }
        @fields = $csv->fields();

        next unless (@fields);

        #
        # Trim all tags (don't alter $_ when doing it)
        #
        @fields = map {
            my $t = $_;
            $t =~ s/(^\s+|\s+$)//g;
            $t;
        } @fields;

        #print "$h->{id}: ",join(",",@fields),"\n";

        #
        # Save the id and its tags, sorted for comparison, with empty elements
        # removed too
        #
        $hash{ $h->{id} } = [ sort grep {!/^$/} @fields ];

    }

    #print Dumper(\%hash),"\n";

    #
    # Dump all id numbers and tags if the verbose level is high enough
    #
    if ( $verbose >= 2 ) {
        print "\nTags collected from the 'eps' table\n\n";
        foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
            printf "%04d: %s\n", $id, join( ",", @{ $hash{$id} } );
        }
    }

    return \%hash;

}

#===  FUNCTION  ================================================================
#         NAME: collect_db_tags
#      PURPOSE: Collects the tags already stored in the database
#   PARAMETERS: $dbh            Database handle
#               $verbose        Verbosity level
#      RETURNS: A reference to the hash created by collecting all the tags
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub collect_db_tags {
    my ( $dbh, $verbose ) = @_;

    my %hash;
    my ( $sth, $h );

    #
    # Query the database for tag data
    #

    $sth = $dbh->prepare(q{SELECT * FROM tags ORDER BY id})
        or die $DBI::errstr;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    $sth->execute;
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Loop through what we got building an array of tags per episode number
    #
    while ( $h = $sth->fetchrow_hashref ) {
        if ( defined( $hash{ $h->{id} } ) ) {
            push( @{ $hash{ $h->{id} } }, $h->{tag} );
        }
        else {
            $hash{ $h->{id} } = [ $h->{tag} ];
        }
    }

    #
    # Sort all the tag arrays for comparison
    #
    foreach my $id ( keys(%hash) ) {
        $hash{$id} = [ sort @{ $hash{$id} } ];
    }

    #
    # Dump all id numbers and tags if the verbose level is high enough
    #
    if ( $verbose >= 2 ) {
        print "\nTags collected from the 'tags' table\n\n";
        foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
            printf "%04d: %s\n", $id, join( ",", @{ $hash{$id} } );
        }
        print '=-' x 40,"\n";
    }

    return \%hash;

}

#===  FUNCTION  ================================================================
#         NAME: find_differences
#      PURPOSE: Find the differences between two hashes containing tags
#   PARAMETERS: $master         Reference to the master hash
#               $slave          Reference to the slave hash
#      RETURNS: A reference to the hash created checking for differences
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub find_differences {
    my ($master,$slave) = @_;

    my %hash;

    foreach my $id ( sort { $a <=> $b } keys(%$master) ) {
        my %iddiffs = array_compare( $master->{$id}, $slave->{$id} );
        if (%iddiffs) {
            if ( $verbose >= 1 ) {
                #
                # Report what was found if asked to
                #
                print "Episode: $id\n";
                print "Update:\n\teps:  ", join( ",", @{ $master->{$id} } ), "\n";
                print "\ttags: ",
                (
                    defined( $slave->{$id} )
                    ? join( ",", @{ $slave->{$id} } )
                    : '--None--' ), "\n";
                print '-' x 80,"\n";
            }
            $hash{$id} = {%iddiffs};
        }
    }

    #
    # Report differences and actions if the verbose level is high enough
    #
    if ( $verbose >= 2 ) {
        print "\nDifferences and actions\n\n";
        foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
            print "Episode: $id\n";
            if ( exists( $hash{$id}->{deletions} ) ) {
                print "Deletions: ";
                print join( ",", @{ $hash{$id}->{deletions} } ), "\n";
            }
            if ( exists( $hash{$id}->{additions} ) ) {
                print "Additions: ";
                print join( ",", @{ $hash{$id}->{additions} } ), "\n";
            }
            print '-' x 80, "\n";
        }
    }

    return \%hash;
}

#===  FUNCTION  ================================================================
#         NAME: do_deletions
#      PURPOSE: Perform any deletions indicated in an array for a given
#               episode
#   PARAMETERS: $dbh            Database handle
#               $verbose        Verbosity level
#               $id             Episode number
#               $tags           Reference to an array of tags for this episode
#      RETURNS: Nothing
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub do_deletions {
    my ( $dbh, $verbose, $id, $tags ) = @_;

    my ( $stmt, @bind, %data, %where );

    #
    # We will dynamically build SQL as we go
    #
    my $sql = SQL::Abstract->new;

    #
    # Process the list of tags we have been given
    #
    for my $i ( 0 .. $#$tags ) {
        #
        # Set up a deletion '... where id = ? and tag = ?'
        #
        %where = ( id => $id, tag => $tags->[$i] );

        ( $stmt, @bind ) = $sql->delete( 'tags', \%where );

        my $sth = $dbh->prepare($stmt);
        my $rv  = $sth->execute(@bind);
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }
        $rv = 0 if ( $rv eq '0E0' );

        #
        # Report the action
        #
        if ($rv) {
            print "Deleted tag for show $id ($tags->[$i])\n";
        }

    }

    print "Deleted ", scalar(@$tags), " row",
        ( scalar(@$tags) != 1 ? 's' : '' ), "\n";

}

#===  FUNCTION  ================================================================
#         NAME: do_additions
#      PURPOSE: Perform any additions indicated in an array for a given
#               episode
#   PARAMETERS: $dbh            Database handle
#               $sth            A prepared database handle with a query to
#                               search for the target tag
#               $verbose        Verbosity level
#               $id             Episode number
#               $tags           Reference to an array of tags for this episode
#      RETURNS: Nothing
#  DESCRIPTION: FIXME
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub do_additions {
    my ( $dbh, $sth, $verbose, $id, $tags ) = @_;

    my ( $sth1, $rv, $h, $tid, $stmt, @bind, %data );

    #
    # We will dynamically build SQL as we go
    #
    my $sql = SQL::Abstract->new;

    my @lctags = map { lc($_) } @$tags;

    for my $i ( 0 .. $#$tags ) {
        #
        # Build the row we're going to add
        #
        %data = (
            id    => $id,
            tag   => $tags->[$i],
            lctag => $lctags[$i]
        );

        ( $stmt, @bind ) = $sql->insert( 'tags', \%data );

        my $sth = $dbh->prepare($stmt);
        my $rv  = $sth->execute(@bind);
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }
        $rv = 0 if ( $rv eq '0E0' );

        #
        # Report the action
        #
        if ($rv) {
            print "Added tag for show $id ($tags->[$i])\n";
        }
    }

    print "Added ", scalar(@$tags), " row",
        ( scalar(@$tags) != 1 ? 's' : '' ), "\n";

}

#===  FUNCTION  ================================================================
#         NAME: array_compare
#      PURPOSE: Compares the elements of two arrays to see if an element
#               present in the master is also present in the slave
#   PARAMETERS: $arr1   A reference to the first array; the MASTER
#               $arr2   A reference to the second array; the SLAVE
#      RETURNS: A hash containing arrays of additions and deletions of the
#               elements that are different. The structure is:
#               {
#                   additions => [ tag1, tag2 .. tagn ],
#                   deletions => [ tag1, tag2 .. tagn ],
#               }
#               The returned hash will be empty if there are no differences.
#  DESCRIPTION: The requirement is to find if there are differences, then to
#               find what they are so that other code can make the slave array
#               match the master. The two arrays come from a database, so
#               we're trying to make a second source (slave) equal the first
#               (master).
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub array_compare {
    my ( $arr1, $arr2 ) = @_;

    my %res;
    my ( @additions, @deletions );
    my %h1 = map { $_ => 1 } @$arr1;
    my %h2 = map { $_ => 1 } @$arr2;

    #
    # Find additions
    #
    for my $key ( keys(%h1) ) {
        unless ( exists( $h2{$key} ) ) {
            push( @additions, $key );
        }
    }

    #
    # Find deletions
    #
    for my $key ( keys(%h2) ) {
        unless ( exists( $h1{$key} ) ) {
            push( @deletions, $key );
        }
    }

    $res{additions} = [@additions] if @additions;
    $res{deletions} = [@deletions] if @deletions;

    return %res;
}

#===  FUNCTION  ================================================================
#         NAME: Usage
#      PURPOSE: Display a usage message and exit
#   PARAMETERS: None
#      RETURNS: To command line level with exit value 1
#  DESCRIPTION: Builds the usage message using global values
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Usage {
    print STDERR <<EOD;
Usage: $PROG [options] project

$PROG v$VERSION

    -help               Display this information
    -[no]dry-run        Display what would have been done but make no changes.
                        Default is -dry-run.
    -verbose            A repeatable option which turns up the verbosity from
                        0 (silent) to 2 (lots of stuff). Default is 0.

EOD
    exit(1);
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION:
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = ( "help", "verbose+", "dry-run!", );

    if ( !GetOptions( $optref, @options ) ) {
        Usage();
    }

    return;
}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker