hpr-tools/Database/refresh_tags

614 lines
18 KiB
Perl
Executable File

#!/usr/bin/env perl
#===============================================================================
#
# FILE: refresh_tags
#
# USAGE: ./refresh_tags
#
# DESCRIPTION: Parse tags from the eps.tags field and use them to populate
# the tags table. The eps tag list is definitive (though it's
# quite limited since it's only 200 characters long), and so the
# tags table is kept in step by adding and deleting.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.3
# CREATED: 2016-07-17 15:59:24
# REVISION: 2017-01-30 17:13:28
#
#===============================================================================
use 5.010;
use strict;
use warnings;
use utf8;
use Carp;
use Getopt::Long;
use Config::General;
use Text::CSV;
use SQL::Abstract;
use DBI;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.3';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my ( $dbh, $sth1, $h1 );
my ( $status, @fields );
my ( %eps_tags, %tags_tags, %diffs );
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#
# Load configuration data
#
my $conf = Config::General->new(
-ConfigFile => $configfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
);
my %config = $conf->getall();
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );
Usage() if ( $options{'help'} );
#
# Collect options
#
my $verbose = ( defined( $options{verbose} ) ? $options{verbose} : 0 );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 1 );
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
$dbuser, $dbpwd, { AutoCommit => 1 } )
or croak $DBI::errstr;
#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;
my $csv = Text::CSV_XS->new;
#-------------------------------------------------------------------------------
# Collect and process the id numbers and tags from the 'eps' table
#-------------------------------------------------------------------------------
%eps_tags = %{ collect_eps_tags( $dbh, $verbose ) };
#-------------------------------------------------------------------------------
# Collect any tags we've already stashed in the database
#-------------------------------------------------------------------------------
%tags_tags = %{ collect_db_tags( $dbh, $verbose ) };
#-------------------------------------------------------------------------------
# Now compare the two sources to look for differences
#-------------------------------------------------------------------------------
%diffs = %{ find_differences(\%eps_tags,\%tags_tags) };
#-------------------------------------------------------------------------------
# Perform the updates if there are any
#-------------------------------------------------------------------------------
if (%diffs) {
print "Differences found\n";
unless ($dry_run) {
#
# Loop through all of the actions by episode number
#
foreach my $id ( sort { $a <=> $b } keys(%diffs) ) {
#
# Do deletions before additions
#
if ( exists( $diffs{$id}->{deletions} ) ) {
do_deletions( $dbh, $verbose, $id, $diffs{$id}->{deletions} );
}
#
# Do additions after deletions
#
if ( exists( $diffs{$id}->{additions} ) ) {
do_additions( $dbh, $sth1, $verbose, $id,
$diffs{$id}->{additions} );
}
}
}
else {
print "No changes made - dry run\n";
}
}
else {
print "No differences found\n";
}
exit;
#=== FUNCTION ================================================================
# NAME: collect_eps_tags
# PURPOSE: Collects the tags from the eps.tags field
# PARAMETERS: $dbh Database handle
# $verbose Verbosity level
# RETURNS: A reference to the hash created by collecting all the tags
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub collect_eps_tags {
my ( $dbh, $verbose ) = @_;
my ( $status, @fields, %hash );
my ( $sth, $h );
#
# For parsing the field as CSV
# NOTE: Unexplained error in [E. E. "Doc" Smith] (show 2462). Works with
# double replaced by single quote, but doesn't work if quotes escaped (by
# doubling) whether all tags are quoted or not. With 'auto_diag' enabled
# get the error:
# CSV_XS ERROR: 2034 - EIF - Loose unescaped quote @ rec 1632 pos 40 field 3
#
# NOTE: Adding 'allow_loose_quotes' avoids the issue
#
my $csv = Text::CSV_XS->new(
{ binary => 1, auto_diag => 1, allow_loose_quotes => 1 } );
#
# Query the eps table for all the id and tags
#
$sth = $dbh->prepare(
q{SELECT id,tags FROM eps
WHERE length(tags) > 0
ORDER BY id}
) or die $DBI::errstr;
if ( $dbh->err ) {
warn $dbh->errstr;
}
$sth->execute;
if ( $dbh->err ) {
warn $dbh->errstr;
}
#
# Loop through what we got
#
while ( $h = $sth->fetchrow_hashref ) {
#
# Parse the tag list
#
$status = $csv->parse( $h->{tags} );
unless ($status) {
#
# Report any errors
#
print "Parse error on episode ", $h->{id}, "\n";
print $csv->error_input(), "\n";
next;
}
@fields = $csv->fields();
next unless (@fields);
#
# Trim all tags (don't alter $_ when doing it)
#
@fields = map {
my $t = $_;
$t =~ s/(^\s+|\s+$)//g;
$t;
} @fields;
#print "$h->{id}: ",join(",",@fields),"\n";
#
# Save the id and its tags, sorted for comparison, with empty elements
# removed too
#
$hash{ $h->{id} } = [ sort grep {!/^$/} @fields ];
}
#print Dumper(\%hash),"\n";
#
# Dump all id numbers and tags if the verbose level is high enough
#
if ( $verbose >= 2 ) {
print "\nTags collected from the 'eps' table\n\n";
foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
printf "%04d: %s\n", $id, join( ",", @{ $hash{$id} } );
}
}
return \%hash;
}
#=== FUNCTION ================================================================
# NAME: collect_db_tags
# PURPOSE: Collects the tags already stored in the database
# PARAMETERS: $dbh Database handle
# $verbose Verbosity level
# RETURNS: A reference to the hash created by collecting all the tags
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub collect_db_tags {
my ( $dbh, $verbose ) = @_;
my %hash;
my ( $sth, $h );
#
# Query the database for tag data
#
$sth = $dbh->prepare(q{SELECT * FROM tags ORDER BY id})
or die $DBI::errstr;
if ( $dbh->err ) {
warn $dbh->errstr;
}
$sth->execute;
if ( $dbh->err ) {
warn $dbh->errstr;
}
#
# Loop through what we got building an array of tags per episode number
#
while ( $h = $sth->fetchrow_hashref ) {
if ( defined( $hash{ $h->{id} } ) ) {
push( @{ $hash{ $h->{id} } }, $h->{tag} );
}
else {
$hash{ $h->{id} } = [ $h->{tag} ];
}
}
#
# Sort all the tag arrays for comparison
#
foreach my $id ( keys(%hash) ) {
$hash{$id} = [ sort @{ $hash{$id} } ];
}
#
# Dump all id numbers and tags if the verbose level is high enough
#
if ( $verbose >= 2 ) {
print "\nTags collected from the 'tags' table\n\n";
foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
printf "%04d: %s\n", $id, join( ",", @{ $hash{$id} } );
}
print '=-' x 40,"\n";
}
return \%hash;
}
#=== FUNCTION ================================================================
# NAME: find_differences
# PURPOSE: Find the differences between two hashes containing tags
# PARAMETERS: $master Reference to the master hash
# $slave Reference to the slave hash
# RETURNS: A reference to the hash created checking for differences
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub find_differences {
my ($master,$slave) = @_;
my %hash;
foreach my $id ( sort { $a <=> $b } keys(%$master) ) {
my %iddiffs = array_compare( $master->{$id}, $slave->{$id} );
if (%iddiffs) {
if ( $verbose >= 1 ) {
#
# Report what was found if asked to
#
print "Episode: $id\n";
print "Update:\n\teps: ", join( ",", @{ $master->{$id} } ), "\n";
print "\ttags: ",
(
defined( $slave->{$id} )
? join( ",", @{ $slave->{$id} } )
: '--None--' ), "\n";
print '-' x 80,"\n";
}
$hash{$id} = {%iddiffs};
}
}
#
# Report differences and actions if the verbose level is high enough
#
if ( $verbose >= 2 ) {
print "\nDifferences and actions\n\n";
foreach my $id ( sort { $a <=> $b } keys(%hash) ) {
print "Episode: $id\n";
if ( exists( $hash{$id}->{deletions} ) ) {
print "Deletions: ";
print join( ",", @{ $hash{$id}->{deletions} } ), "\n";
}
if ( exists( $hash{$id}->{additions} ) ) {
print "Additions: ";
print join( ",", @{ $hash{$id}->{additions} } ), "\n";
}
print '-' x 80, "\n";
}
}
return \%hash;
}
#=== FUNCTION ================================================================
# NAME: do_deletions
# PURPOSE: Perform any deletions indicated in an array for a given
# episode
# PARAMETERS: $dbh Database handle
# $verbose Verbosity level
# $id Episode number
# $tags Reference to an array of tags for this episode
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub do_deletions {
my ( $dbh, $verbose, $id, $tags ) = @_;
my ( $stmt, @bind, %data, %where );
#
# We will dynamically build SQL as we go
#
my $sql = SQL::Abstract->new;
#
# Process the list of tags we have been given
#
for my $i ( 0 .. $#$tags ) {
#
# Set up a deletion '... where id = ? and tag = ?'
#
%where = ( id => $id, tag => $tags->[$i] );
( $stmt, @bind ) = $sql->delete( 'tags', \%where );
my $sth = $dbh->prepare($stmt);
my $rv = $sth->execute(@bind);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$rv = 0 if ( $rv eq '0E0' );
#
# Report the action
#
if ($rv) {
print "Deleted tag for show $id ($tags->[$i])\n";
}
}
print "Deleted ", scalar(@$tags), " row",
( scalar(@$tags) != 1 ? 's' : '' ), "\n";
}
#=== FUNCTION ================================================================
# NAME: do_additions
# PURPOSE: Perform any additions indicated in an array for a given
# episode
# PARAMETERS: $dbh Database handle
# $sth A prepared database handle with a query to
# search for the target tag
# $verbose Verbosity level
# $id Episode number
# $tags Reference to an array of tags for this episode
# RETURNS: Nothing
# DESCRIPTION: FIXME
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub do_additions {
my ( $dbh, $sth, $verbose, $id, $tags ) = @_;
my ( $sth1, $rv, $h, $tid, $stmt, @bind, %data );
#
# We will dynamically build SQL as we go
#
my $sql = SQL::Abstract->new;
my @lctags = map { lc($_) } @$tags;
for my $i ( 0 .. $#$tags ) {
#
# Build the row we're going to add
#
%data = (
id => $id,
tag => $tags->[$i],
lctag => $lctags[$i]
);
( $stmt, @bind ) = $sql->insert( 'tags', \%data );
my $sth = $dbh->prepare($stmt);
my $rv = $sth->execute(@bind);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$rv = 0 if ( $rv eq '0E0' );
#
# Report the action
#
if ($rv) {
print "Added tag for show $id ($tags->[$i])\n";
}
}
print "Added ", scalar(@$tags), " row",
( scalar(@$tags) != 1 ? 's' : '' ), "\n";
}
#=== FUNCTION ================================================================
# NAME: array_compare
# PURPOSE: Compares the elements of two arrays to see if an element
# present in the master is also present in the slave
# PARAMETERS: $arr1 A reference to the first array; the MASTER
# $arr2 A reference to the second array; the SLAVE
# RETURNS: A hash containing arrays of additions and deletions of the
# elements that are different. The structure is:
# {
# additions => [ tag1, tag2 .. tagn ],
# deletions => [ tag1, tag2 .. tagn ],
# }
# The returned hash will be empty if there are no differences.
# DESCRIPTION: The requirement is to find if there are differences, then to
# find what they are so that other code can make the slave array
# match the master. The two arrays come from a database, so
# we're trying to make a second source (slave) equal the first
# (master).
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub array_compare {
my ( $arr1, $arr2 ) = @_;
my %res;
my ( @additions, @deletions );
my %h1 = map { $_ => 1 } @$arr1;
my %h2 = map { $_ => 1 } @$arr2;
#
# Find additions
#
for my $key ( keys(%h1) ) {
unless ( exists( $h2{$key} ) ) {
push( @additions, $key );
}
}
#
# Find deletions
#
for my $key ( keys(%h2) ) {
unless ( exists( $h1{$key} ) ) {
push( @deletions, $key );
}
}
$res{additions} = [@additions] if @additions;
$res{deletions} = [@deletions] if @deletions;
return %res;
}
#=== FUNCTION ================================================================
# NAME: Usage
# PURPOSE: Display a usage message and exit
# PARAMETERS: None
# RETURNS: To command line level with exit value 1
# DESCRIPTION: Builds the usage message using global values
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Usage {
print STDERR <<EOD;
Usage: $PROG [options] project
$PROG v$VERSION
-help Display this information
-[no]dry-run Display what would have been done but make no changes.
Default is -dry-run.
-verbose A repeatable option which turns up the verbosity from
0 (silent) to 2 (lots of stuff). Default is 0.
EOD
exit(1);
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = ( "help", "verbose+", "dry-run!", );
if ( !GetOptions( $optref, @options ) ) {
Usage();
}
return;
}
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker