Database/find_series

#!/usr/bin/env perl
#===============================================================================
#
#         FILE: find_series
#
#        USAGE: ./find_series
#
#  DESCRIPTION: Gathers information from the HPR database to assist with the
#               process of placing episodes into series groups
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.5
#      CREATED: 2014-04-25 17:11:21
#     REVISION: 2016-06-15 15:51:55
#
#===============================================================================

use v5.16;
use strict;
use warnings;
use utf8;

use Config::General;
use List::MoreUtils qw(uniq);
use YAML::XS qw{LoadFile};

use DBI;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.0.5';

#
# Various constants
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my $ignorefile = "$basedir/.$PROG.yml";

my $file_template = "${PROG}_%d.out";

my ( $dbh,    $sth1,    $h1 );
my ( @ignore, $title,   @words, $pair );
my ( %eps,    %tags,    @taglist, %single_words, %double_words );
my ( $phase,  $outfile, $outfh );

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#
# Load YAML ignore list
#
my $yaml = LoadFile($ignorefile);
@ignore = uniq( @{ $yaml->{ignore} } );

#
# Load database configuration data
#
my $conf = Config::General->new(
    -ConfigFile      => $configfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or die $DBI::errstr;

#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;

#
# Prepare to collect episode titles with the series it's related to
#
$sth1 = $dbh->prepare(
    q{SELECT
            eps.id AS eps_id,
            eps.date,
            eps.title,
            eps.duration,
            eps.summary,
            eps.notes,
            eps.hostid,
            eps.series,
            eps.explicit,
            eps.license,
            eps.tags,
            eps.version,
            eps.downloads,
            eps.valid AS eps_valid,
            ms.id AS ms_id,
            ms.name,
            ms.description,
            ms.private,
            ms.image,
            ms.valid AS ms_valid
        FROM eps
        JOIN miniseries ms ON eps.series = ms.id
        WHERE eps.valid = 1}
);
$sth1->execute;
if ( $dbh->err ) {
    warn $dbh->errstr;
}

#
# Generate an output file for phase 1
#
$phase = 1;
newOutput( \$outfh, $file_template, $phase++ );
print $outfh "Show  (Series) Title\n";

#-------------------------------------------------------------------------------
# Walk through the episode/series list from the database and build structures
# from them.
#-------------------------------------------------------------------------------
# The structures built are:
#
# %eps - a hash indexed by the episode number with an arrayref as the
# value. The arrayref starts with the episode title and is followed by the
# important words from the title in lower case. The term "important" means
# that the word must consist of the allowed characters and not be in the
# 'ignore' list.
#
# %tags - a hash indexed by the episode number with a CSV string of tags
# associated with the episode as the value.
#
# %single_words - a hash indexed by one of the words collected from the title
# (see %eps above). The value of each element is complex. It consists of an
# arrayref, the first element of which is a count of the succeeding elements.
# The next N elements are arrayrefs each of which contains two elements: an
# episode number and an episode title. Here is an example in Data::Dumper
# format:
#
# 'pre-ibm' => [
#                2,
#                [
#                  687,
#                  'pre-IBM PC computer history 1'
#                ],
#                [
#                  691,
#                  'pre-IBM PC computer history 2'
#                ]
#              ],
#
#-------------------------------------------------------------------------------
while ( $h1 = $sth1->fetchrow_hashref ) {
    #
    # Report what we found
    #
    printf $outfh "%4d: (%-2d)     %s\n", $h1->{eps_id}, $h1->{ms_id},
        $h1->{title};

    #
    # Skip this episode if it already has a series
    #
    next if $h1->{series} > 0;

    #
    # Save this episode for later
    #
    $eps{ $h1->{eps_id} } = [ $h1->{title} ];

    #
    # Save tag details
    #
    $tags{ $h1->{eps_id} } = $h1->{tags};

    #
    # Strip any trailing full stop, and chop the title into words
    #
    ( $title = $h1->{title} ) =~ s/\.$//;
    @words = split( /\s+/, $title );

    #
    # Remove all unwanted characters and force to lowercase (use 'fc' for case
    # folding since it's aware of character sets)
    #
    foreach my $word (@words) {
        $word =~ s{[^a-zA-Z./_-]}{}g;
        $word = fc($word);
    }

    #
    # Clean up the word list after removing unwanted characters
    #
    @words = grep { $_ !~ /^-*$/ } @words;

    #
    # Extract pairs of words before they're made unique and make a hash
    # pointing to the episodes they originated from
    #
    for (my $i = 0; $i < $#words; $i++) {
        $pair = "$words[$i] $words[$i+1]";

        if ( exists( $double_words{$pair} ) ) {
            $double_words{$pair}->[0] += 1;
            push( @{ $double_words{$pair} },
                [ $h1->{eps_id}, $h1->{title} ] );
        }
        else {
            $double_words{$pair} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
        }
    }

    #
    # Make the word list unique
    #
    @words = uniq(@words);

    #
    # Walk the tidied single word list
    #
    foreach my $word (@words) {
        #
        # Ignore very short words and words in the ignore list
        #
        next if length($word) < 2;
        next if grep( /^$word$/, @ignore );

        #
        # Save this word in the episodes hash
        #
        push( @{ $eps{ $h1->{eps_id} } }, $word );

        #
        # If the word is not known initialise the entry containing an arrayref
        # with a counter and another arrayref with the saved episode number
        # and title. If it's known, increment the counter and stash the
        # episode details as another arrayref.
        #
        if ( exists( $single_words{$word} ) ) {
            $single_words{$word}->[0] += 1;
            push( @{ $single_words{$word} },
                [ $h1->{eps_id}, $h1->{title} ] );
        }
        else {
            $single_words{$word} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
        }
    }
}

#
# We've finished with the database
#
$dbh->disconnect;

#-------------------------------------------------------------------------------
# Done the first pass, prepare for the next
#-------------------------------------------------------------------------------
newOutput( \$outfh, $file_template, $phase++ );

#
# Process the saved data in increasing order of the frequency. Print the word
# and its frequency and follow that by the stashed episode details in the
# order we saw them
#
foreach my $key (
    sort { $single_words{$a}->[0] <=> $single_words{$b}->[0] }
    sort( keys(%single_words) )
    )
{
    if ( $single_words{$key}->[0] > 3 ) {
        printf $outfh "%15s: %s\n", $key, $single_words{$key}->[0];
        for ( my $i = 1; $i <= $single_words{$key}->[0]; $i++ ) {
            printf $outfh "%17s%4d: %s", ' ', @{ $single_words{$key}->[$i] };
            @taglist
                = split( /\s*,\s*/, $tags{ $single_words{$key}->[$i]->[0] } );
            if (@taglist) {
                print $outfh " [", join( ",", @taglist ), "]\n";
            }
            else {
                print $outfh "\n";
            }
        }
        print $outfh "\n";
    }
}

#-------------------------------------------------------------------------------
# Done the second pass, prepare for the next
#-------------------------------------------------------------------------------
newOutput( \$outfh, $file_template, $phase++ );

#
# Look through the collected data from the point of view of the episode, list
# all the (relevant) words in the title in order and report their frequencies
#
for my $key ( sort { $a <=> $b } keys(%eps) ) {
    printf $outfh "%4d: %s\n", $key, $eps{$key}->[0];
    for ( my $i = 1; $i < scalar( @{ $eps{$key} } ); $i++ ) {
        my $word = $eps{$key}->[$i];
        printf $outfh "      %15s %d\n", $word, $single_words{$word}->[0];
    }
    print $outfh "\n";
}

#-------------------------------------------------------------------------------
# Done the third pass, prepare for the next
#-------------------------------------------------------------------------------
newOutput( \$outfh, $file_template, $phase++ );

#
# So the pairs of words we collected earlier might show something interesting.
# Let's see.
#
foreach my $key (
    sort { $double_words{$a}->[0] <=> $double_words{$b}->[0] }
    sort( keys(%double_words) )
    )
{
    if ( $double_words{$key}->[0] > 3 ) {
        printf $outfh "%15s: %s\n", $key, $double_words{$key}->[0];
        for ( my $i = 1; $i <= $double_words{$key}->[0]; $i++ ) {
            printf $outfh "%17s%4d: %s", ' ', @{ $double_words{$key}->[$i] };
            @taglist
                = split( /\s*,\s*/, $tags{ $double_words{$key}->[$i]->[0] } );
            if (@taglist) {
                print $outfh " [", join( ",", @taglist ), "]\n";
            }
            else {
                print $outfh "\n";
            }
        }
        print $outfh "\n";
    }
}


close($outfh);

exit;

#===  FUNCTION  ================================================================
#         NAME: newOutput
#      PURPOSE: Generate a new output file
#   PARAMETERS: $fh             a scalar ref pointing to a variable to hold a
#                               file handle
#               $template       a string suitable for sprintf for defining the
#                               name of the output file
#               $phase          an integer to be incorporated into the output
#                               file name
#      RETURNS: Nothing
#  DESCRIPTION: Closes any existing file and opens a new one with the same
#               file handle. The name of the file is derived from the template
#               and the phase number.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub newOutput {
    my ( $fh, $template, $phase ) = @_;

    if ($$fh) {
        close($$fh) if ( tell($$fh) > -1 );
    }

    my $outfile = sprintf( $template, $phase );
    open( $$fh, '>:encoding(UTF-8)', $outfile )
        or die "Unable to open $outfile\n";

    return;
}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
Moved project directories and files to an empty local repo 2024-06-04 16:35:44 +01:00			`#!/usr/bin/env perl`
			`#===============================================================================`
			`#`
			`# FILE: find_series`
			`#`
			`# USAGE: ./find_series`
			`#`
			`# DESCRIPTION: Gathers information from the HPR database to assist with the`
			`# process of placing episodes into series groups`
			`#`
			`# OPTIONS: ---`
			`# REQUIREMENTS: ---`
			`# BUGS: ---`
			`# NOTES: ---`
			`# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com`
			`# VERSION: 0.0.5`
			`# CREATED: 2014-04-25 17:11:21`
			`# REVISION: 2016-06-15 15:51:55`
			`#`
			`#===============================================================================`

			`use v5.16;`
			`use strict;`
			`use warnings;`
			`use utf8;`

			`use Config::General;`
			`use List::MoreUtils qw(uniq);`
			`use YAML::XS qw{LoadFile};`

			`use DBI;`

			`use Data::Dumper;`

			`#`
			`# Version number (manually incremented)`
			`#`
			`our $VERSION = '0.0.5';`

			`#`
			`# Various constants`
			`#`
			`( my $PROG = $0 ) =~ s\|.*/\|\|mx;`
			`( my $DIR = $0 ) =~ s\|/?[^/]*$\|\|mx;`
			`$DIR = '.' unless $DIR;`

			`#-------------------------------------------------------------------------------`
			`# Declarations`
			`#-------------------------------------------------------------------------------`
			`#`
			`# Constants and other declarations`
			`#`
			`my $basedir = "$ENV{HOME}/HPR/Database";`
			`my $configfile = "$basedir/.hpr_db.cfg";`
			`my $ignorefile = "$basedir/.$PROG.yml";`

			`my $file_template = "${PROG}_%d.out";`

			`my ( $dbh, $sth1, $h1 );`
			`my ( @ignore, $title, @words, $pair );`
			`my ( %eps, %tags, @taglist, %single_words, %double_words );`
			`my ( $phase, $outfile, $outfh );`

			`#`
			`# Enable Unicode mode`
			`#`
			`binmode STDOUT, ":encoding(UTF-8)";`
			`binmode STDERR, ":encoding(UTF-8)";`

			`#`
			`# Load YAML ignore list`
			`#`
			`my $yaml = LoadFile($ignorefile);`
			`@ignore = uniq( @{ $yaml->{ignore} } );`

			`#`
			`# Load database configuration data`
			`#`
			`my $conf = Config::General->new(`
			`-ConfigFile => $configfile,`
			`-InterPolateVars => 1,`
			`-ExtendedAccess => 1`
			`);`
			`my %config = $conf->getall();`

			`#-------------------------------------------------------------------------------`
			`# Connect to the database`
			`#-------------------------------------------------------------------------------`
			`my $dbhost = $config{database}->{host} // '127.0.0.1';`
			`my $dbport = $config{database}->{port} // 3306;`
			`my $dbname = $config{database}->{name};`
			`my $dbuser = $config{database}->{user};`
			`my $dbpwd = $config{database}->{password};`
			`$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",`
			`$dbuser, $dbpwd, { AutoCommit => 1 } )`
			`or die $DBI::errstr;`

			`#`
			`# Enable client-side UTF8`
			`#`
			`$dbh->{mysql_enable_utf8} = 1;`

			`#`
			`# Prepare to collect episode titles with the series it's related to`
			`#`
			`$sth1 = $dbh->prepare(`
			`q{SELECT`
			`eps.id AS eps_id,`
			`eps.date,`
			`eps.title,`
			`eps.duration,`
			`eps.summary,`
			`eps.notes,`
			`eps.hostid,`
			`eps.series,`
			`eps.explicit,`
			`eps.license,`
			`eps.tags,`
			`eps.version,`
			`eps.downloads,`
			`eps.valid AS eps_valid,`
			`ms.id AS ms_id,`
			`ms.name,`
			`ms.description,`
			`ms.private,`
			`ms.image,`
			`ms.valid AS ms_valid`
			`FROM eps`
			`JOIN miniseries ms ON eps.series = ms.id`
			`WHERE eps.valid = 1}`
			`);`
			`$sth1->execute;`
			`if ( $dbh->err ) {`
			`warn $dbh->errstr;`
			`}`

			`#`
			`# Generate an output file for phase 1`
			`#`
			`$phase = 1;`
			`newOutput( \$outfh, $file_template, $phase++ );`
			`print $outfh "Show (Series) Title\n";`

			`#-------------------------------------------------------------------------------`
			`# Walk through the episode/series list from the database and build structures`
			`# from them.`
			`#-------------------------------------------------------------------------------`
			`# The structures built are:`
			`#`
			`# %eps - a hash indexed by the episode number with an arrayref as the`
			`# value. The arrayref starts with the episode title and is followed by the`
			`# important words from the title in lower case. The term "important" means`
			`# that the word must consist of the allowed characters and not be in the`
			`# 'ignore' list.`
			`#`
			`# %tags - a hash indexed by the episode number with a CSV string of tags`
			`# associated with the episode as the value.`
			`#`
			`# %single_words - a hash indexed by one of the words collected from the title`
			`# (see %eps above). The value of each element is complex. It consists of an`
			`# arrayref, the first element of which is a count of the succeeding elements.`
			`# The next N elements are arrayrefs each of which contains two elements: an`
			`# episode number and an episode title. Here is an example in Data::Dumper`
			`# format:`
			`#`
			`# 'pre-ibm' => [`
			`# 2,`
			`# [`
			`# 687,`
			`# 'pre-IBM PC computer history 1'`
			`# ],`
			`# [`
			`# 691,`
			`# 'pre-IBM PC computer history 2'`
			`# ]`
			`# ],`
			`#`
			`#-------------------------------------------------------------------------------`
			`while ( $h1 = $sth1->fetchrow_hashref ) {`
			`#`
			`# Report what we found`
			`#`
			`printf $outfh "%4d: (%-2d) %s\n", $h1->{eps_id}, $h1->{ms_id},`
			`$h1->{title};`

			`#`
			`# Skip this episode if it already has a series`
			`#`
			`next if $h1->{series} > 0;`

			`#`
			`# Save this episode for later`
			`#`
			`$eps{ $h1->{eps_id} } = [ $h1->{title} ];`

			`#`
			`# Save tag details`
			`#`
			`$tags{ $h1->{eps_id} } = $h1->{tags};`

			`#`
			`# Strip any trailing full stop, and chop the title into words`
			`#`
			`( $title = $h1->{title} ) =~ s/\.$//;`
			`@words = split( /\s+/, $title );`

			`#`
			`# Remove all unwanted characters and force to lowercase (use 'fc' for case`
			`# folding since it's aware of character sets)`
			`#`
			`foreach my $word (@words) {`
			`$word =~ s{[^a-zA-Z./_-]}{}g;`
			`$word = fc($word);`
			`}`

			`#`
			`# Clean up the word list after removing unwanted characters`
			`#`
			`@words = grep { $_ !~ /^-*$/ } @words;`

			`#`
			`# Extract pairs of words before they're made unique and make a hash`
			`# pointing to the episodes they originated from`
			`#`
			`for (my $i = 0; $i < $#words; $i++) {`
			`$pair = "$words[$i] $words[$i+1]";`

			`if ( exists( $double_words{$pair} ) ) {`
			`$double_words{$pair}->[0] += 1;`
			`push( @{ $double_words{$pair} },`
			`[ $h1->{eps_id}, $h1->{title} ] );`
			`}`
			`else {`
			`$double_words{$pair} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];`
			`}`
			`}`

			`#`
			`# Make the word list unique`
			`#`
			`@words = uniq(@words);`

			`#`
			`# Walk the tidied single word list`
			`#`
			`foreach my $word (@words) {`
			`#`
			`# Ignore very short words and words in the ignore list`
			`#`
			`next if length($word) < 2;`
			`next if grep( /^$word$/, @ignore );`

			`#`
			`# Save this word in the episodes hash`
			`#`
			`push( @{ $eps{ $h1->{eps_id} } }, $word );`

			`#`
			`# If the word is not known initialise the entry containing an arrayref`
			`# with a counter and another arrayref with the saved episode number`
			`# and title. If it's known, increment the counter and stash the`
			`# episode details as another arrayref.`
			`#`
			`if ( exists( $single_words{$word} ) ) {`
			`$single_words{$word}->[0] += 1;`
			`push( @{ $single_words{$word} },`
			`[ $h1->{eps_id}, $h1->{title} ] );`
			`}`
			`else {`
			`$single_words{$word} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];`
			`}`
			`}`
			`}`

			`#`
			`# We've finished with the database`
			`#`
			`$dbh->disconnect;`

			`#-------------------------------------------------------------------------------`
			`# Done the first pass, prepare for the next`
			`#-------------------------------------------------------------------------------`
			`newOutput( \$outfh, $file_template, $phase++ );`

			`#`
			`# Process the saved data in increasing order of the frequency. Print the word`
			`# and its frequency and follow that by the stashed episode details in the`
			`# order we saw them`
			`#`
			`foreach my $key (`
			`sort { $single_words{$a}->[0] <=> $single_words{$b}->[0] }`
			`sort( keys(%single_words) )`
			`)`
			`{`
			`if ( $single_words{$key}->[0] > 3 ) {`
			`printf $outfh "%15s: %s\n", $key, $single_words{$key}->[0];`
			`for ( my $i = 1; $i <= $single_words{$key}->[0]; $i++ ) {`
			`printf $outfh "%17s%4d: %s", ' ', @{ $single_words{$key}->[$i] };`
			`@taglist`
			`= split( /\s,\s/, $tags{ $single_words{$key}->[$i]->[0] } );`
			`if (@taglist) {`
			`print $outfh " [", join( ",", @taglist ), "]\n";`
			`}`
			`else {`
			`print $outfh "\n";`
			`}`
			`}`
			`print $outfh "\n";`
			`}`
			`}`

			`#-------------------------------------------------------------------------------`
			`# Done the second pass, prepare for the next`
			`#-------------------------------------------------------------------------------`
			`newOutput( \$outfh, $file_template, $phase++ );`

			`#`
			`# Look through the collected data from the point of view of the episode, list`
			`# all the (relevant) words in the title in order and report their frequencies`
			`#`
			`for my $key ( sort { $a <=> $b } keys(%eps) ) {`
			`printf $outfh "%4d: %s\n", $key, $eps{$key}->[0];`
			`for ( my $i = 1; $i < scalar( @{ $eps{$key} } ); $i++ ) {`
			`my $word = $eps{$key}->[$i];`
			`printf $outfh " %15s %d\n", $word, $single_words{$word}->[0];`
			`}`
			`print $outfh "\n";`
			`}`

			`#-------------------------------------------------------------------------------`
			`# Done the third pass, prepare for the next`
			`#-------------------------------------------------------------------------------`
			`newOutput( \$outfh, $file_template, $phase++ );`

			`#`
			`# So the pairs of words we collected earlier might show something interesting.`
			`# Let's see.`
			`#`
			`foreach my $key (`
			`sort { $double_words{$a}->[0] <=> $double_words{$b}->[0] }`
			`sort( keys(%double_words) )`
			`)`
			`{`
			`if ( $double_words{$key}->[0] > 3 ) {`
			`printf $outfh "%15s: %s\n", $key, $double_words{$key}->[0];`
			`for ( my $i = 1; $i <= $double_words{$key}->[0]; $i++ ) {`
			`printf $outfh "%17s%4d: %s", ' ', @{ $double_words{$key}->[$i] };`
			`@taglist`
			`= split( /\s,\s/, $tags{ $double_words{$key}->[$i]->[0] } );`
			`if (@taglist) {`
			`print $outfh " [", join( ",", @taglist ), "]\n";`
			`}`
			`else {`
			`print $outfh "\n";`
			`}`
			`}`
			`print $outfh "\n";`
			`}`
			`}`


			`close($outfh);`

			`exit;`

			`#=== FUNCTION ================================================================`
			`# NAME: newOutput`
			`# PURPOSE: Generate a new output file`
			`# PARAMETERS: $fh a scalar ref pointing to a variable to hold a`
			`# file handle`
			`# $template a string suitable for sprintf for defining the`
			`# name of the output file`
			`# $phase an integer to be incorporated into the output`
			`# file name`
			`# RETURNS: Nothing`
			`# DESCRIPTION: Closes any existing file and opens a new one with the same`
			`# file handle. The name of the file is derived from the template`
			`# and the phase number.`
			`# THROWS: No exceptions`
			`# COMMENTS: None`
			`# SEE ALSO: N/A`
			`#===============================================================================`
			`sub newOutput {`
			`my ( $fh, $template, $phase ) = @_;`

			`if ($$fh) {`
			`close($$fh) if ( tell($$fh) > -1 );`
			`}`

			`my $outfile = sprintf( $template, $phase );`
			`open( $$fh, '>:encoding(UTF-8)', $outfile )`
			`or die "Unable to open $outfile\n";`

			`return;`
			`}`

			`# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker`