Moved project directories and files to an empty local repo

2024-06-04 16:35:44 +01:00
parent 2d2b937a9b
commit 38abbcdd39
271 changed files with 55348 additions and 0 deletions
--- a/Database/find_series
+++ b/Database/find_series
@@ -0,0 +1,397 @@
+#!/usr/bin/env perl
+#===============================================================================
+#
+#         FILE: find_series
+#
+#        USAGE: ./find_series
+#
+#  DESCRIPTION: Gathers information from the HPR database to assist with the
+#               process of placing episodes into series groups
+#
+#      OPTIONS: ---
+# REQUIREMENTS: ---
+#         BUGS: ---
+#        NOTES: ---
+#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
+#      VERSION: 0.0.5
+#      CREATED: 2014-04-25 17:11:21
+#     REVISION: 2016-06-15 15:51:55
+#
+#===============================================================================
+
+use v5.16;
+use strict;
+use warnings;
+use utf8;
+
+use Config::General;
+use List::MoreUtils qw(uniq);
+use YAML::XS qw{LoadFile};
+
+use DBI;
+
+use Data::Dumper;
+
+#
+# Version number (manually incremented)
+#
+our $VERSION = '0.0.5';
+
+#
+# Various constants
+#
+( my $PROG = $0 ) =~ s|.*/||mx;
+( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
+$DIR = '.' unless $DIR;
+
+#-------------------------------------------------------------------------------
+# Declarations
+#-------------------------------------------------------------------------------
+#
+# Constants and other declarations
+#
+my $basedir    = "$ENV{HOME}/HPR/Database";
+my $configfile = "$basedir/.hpr_db.cfg";
+my $ignorefile = "$basedir/.$PROG.yml";
+
+my $file_template = "${PROG}_%d.out";
+
+my ( $dbh,    $sth1,    $h1 );
+my ( @ignore, $title,   @words, $pair );
+my ( %eps,    %tags,    @taglist, %single_words, %double_words );
+my ( $phase,  $outfile, $outfh );
+
+#
+# Enable Unicode mode
+#
+binmode STDOUT, ":encoding(UTF-8)";
+binmode STDERR, ":encoding(UTF-8)";
+
+#
+# Load YAML ignore list
+#
+my $yaml = LoadFile($ignorefile);
+@ignore = uniq( @{ $yaml->{ignore} } );
+
+#
+# Load database configuration data
+#
+my $conf = Config::General->new(
+    -ConfigFile      => $configfile,
+    -InterPolateVars => 1,
+    -ExtendedAccess  => 1
+);
+my %config = $conf->getall();
+
+#-------------------------------------------------------------------------------
+# Connect to the database
+#-------------------------------------------------------------------------------
+my $dbhost = $config{database}->{host} // '127.0.0.1';
+my $dbport = $config{database}->{port} // 3306;
+my $dbname = $config{database}->{name};
+my $dbuser = $config{database}->{user};
+my $dbpwd  = $config{database}->{password};
+$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
+    $dbuser, $dbpwd, { AutoCommit => 1 } )
+    or die $DBI::errstr;
+
+#
+# Enable client-side UTF8
+#
+$dbh->{mysql_enable_utf8} = 1;
+
+#
+# Prepare to collect episode titles with the series it's related to
+#
+$sth1 = $dbh->prepare(
+    q{SELECT
+            eps.id AS eps_id,
+            eps.date,
+            eps.title,
+            eps.duration,
+            eps.summary,
+            eps.notes,
+            eps.hostid,
+            eps.series,
+            eps.explicit,
+            eps.license,
+            eps.tags,
+            eps.version,
+            eps.downloads,
+            eps.valid AS eps_valid,
+            ms.id AS ms_id,
+            ms.name,
+            ms.description,
+            ms.private,
+            ms.image,
+            ms.valid AS ms_valid
+        FROM eps
+        JOIN miniseries ms ON eps.series = ms.id
+        WHERE eps.valid = 1}
+);
+$sth1->execute;
+if ( $dbh->err ) {
+    warn $dbh->errstr;
+}
+
+#
+# Generate an output file for phase 1
+#
+$phase = 1;
+newOutput( \$outfh, $file_template, $phase++ );
+print $outfh "Show  (Series) Title\n";
+
+#-------------------------------------------------------------------------------
+# Walk through the episode/series list from the database and build structures
+# from them.
+#-------------------------------------------------------------------------------
+# The structures built are:
+#
+# %eps - a hash indexed by the episode number with an arrayref as the
+# value. The arrayref starts with the episode title and is followed by the
+# important words from the title in lower case. The term "important" means
+# that the word must consist of the allowed characters and not be in the
+# 'ignore' list.
+#
+# %tags - a hash indexed by the episode number with a CSV string of tags
+# associated with the episode as the value.
+#
+# %single_words - a hash indexed by one of the words collected from the title
+# (see %eps above). The value of each element is complex. It consists of an
+# arrayref, the first element of which is a count of the succeeding elements.
+# The next N elements are arrayrefs each of which contains two elements: an
+# episode number and an episode title. Here is an example in Data::Dumper
+# format:
+#
+# 'pre-ibm' => [
+#                2,
+#                [
+#                  687,
+#                  'pre-IBM PC computer history 1'
+#                ],
+#                [
+#                  691,
+#                  'pre-IBM PC computer history 2'
+#                ]
+#              ],
+#
+#-------------------------------------------------------------------------------
+while ( $h1 = $sth1->fetchrow_hashref ) {
+    #
+    # Report what we found
+    #
+    printf $outfh "%4d: (%-2d)     %s\n", $h1->{eps_id}, $h1->{ms_id},
+        $h1->{title};
+
+    #
+    # Skip this episode if it already has a series
+    #
+    next if $h1->{series} > 0;
+
+    #
+    # Save this episode for later
+    #
+    $eps{ $h1->{eps_id} } = [ $h1->{title} ];
+
+    #
+    # Save tag details
+    #
+    $tags{ $h1->{eps_id} } = $h1->{tags};
+
+    #
+    # Strip any trailing full stop, and chop the title into words
+    #
+    ( $title = $h1->{title} ) =~ s/\.$//;
+    @words = split( /\s+/, $title );
+
+    #
+    # Remove all unwanted characters and force to lowercase (use 'fc' for case
+    # folding since it's aware of character sets)
+    #
+    foreach my $word (@words) {
+        $word =~ s{[^a-zA-Z./_-]}{}g;
+        $word = fc($word);
+    }
+
+    #
+    # Clean up the word list after removing unwanted characters
+    #
+    @words = grep { $_ !~ /^-*$/ } @words;
+
+    #
+    # Extract pairs of words before they're made unique and make a hash
+    # pointing to the episodes they originated from
+    #
+    for (my $i = 0; $i < $#words; $i++) {
+        $pair = "$words[$i] $words[$i+1]";
+
+        if ( exists( $double_words{$pair} ) ) {
+            $double_words{$pair}->[0] += 1;
+            push( @{ $double_words{$pair} },
+                [ $h1->{eps_id}, $h1->{title} ] );
+        }
+        else {
+            $double_words{$pair} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
+        }
+    }
+
+    #
+    # Make the word list unique
+    #
+    @words = uniq(@words);
+
+    #
+    # Walk the tidied single word list
+    #
+    foreach my $word (@words) {
+        #
+        # Ignore very short words and words in the ignore list
+        #
+        next if length($word) < 2;
+        next if grep( /^$word$/, @ignore );
+
+        #
+        # Save this word in the episodes hash
+        #
+        push( @{ $eps{ $h1->{eps_id} } }, $word );
+
+        #
+        # If the word is not known initialise the entry containing an arrayref
+        # with a counter and another arrayref with the saved episode number
+        # and title. If it's known, increment the counter and stash the
+        # episode details as another arrayref.
+        #
+        if ( exists( $single_words{$word} ) ) {
+            $single_words{$word}->[0] += 1;
+            push( @{ $single_words{$word} },
+                [ $h1->{eps_id}, $h1->{title} ] );
+        }
+        else {
+            $single_words{$word} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
+        }
+    }
+}
+
+#
+# We've finished with the database
+#
+$dbh->disconnect;
+
+#-------------------------------------------------------------------------------
+# Done the first pass, prepare for the next
+#-------------------------------------------------------------------------------
+newOutput( \$outfh, $file_template, $phase++ );
+
+#
+# Process the saved data in increasing order of the frequency. Print the word
+# and its frequency and follow that by the stashed episode details in the
+# order we saw them
+#
+foreach my $key (
+    sort { $single_words{$a}->[0] <=> $single_words{$b}->[0] }
+    sort( keys(%single_words) )
+    )
+{
+    if ( $single_words{$key}->[0] > 3 ) {
+        printf $outfh "%15s: %s\n", $key, $single_words{$key}->[0];
+        for ( my $i = 1; $i <= $single_words{$key}->[0]; $i++ ) {
+            printf $outfh "%17s%4d: %s", ' ', @{ $single_words{$key}->[$i] };
+            @taglist
+                = split( /\s*,\s*/, $tags{ $single_words{$key}->[$i]->[0] } );
+            if (@taglist) {
+                print $outfh " [", join( ",", @taglist ), "]\n";
+            }
+            else {
+                print $outfh "\n";
+            }
+        }
+        print $outfh "\n";
+    }
+}
+
+#-------------------------------------------------------------------------------
+# Done the second pass, prepare for the next
+#-------------------------------------------------------------------------------
+newOutput( \$outfh, $file_template, $phase++ );
+
+#
+# Look through the collected data from the point of view of the episode, list
+# all the (relevant) words in the title in order and report their frequencies
+#
+for my $key ( sort { $a <=> $b } keys(%eps) ) {
+    printf $outfh "%4d: %s\n", $key, $eps{$key}->[0];
+    for ( my $i = 1; $i < scalar( @{ $eps{$key} } ); $i++ ) {
+        my $word = $eps{$key}->[$i];
+        printf $outfh "      %15s %d\n", $word, $single_words{$word}->[0];
+    }
+    print $outfh "\n";
+}
+
+#-------------------------------------------------------------------------------
+# Done the third pass, prepare for the next
+#-------------------------------------------------------------------------------
+newOutput( \$outfh, $file_template, $phase++ );
+
+#
+# So the pairs of words we collected earlier might show something interesting.
+# Let's see.
+#
+foreach my $key (
+    sort { $double_words{$a}->[0] <=> $double_words{$b}->[0] }
+    sort( keys(%double_words) )
+    )
+{
+    if ( $double_words{$key}->[0] > 3 ) {
+        printf $outfh "%15s: %s\n", $key, $double_words{$key}->[0];
+        for ( my $i = 1; $i <= $double_words{$key}->[0]; $i++ ) {
+            printf $outfh "%17s%4d: %s", ' ', @{ $double_words{$key}->[$i] };
+            @taglist
+                = split( /\s*,\s*/, $tags{ $double_words{$key}->[$i]->[0] } );
+            if (@taglist) {
+                print $outfh " [", join( ",", @taglist ), "]\n";
+            }
+            else {
+                print $outfh "\n";
+            }
+        }
+        print $outfh "\n";
+    }
+}
+
+
+close($outfh);
+
+exit;
+
+#===  FUNCTION  ================================================================
+#         NAME: newOutput
+#      PURPOSE: Generate a new output file
+#   PARAMETERS: $fh             a scalar ref pointing to a variable to hold a
+#                               file handle
+#               $template       a string suitable for sprintf for defining the
+#                               name of the output file
+#               $phase          an integer to be incorporated into the output
+#                               file name
+#      RETURNS: Nothing
+#  DESCRIPTION: Closes any existing file and opens a new one with the same
+#               file handle. The name of the file is derived from the template
+#               and the phase number.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub newOutput {
+    my ( $fh, $template, $phase ) = @_;
+
+    if ($$fh) {
+        close($$fh) if ( tell($$fh) > -1 );
+    }
+
+    my $outfile = sprintf( $template, $phase );
+    open( $$fh, '>:encoding(UTF-8)', $outfile )
+        or die "Unable to open $outfile\n";
+
+    return;
+}
+
+# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker