#!/usr/bin/env perl #=============================================================================== # # FILE: find_series # # USAGE: ./find_series # # DESCRIPTION: Gathers information from the HPR database to assist with the # process of placing episodes into series groups # # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com # VERSION: 0.0.5 # CREATED: 2014-04-25 17:11:21 # REVISION: 2016-06-15 15:51:55 # #=============================================================================== use v5.16; use strict; use warnings; use utf8; use Config::General; use List::MoreUtils qw(uniq); use YAML::XS qw{LoadFile}; use DBI; use Data::Dumper; # # Version number (manually incremented) # our $VERSION = '0.0.5'; # # Various constants # ( my $PROG = $0 ) =~ s|.*/||mx; ( my $DIR = $0 ) =~ s|/?[^/]*$||mx; $DIR = '.' unless $DIR; #------------------------------------------------------------------------------- # Declarations #------------------------------------------------------------------------------- # # Constants and other declarations # my $basedir = "$ENV{HOME}/HPR/Database"; my $configfile = "$basedir/.hpr_db.cfg"; my $ignorefile = "$basedir/.$PROG.yml"; my $file_template = "${PROG}_%d.out"; my ( $dbh, $sth1, $h1 ); my ( @ignore, $title, @words, $pair ); my ( %eps, %tags, @taglist, %single_words, %double_words ); my ( $phase, $outfile, $outfh ); # # Enable Unicode mode # binmode STDOUT, ":encoding(UTF-8)"; binmode STDERR, ":encoding(UTF-8)"; # # Load YAML ignore list # my $yaml = LoadFile($ignorefile); @ignore = uniq( @{ $yaml->{ignore} } ); # # Load database configuration data # my $conf = Config::General->new( -ConfigFile => $configfile, -InterPolateVars => 1, -ExtendedAccess => 1 ); my %config = $conf->getall(); #------------------------------------------------------------------------------- # Connect to the database #------------------------------------------------------------------------------- my $dbhost = $config{database}->{host} // '127.0.0.1'; my $dbport = $config{database}->{port} // 3306; my $dbname = $config{database}->{name}; my $dbuser = $config{database}->{user}; my $dbpwd = $config{database}->{password}; $dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname", $dbuser, $dbpwd, { AutoCommit => 1 } ) or die $DBI::errstr; # # Enable client-side UTF8 # $dbh->{mysql_enable_utf8} = 1; # # Prepare to collect episode titles with the series it's related to # $sth1 = $dbh->prepare( q{SELECT eps.id AS eps_id, eps.date, eps.title, eps.duration, eps.summary, eps.notes, eps.hostid, eps.series, eps.explicit, eps.license, eps.tags, eps.version, eps.downloads, eps.valid AS eps_valid, ms.id AS ms_id, ms.name, ms.description, ms.private, ms.image, ms.valid AS ms_valid FROM eps JOIN miniseries ms ON eps.series = ms.id WHERE eps.valid = 1} ); $sth1->execute; if ( $dbh->err ) { warn $dbh->errstr; } # # Generate an output file for phase 1 # $phase = 1; newOutput( \$outfh, $file_template, $phase++ ); print $outfh "Show (Series) Title\n"; #------------------------------------------------------------------------------- # Walk through the episode/series list from the database and build structures # from them. #------------------------------------------------------------------------------- # The structures built are: # # %eps - a hash indexed by the episode number with an arrayref as the # value. The arrayref starts with the episode title and is followed by the # important words from the title in lower case. The term "important" means # that the word must consist of the allowed characters and not be in the # 'ignore' list. # # %tags - a hash indexed by the episode number with a CSV string of tags # associated with the episode as the value. # # %single_words - a hash indexed by one of the words collected from the title # (see %eps above). The value of each element is complex. It consists of an # arrayref, the first element of which is a count of the succeeding elements. # The next N elements are arrayrefs each of which contains two elements: an # episode number and an episode title. Here is an example in Data::Dumper # format: # # 'pre-ibm' => [ # 2, # [ # 687, # 'pre-IBM PC computer history 1' # ], # [ # 691, # 'pre-IBM PC computer history 2' # ] # ], # #------------------------------------------------------------------------------- while ( $h1 = $sth1->fetchrow_hashref ) { # # Report what we found # printf $outfh "%4d: (%-2d) %s\n", $h1->{eps_id}, $h1->{ms_id}, $h1->{title}; # # Skip this episode if it already has a series # next if $h1->{series} > 0; # # Save this episode for later # $eps{ $h1->{eps_id} } = [ $h1->{title} ]; # # Save tag details # $tags{ $h1->{eps_id} } = $h1->{tags}; # # Strip any trailing full stop, and chop the title into words # ( $title = $h1->{title} ) =~ s/\.$//; @words = split( /\s+/, $title ); # # Remove all unwanted characters and force to lowercase (use 'fc' for case # folding since it's aware of character sets) # foreach my $word (@words) { $word =~ s{[^a-zA-Z./_-]}{}g; $word = fc($word); } # # Clean up the word list after removing unwanted characters # @words = grep { $_ !~ /^-*$/ } @words; # # Extract pairs of words before they're made unique and make a hash # pointing to the episodes they originated from # for (my $i = 0; $i < $#words; $i++) { $pair = "$words[$i] $words[$i+1]"; if ( exists( $double_words{$pair} ) ) { $double_words{$pair}->[0] += 1; push( @{ $double_words{$pair} }, [ $h1->{eps_id}, $h1->{title} ] ); } else { $double_words{$pair} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ]; } } # # Make the word list unique # @words = uniq(@words); # # Walk the tidied single word list # foreach my $word (@words) { # # Ignore very short words and words in the ignore list # next if length($word) < 2; next if grep( /^$word$/, @ignore ); # # Save this word in the episodes hash # push( @{ $eps{ $h1->{eps_id} } }, $word ); # # If the word is not known initialise the entry containing an arrayref # with a counter and another arrayref with the saved episode number # and title. If it's known, increment the counter and stash the # episode details as another arrayref. # if ( exists( $single_words{$word} ) ) { $single_words{$word}->[0] += 1; push( @{ $single_words{$word} }, [ $h1->{eps_id}, $h1->{title} ] ); } else { $single_words{$word} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ]; } } } # # We've finished with the database # $dbh->disconnect; #------------------------------------------------------------------------------- # Done the first pass, prepare for the next #------------------------------------------------------------------------------- newOutput( \$outfh, $file_template, $phase++ ); # # Process the saved data in increasing order of the frequency. Print the word # and its frequency and follow that by the stashed episode details in the # order we saw them # foreach my $key ( sort { $single_words{$a}->[0] <=> $single_words{$b}->[0] } sort( keys(%single_words) ) ) { if ( $single_words{$key}->[0] > 3 ) { printf $outfh "%15s: %s\n", $key, $single_words{$key}->[0]; for ( my $i = 1; $i <= $single_words{$key}->[0]; $i++ ) { printf $outfh "%17s%4d: %s", ' ', @{ $single_words{$key}->[$i] }; @taglist = split( /\s*,\s*/, $tags{ $single_words{$key}->[$i]->[0] } ); if (@taglist) { print $outfh " [", join( ",", @taglist ), "]\n"; } else { print $outfh "\n"; } } print $outfh "\n"; } } #------------------------------------------------------------------------------- # Done the second pass, prepare for the next #------------------------------------------------------------------------------- newOutput( \$outfh, $file_template, $phase++ ); # # Look through the collected data from the point of view of the episode, list # all the (relevant) words in the title in order and report their frequencies # for my $key ( sort { $a <=> $b } keys(%eps) ) { printf $outfh "%4d: %s\n", $key, $eps{$key}->[0]; for ( my $i = 1; $i < scalar( @{ $eps{$key} } ); $i++ ) { my $word = $eps{$key}->[$i]; printf $outfh " %15s %d\n", $word, $single_words{$word}->[0]; } print $outfh "\n"; } #------------------------------------------------------------------------------- # Done the third pass, prepare for the next #------------------------------------------------------------------------------- newOutput( \$outfh, $file_template, $phase++ ); # # So the pairs of words we collected earlier might show something interesting. # Let's see. # foreach my $key ( sort { $double_words{$a}->[0] <=> $double_words{$b}->[0] } sort( keys(%double_words) ) ) { if ( $double_words{$key}->[0] > 3 ) { printf $outfh "%15s: %s\n", $key, $double_words{$key}->[0]; for ( my $i = 1; $i <= $double_words{$key}->[0]; $i++ ) { printf $outfh "%17s%4d: %s", ' ', @{ $double_words{$key}->[$i] }; @taglist = split( /\s*,\s*/, $tags{ $double_words{$key}->[$i]->[0] } ); if (@taglist) { print $outfh " [", join( ",", @taglist ), "]\n"; } else { print $outfh "\n"; } } print $outfh "\n"; } } close($outfh); exit; #=== FUNCTION ================================================================ # NAME: newOutput # PURPOSE: Generate a new output file # PARAMETERS: $fh a scalar ref pointing to a variable to hold a # file handle # $template a string suitable for sprintf for defining the # name of the output file # $phase an integer to be incorporated into the output # file name # RETURNS: Nothing # DESCRIPTION: Closes any existing file and opens a new one with the same # file handle. The name of the file is derived from the template # and the phase number. # THROWS: No exceptions # COMMENTS: None # SEE ALSO: N/A #=============================================================================== sub newOutput { my ( $fh, $template, $phase ) = @_; if ($$fh) { close($$fh) if ( tell($$fh) > -1 ); } my $outfile = sprintf( $template, $phase ); open( $$fh, '>:encoding(UTF-8)', $outfile ) or die "Unable to open $outfile\n"; return; } # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker