398 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			398 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | #!/usr/bin/env perl | ||
|  | #=============================================================================== | ||
|  | # | ||
|  | #         FILE: find_series | ||
|  | # | ||
|  | #        USAGE: ./find_series | ||
|  | # | ||
|  | #  DESCRIPTION: Gathers information from the HPR database to assist with the | ||
|  | #               process of placing episodes into series groups | ||
|  | # | ||
|  | #      OPTIONS: --- | ||
|  | # REQUIREMENTS: --- | ||
|  | #         BUGS: --- | ||
|  | #        NOTES: --- | ||
|  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||
|  | #      VERSION: 0.0.5 | ||
|  | #      CREATED: 2014-04-25 17:11:21 | ||
|  | #     REVISION: 2016-06-15 15:51:55 | ||
|  | # | ||
|  | #=============================================================================== | ||
|  | 
 | ||
|  | use v5.16; | ||
|  | use strict; | ||
|  | use warnings; | ||
|  | use utf8; | ||
|  | 
 | ||
|  | use Config::General; | ||
|  | use List::MoreUtils qw(uniq); | ||
|  | use YAML::XS qw{LoadFile}; | ||
|  | 
 | ||
|  | use DBI; | ||
|  | 
 | ||
|  | use Data::Dumper; | ||
|  | 
 | ||
|  | # | ||
|  | # Version number (manually incremented) | ||
|  | # | ||
|  | our $VERSION = '0.0.5'; | ||
|  | 
 | ||
|  | # | ||
|  | # Various constants | ||
|  | # | ||
|  | ( my $PROG = $0 ) =~ s|.*/||mx; | ||
|  | ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx; | ||
|  | $DIR = '.' unless $DIR; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Declarations | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Constants and other declarations | ||
|  | # | ||
|  | my $basedir    = "$ENV{HOME}/HPR/Database"; | ||
|  | my $configfile = "$basedir/.hpr_db.cfg"; | ||
|  | my $ignorefile = "$basedir/.$PROG.yml"; | ||
|  | 
 | ||
|  | my $file_template = "${PROG}_%d.out"; | ||
|  | 
 | ||
|  | my ( $dbh,    $sth1,    $h1 ); | ||
|  | my ( @ignore, $title,   @words, $pair ); | ||
|  | my ( %eps,    %tags,    @taglist, %single_words, %double_words ); | ||
|  | my ( $phase,  $outfile, $outfh ); | ||
|  | 
 | ||
|  | # | ||
|  | # Enable Unicode mode | ||
|  | # | ||
|  | binmode STDOUT, ":encoding(UTF-8)"; | ||
|  | binmode STDERR, ":encoding(UTF-8)"; | ||
|  | 
 | ||
|  | # | ||
|  | # Load YAML ignore list | ||
|  | # | ||
|  | my $yaml = LoadFile($ignorefile); | ||
|  | @ignore = uniq( @{ $yaml->{ignore} } ); | ||
|  | 
 | ||
|  | # | ||
|  | # Load database configuration data | ||
|  | # | ||
|  | my $conf = Config::General->new( | ||
|  |     -ConfigFile      => $configfile, | ||
|  |     -InterPolateVars => 1, | ||
|  |     -ExtendedAccess  => 1 | ||
|  | ); | ||
|  | my %config = $conf->getall(); | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Connect to the database | ||
|  | #------------------------------------------------------------------------------- | ||
|  | my $dbhost = $config{database}->{host} // '127.0.0.1'; | ||
|  | my $dbport = $config{database}->{port} // 3306; | ||
|  | my $dbname = $config{database}->{name}; | ||
|  | my $dbuser = $config{database}->{user}; | ||
|  | my $dbpwd  = $config{database}->{password}; | ||
|  | $dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname", | ||
|  |     $dbuser, $dbpwd, { AutoCommit => 1 } ) | ||
|  |     or die $DBI::errstr; | ||
|  | 
 | ||
|  | # | ||
|  | # Enable client-side UTF8 | ||
|  | # | ||
|  | $dbh->{mysql_enable_utf8} = 1; | ||
|  | 
 | ||
|  | # | ||
|  | # Prepare to collect episode titles with the series it's related to | ||
|  | # | ||
|  | $sth1 = $dbh->prepare( | ||
|  |     q{SELECT | ||
|  |             eps.id AS eps_id, | ||
|  |             eps.date, | ||
|  |             eps.title, | ||
|  |             eps.duration, | ||
|  |             eps.summary, | ||
|  |             eps.notes, | ||
|  |             eps.hostid, | ||
|  |             eps.series, | ||
|  |             eps.explicit, | ||
|  |             eps.license, | ||
|  |             eps.tags, | ||
|  |             eps.version, | ||
|  |             eps.downloads, | ||
|  |             eps.valid AS eps_valid, | ||
|  |             ms.id AS ms_id, | ||
|  |             ms.name, | ||
|  |             ms.description, | ||
|  |             ms.private, | ||
|  |             ms.image, | ||
|  |             ms.valid AS ms_valid | ||
|  |         FROM eps | ||
|  |         JOIN miniseries ms ON eps.series = ms.id | ||
|  |         WHERE eps.valid = 1} | ||
|  | ); | ||
|  | $sth1->execute; | ||
|  | if ( $dbh->err ) { | ||
|  |     warn $dbh->errstr; | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # Generate an output file for phase 1 | ||
|  | # | ||
|  | $phase = 1; | ||
|  | newOutput( \$outfh, $file_template, $phase++ ); | ||
|  | print $outfh "Show  (Series) Title\n"; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Walk through the episode/series list from the database and build structures | ||
|  | # from them. | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # The structures built are: | ||
|  | # | ||
|  | # %eps - a hash indexed by the episode number with an arrayref as the | ||
|  | # value. The arrayref starts with the episode title and is followed by the | ||
|  | # important words from the title in lower case. The term "important" means | ||
|  | # that the word must consist of the allowed characters and not be in the | ||
|  | # 'ignore' list. | ||
|  | # | ||
|  | # %tags - a hash indexed by the episode number with a CSV string of tags | ||
|  | # associated with the episode as the value. | ||
|  | # | ||
|  | # %single_words - a hash indexed by one of the words collected from the title | ||
|  | # (see %eps above). The value of each element is complex. It consists of an | ||
|  | # arrayref, the first element of which is a count of the succeeding elements. | ||
|  | # The next N elements are arrayrefs each of which contains two elements: an | ||
|  | # episode number and an episode title. Here is an example in Data::Dumper | ||
|  | # format: | ||
|  | # | ||
|  | # 'pre-ibm' => [ | ||
|  | #                2, | ||
|  | #                [ | ||
|  | #                  687, | ||
|  | #                  'pre-IBM PC computer history 1' | ||
|  | #                ], | ||
|  | #                [ | ||
|  | #                  691, | ||
|  | #                  'pre-IBM PC computer history 2' | ||
|  | #                ] | ||
|  | #              ], | ||
|  | # | ||
|  | #------------------------------------------------------------------------------- | ||
|  | while ( $h1 = $sth1->fetchrow_hashref ) { | ||
|  |     # | ||
|  |     # Report what we found | ||
|  |     # | ||
|  |     printf $outfh "%4d: (%-2d)     %s\n", $h1->{eps_id}, $h1->{ms_id}, | ||
|  |         $h1->{title}; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Skip this episode if it already has a series | ||
|  |     # | ||
|  |     next if $h1->{series} > 0; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Save this episode for later | ||
|  |     # | ||
|  |     $eps{ $h1->{eps_id} } = [ $h1->{title} ]; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Save tag details | ||
|  |     # | ||
|  |     $tags{ $h1->{eps_id} } = $h1->{tags}; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Strip any trailing full stop, and chop the title into words | ||
|  |     # | ||
|  |     ( $title = $h1->{title} ) =~ s/\.$//; | ||
|  |     @words = split( /\s+/, $title ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Remove all unwanted characters and force to lowercase (use 'fc' for case | ||
|  |     # folding since it's aware of character sets) | ||
|  |     # | ||
|  |     foreach my $word (@words) { | ||
|  |         $word =~ s{[^a-zA-Z./_-]}{}g; | ||
|  |         $word = fc($word); | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Clean up the word list after removing unwanted characters | ||
|  |     # | ||
|  |     @words = grep { $_ !~ /^-*$/ } @words; | ||
|  | 
 | ||
|  |     # | ||
|  |     # Extract pairs of words before they're made unique and make a hash | ||
|  |     # pointing to the episodes they originated from | ||
|  |     # | ||
|  |     for (my $i = 0; $i < $#words; $i++) { | ||
|  |         $pair = "$words[$i] $words[$i+1]"; | ||
|  | 
 | ||
|  |         if ( exists( $double_words{$pair} ) ) { | ||
|  |             $double_words{$pair}->[0] += 1; | ||
|  |             push( @{ $double_words{$pair} }, | ||
|  |                 [ $h1->{eps_id}, $h1->{title} ] ); | ||
|  |         } | ||
|  |         else { | ||
|  |             $double_words{$pair} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ]; | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Make the word list unique | ||
|  |     # | ||
|  |     @words = uniq(@words); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Walk the tidied single word list | ||
|  |     # | ||
|  |     foreach my $word (@words) { | ||
|  |         # | ||
|  |         # Ignore very short words and words in the ignore list | ||
|  |         # | ||
|  |         next if length($word) < 2; | ||
|  |         next if grep( /^$word$/, @ignore ); | ||
|  | 
 | ||
|  |         # | ||
|  |         # Save this word in the episodes hash | ||
|  |         # | ||
|  |         push( @{ $eps{ $h1->{eps_id} } }, $word ); | ||
|  | 
 | ||
|  |         # | ||
|  |         # If the word is not known initialise the entry containing an arrayref | ||
|  |         # with a counter and another arrayref with the saved episode number | ||
|  |         # and title. If it's known, increment the counter and stash the | ||
|  |         # episode details as another arrayref. | ||
|  |         # | ||
|  |         if ( exists( $single_words{$word} ) ) { | ||
|  |             $single_words{$word}->[0] += 1; | ||
|  |             push( @{ $single_words{$word} }, | ||
|  |                 [ $h1->{eps_id}, $h1->{title} ] ); | ||
|  |         } | ||
|  |         else { | ||
|  |             $single_words{$word} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ]; | ||
|  |         } | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # We've finished with the database | ||
|  | # | ||
|  | $dbh->disconnect; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Done the first pass, prepare for the next | ||
|  | #------------------------------------------------------------------------------- | ||
|  | newOutput( \$outfh, $file_template, $phase++ ); | ||
|  | 
 | ||
|  | # | ||
|  | # Process the saved data in increasing order of the frequency. Print the word | ||
|  | # and its frequency and follow that by the stashed episode details in the | ||
|  | # order we saw them | ||
|  | # | ||
|  | foreach my $key ( | ||
|  |     sort { $single_words{$a}->[0] <=> $single_words{$b}->[0] } | ||
|  |     sort( keys(%single_words) ) | ||
|  |     ) | ||
|  | { | ||
|  |     if ( $single_words{$key}->[0] > 3 ) { | ||
|  |         printf $outfh "%15s: %s\n", $key, $single_words{$key}->[0]; | ||
|  |         for ( my $i = 1; $i <= $single_words{$key}->[0]; $i++ ) { | ||
|  |             printf $outfh "%17s%4d: %s", ' ', @{ $single_words{$key}->[$i] }; | ||
|  |             @taglist | ||
|  |                 = split( /\s*,\s*/, $tags{ $single_words{$key}->[$i]->[0] } ); | ||
|  |             if (@taglist) { | ||
|  |                 print $outfh " [", join( ",", @taglist ), "]\n"; | ||
|  |             } | ||
|  |             else { | ||
|  |                 print $outfh "\n"; | ||
|  |             } | ||
|  |         } | ||
|  |         print $outfh "\n"; | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Done the second pass, prepare for the next | ||
|  | #------------------------------------------------------------------------------- | ||
|  | newOutput( \$outfh, $file_template, $phase++ ); | ||
|  | 
 | ||
|  | # | ||
|  | # Look through the collected data from the point of view of the episode, list | ||
|  | # all the (relevant) words in the title in order and report their frequencies | ||
|  | # | ||
|  | for my $key ( sort { $a <=> $b } keys(%eps) ) { | ||
|  |     printf $outfh "%4d: %s\n", $key, $eps{$key}->[0]; | ||
|  |     for ( my $i = 1; $i < scalar( @{ $eps{$key} } ); $i++ ) { | ||
|  |         my $word = $eps{$key}->[$i]; | ||
|  |         printf $outfh "      %15s %d\n", $word, $single_words{$word}->[0]; | ||
|  |     } | ||
|  |     print $outfh "\n"; | ||
|  | } | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Done the third pass, prepare for the next | ||
|  | #------------------------------------------------------------------------------- | ||
|  | newOutput( \$outfh, $file_template, $phase++ ); | ||
|  | 
 | ||
|  | # | ||
|  | # So the pairs of words we collected earlier might show something interesting. | ||
|  | # Let's see. | ||
|  | # | ||
|  | foreach my $key ( | ||
|  |     sort { $double_words{$a}->[0] <=> $double_words{$b}->[0] } | ||
|  |     sort( keys(%double_words) ) | ||
|  |     ) | ||
|  | { | ||
|  |     if ( $double_words{$key}->[0] > 3 ) { | ||
|  |         printf $outfh "%15s: %s\n", $key, $double_words{$key}->[0]; | ||
|  |         for ( my $i = 1; $i <= $double_words{$key}->[0]; $i++ ) { | ||
|  |             printf $outfh "%17s%4d: %s", ' ', @{ $double_words{$key}->[$i] }; | ||
|  |             @taglist | ||
|  |                 = split( /\s*,\s*/, $tags{ $double_words{$key}->[$i]->[0] } ); | ||
|  |             if (@taglist) { | ||
|  |                 print $outfh " [", join( ",", @taglist ), "]\n"; | ||
|  |             } | ||
|  |             else { | ||
|  |                 print $outfh "\n"; | ||
|  |             } | ||
|  |         } | ||
|  |         print $outfh "\n"; | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | close($outfh); | ||
|  | 
 | ||
|  | exit; | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: newOutput | ||
|  | #      PURPOSE: Generate a new output file | ||
|  | #   PARAMETERS: $fh             a scalar ref pointing to a variable to hold a | ||
|  | #                               file handle | ||
|  | #               $template       a string suitable for sprintf for defining the | ||
|  | #                               name of the output file | ||
|  | #               $phase          an integer to be incorporated into the output | ||
|  | #                               file name | ||
|  | #      RETURNS: Nothing | ||
|  | #  DESCRIPTION: Closes any existing file and opens a new one with the same | ||
|  | #               file handle. The name of the file is derived from the template | ||
|  | #               and the phase number. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub newOutput { | ||
|  |     my ( $fh, $template, $phase ) = @_; | ||
|  | 
 | ||
|  |     if ($$fh) { | ||
|  |         close($$fh) if ( tell($$fh) > -1 ); | ||
|  |     } | ||
|  | 
 | ||
|  |     my $outfile = sprintf( $template, $phase ); | ||
|  |     open( $$fh, '>:encoding(UTF-8)', $outfile ) | ||
|  |         or die "Unable to open $outfile\n"; | ||
|  | 
 | ||
|  |     return; | ||
|  | } | ||
|  | 
 | ||
|  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker |