398 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			398 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env perl
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: find_series
 | |
| #
 | |
| #        USAGE: ./find_series
 | |
| #
 | |
| #  DESCRIPTION: Gathers information from the HPR database to assist with the
 | |
| #               process of placing episodes into series groups
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.0.5
 | |
| #      CREATED: 2014-04-25 17:11:21
 | |
| #     REVISION: 2016-06-15 15:51:55
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| use v5.16;
 | |
| use strict;
 | |
| use warnings;
 | |
| use utf8;
 | |
| 
 | |
| use Config::General;
 | |
| use List::MoreUtils qw(uniq);
 | |
| use YAML::XS qw{LoadFile};
 | |
| 
 | |
| use DBI;
 | |
| 
 | |
| use Data::Dumper;
 | |
| 
 | |
| #
 | |
| # Version number (manually incremented)
 | |
| #
 | |
| our $VERSION = '0.0.5';
 | |
| 
 | |
| #
 | |
| # Various constants
 | |
| #
 | |
| ( my $PROG = $0 ) =~ s|.*/||mx;
 | |
| ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
 | |
| $DIR = '.' unless $DIR;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Declarations
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Constants and other declarations
 | |
| #
 | |
| my $basedir    = "$ENV{HOME}/HPR/Database";
 | |
| my $configfile = "$basedir/.hpr_db.cfg";
 | |
| my $ignorefile = "$basedir/.$PROG.yml";
 | |
| 
 | |
| my $file_template = "${PROG}_%d.out";
 | |
| 
 | |
| my ( $dbh,    $sth1,    $h1 );
 | |
| my ( @ignore, $title,   @words, $pair );
 | |
| my ( %eps,    %tags,    @taglist, %single_words, %double_words );
 | |
| my ( $phase,  $outfile, $outfh );
 | |
| 
 | |
| #
 | |
| # Enable Unicode mode
 | |
| #
 | |
| binmode STDOUT, ":encoding(UTF-8)";
 | |
| binmode STDERR, ":encoding(UTF-8)";
 | |
| 
 | |
| #
 | |
| # Load YAML ignore list
 | |
| #
 | |
| my $yaml = LoadFile($ignorefile);
 | |
| @ignore = uniq( @{ $yaml->{ignore} } );
 | |
| 
 | |
| #
 | |
| # Load database configuration data
 | |
| #
 | |
| my $conf = Config::General->new(
 | |
|     -ConfigFile      => $configfile,
 | |
|     -InterPolateVars => 1,
 | |
|     -ExtendedAccess  => 1
 | |
| );
 | |
| my %config = $conf->getall();
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Connect to the database
 | |
| #-------------------------------------------------------------------------------
 | |
| my $dbhost = $config{database}->{host} // '127.0.0.1';
 | |
| my $dbport = $config{database}->{port} // 3306;
 | |
| my $dbname = $config{database}->{name};
 | |
| my $dbuser = $config{database}->{user};
 | |
| my $dbpwd  = $config{database}->{password};
 | |
| $dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
 | |
|     $dbuser, $dbpwd, { AutoCommit => 1 } )
 | |
|     or die $DBI::errstr;
 | |
| 
 | |
| #
 | |
| # Enable client-side UTF8
 | |
| #
 | |
| $dbh->{mysql_enable_utf8} = 1;
 | |
| 
 | |
| #
 | |
| # Prepare to collect episode titles with the series it's related to
 | |
| #
 | |
| $sth1 = $dbh->prepare(
 | |
|     q{SELECT
 | |
|             eps.id AS eps_id,
 | |
|             eps.date,
 | |
|             eps.title,
 | |
|             eps.duration,
 | |
|             eps.summary,
 | |
|             eps.notes,
 | |
|             eps.hostid,
 | |
|             eps.series,
 | |
|             eps.explicit,
 | |
|             eps.license,
 | |
|             eps.tags,
 | |
|             eps.version,
 | |
|             eps.downloads,
 | |
|             eps.valid AS eps_valid,
 | |
|             ms.id AS ms_id,
 | |
|             ms.name,
 | |
|             ms.description,
 | |
|             ms.private,
 | |
|             ms.image,
 | |
|             ms.valid AS ms_valid
 | |
|         FROM eps
 | |
|         JOIN miniseries ms ON eps.series = ms.id
 | |
|         WHERE eps.valid = 1}
 | |
| );
 | |
| $sth1->execute;
 | |
| if ( $dbh->err ) {
 | |
|     warn $dbh->errstr;
 | |
| }
 | |
| 
 | |
| #
 | |
| # Generate an output file for phase 1
 | |
| #
 | |
| $phase = 1;
 | |
| newOutput( \$outfh, $file_template, $phase++ );
 | |
| print $outfh "Show  (Series) Title\n";
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Walk through the episode/series list from the database and build structures
 | |
| # from them.
 | |
| #-------------------------------------------------------------------------------
 | |
| # The structures built are:
 | |
| #
 | |
| # %eps - a hash indexed by the episode number with an arrayref as the
 | |
| # value. The arrayref starts with the episode title and is followed by the
 | |
| # important words from the title in lower case. The term "important" means
 | |
| # that the word must consist of the allowed characters and not be in the
 | |
| # 'ignore' list.
 | |
| #
 | |
| # %tags - a hash indexed by the episode number with a CSV string of tags
 | |
| # associated with the episode as the value.
 | |
| #
 | |
| # %single_words - a hash indexed by one of the words collected from the title
 | |
| # (see %eps above). The value of each element is complex. It consists of an
 | |
| # arrayref, the first element of which is a count of the succeeding elements.
 | |
| # The next N elements are arrayrefs each of which contains two elements: an
 | |
| # episode number and an episode title. Here is an example in Data::Dumper
 | |
| # format:
 | |
| #
 | |
| # 'pre-ibm' => [
 | |
| #                2,
 | |
| #                [
 | |
| #                  687,
 | |
| #                  'pre-IBM PC computer history 1'
 | |
| #                ],
 | |
| #                [
 | |
| #                  691,
 | |
| #                  'pre-IBM PC computer history 2'
 | |
| #                ]
 | |
| #              ],
 | |
| #
 | |
| #-------------------------------------------------------------------------------
 | |
| while ( $h1 = $sth1->fetchrow_hashref ) {
 | |
|     #
 | |
|     # Report what we found
 | |
|     #
 | |
|     printf $outfh "%4d: (%-2d)     %s\n", $h1->{eps_id}, $h1->{ms_id},
 | |
|         $h1->{title};
 | |
| 
 | |
|     #
 | |
|     # Skip this episode if it already has a series
 | |
|     #
 | |
|     next if $h1->{series} > 0;
 | |
| 
 | |
|     #
 | |
|     # Save this episode for later
 | |
|     #
 | |
|     $eps{ $h1->{eps_id} } = [ $h1->{title} ];
 | |
| 
 | |
|     #
 | |
|     # Save tag details
 | |
|     #
 | |
|     $tags{ $h1->{eps_id} } = $h1->{tags};
 | |
| 
 | |
|     #
 | |
|     # Strip any trailing full stop, and chop the title into words
 | |
|     #
 | |
|     ( $title = $h1->{title} ) =~ s/\.$//;
 | |
|     @words = split( /\s+/, $title );
 | |
| 
 | |
|     #
 | |
|     # Remove all unwanted characters and force to lowercase (use 'fc' for case
 | |
|     # folding since it's aware of character sets)
 | |
|     #
 | |
|     foreach my $word (@words) {
 | |
|         $word =~ s{[^a-zA-Z./_-]}{}g;
 | |
|         $word = fc($word);
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Clean up the word list after removing unwanted characters
 | |
|     #
 | |
|     @words = grep { $_ !~ /^-*$/ } @words;
 | |
| 
 | |
|     #
 | |
|     # Extract pairs of words before they're made unique and make a hash
 | |
|     # pointing to the episodes they originated from
 | |
|     #
 | |
|     for (my $i = 0; $i < $#words; $i++) {
 | |
|         $pair = "$words[$i] $words[$i+1]";
 | |
| 
 | |
|         if ( exists( $double_words{$pair} ) ) {
 | |
|             $double_words{$pair}->[0] += 1;
 | |
|             push( @{ $double_words{$pair} },
 | |
|                 [ $h1->{eps_id}, $h1->{title} ] );
 | |
|         }
 | |
|         else {
 | |
|             $double_words{$pair} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Make the word list unique
 | |
|     #
 | |
|     @words = uniq(@words);
 | |
| 
 | |
|     #
 | |
|     # Walk the tidied single word list
 | |
|     #
 | |
|     foreach my $word (@words) {
 | |
|         #
 | |
|         # Ignore very short words and words in the ignore list
 | |
|         #
 | |
|         next if length($word) < 2;
 | |
|         next if grep( /^$word$/, @ignore );
 | |
| 
 | |
|         #
 | |
|         # Save this word in the episodes hash
 | |
|         #
 | |
|         push( @{ $eps{ $h1->{eps_id} } }, $word );
 | |
| 
 | |
|         #
 | |
|         # If the word is not known initialise the entry containing an arrayref
 | |
|         # with a counter and another arrayref with the saved episode number
 | |
|         # and title. If it's known, increment the counter and stash the
 | |
|         # episode details as another arrayref.
 | |
|         #
 | |
|         if ( exists( $single_words{$word} ) ) {
 | |
|             $single_words{$word}->[0] += 1;
 | |
|             push( @{ $single_words{$word} },
 | |
|                 [ $h1->{eps_id}, $h1->{title} ] );
 | |
|         }
 | |
|         else {
 | |
|             $single_words{$word} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| #
 | |
| # We've finished with the database
 | |
| #
 | |
| $dbh->disconnect;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Done the first pass, prepare for the next
 | |
| #-------------------------------------------------------------------------------
 | |
| newOutput( \$outfh, $file_template, $phase++ );
 | |
| 
 | |
| #
 | |
| # Process the saved data in increasing order of the frequency. Print the word
 | |
| # and its frequency and follow that by the stashed episode details in the
 | |
| # order we saw them
 | |
| #
 | |
| foreach my $key (
 | |
|     sort { $single_words{$a}->[0] <=> $single_words{$b}->[0] }
 | |
|     sort( keys(%single_words) )
 | |
|     )
 | |
| {
 | |
|     if ( $single_words{$key}->[0] > 3 ) {
 | |
|         printf $outfh "%15s: %s\n", $key, $single_words{$key}->[0];
 | |
|         for ( my $i = 1; $i <= $single_words{$key}->[0]; $i++ ) {
 | |
|             printf $outfh "%17s%4d: %s", ' ', @{ $single_words{$key}->[$i] };
 | |
|             @taglist
 | |
|                 = split( /\s*,\s*/, $tags{ $single_words{$key}->[$i]->[0] } );
 | |
|             if (@taglist) {
 | |
|                 print $outfh " [", join( ",", @taglist ), "]\n";
 | |
|             }
 | |
|             else {
 | |
|                 print $outfh "\n";
 | |
|             }
 | |
|         }
 | |
|         print $outfh "\n";
 | |
|     }
 | |
| }
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Done the second pass, prepare for the next
 | |
| #-------------------------------------------------------------------------------
 | |
| newOutput( \$outfh, $file_template, $phase++ );
 | |
| 
 | |
| #
 | |
| # Look through the collected data from the point of view of the episode, list
 | |
| # all the (relevant) words in the title in order and report their frequencies
 | |
| #
 | |
| for my $key ( sort { $a <=> $b } keys(%eps) ) {
 | |
|     printf $outfh "%4d: %s\n", $key, $eps{$key}->[0];
 | |
|     for ( my $i = 1; $i < scalar( @{ $eps{$key} } ); $i++ ) {
 | |
|         my $word = $eps{$key}->[$i];
 | |
|         printf $outfh "      %15s %d\n", $word, $single_words{$word}->[0];
 | |
|     }
 | |
|     print $outfh "\n";
 | |
| }
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Done the third pass, prepare for the next
 | |
| #-------------------------------------------------------------------------------
 | |
| newOutput( \$outfh, $file_template, $phase++ );
 | |
| 
 | |
| #
 | |
| # So the pairs of words we collected earlier might show something interesting.
 | |
| # Let's see.
 | |
| #
 | |
| foreach my $key (
 | |
|     sort { $double_words{$a}->[0] <=> $double_words{$b}->[0] }
 | |
|     sort( keys(%double_words) )
 | |
|     )
 | |
| {
 | |
|     if ( $double_words{$key}->[0] > 3 ) {
 | |
|         printf $outfh "%15s: %s\n", $key, $double_words{$key}->[0];
 | |
|         for ( my $i = 1; $i <= $double_words{$key}->[0]; $i++ ) {
 | |
|             printf $outfh "%17s%4d: %s", ' ', @{ $double_words{$key}->[$i] };
 | |
|             @taglist
 | |
|                 = split( /\s*,\s*/, $tags{ $double_words{$key}->[$i]->[0] } );
 | |
|             if (@taglist) {
 | |
|                 print $outfh " [", join( ",", @taglist ), "]\n";
 | |
|             }
 | |
|             else {
 | |
|                 print $outfh "\n";
 | |
|             }
 | |
|         }
 | |
|         print $outfh "\n";
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| close($outfh);
 | |
| 
 | |
| exit;
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: newOutput
 | |
| #      PURPOSE: Generate a new output file
 | |
| #   PARAMETERS: $fh             a scalar ref pointing to a variable to hold a
 | |
| #                               file handle
 | |
| #               $template       a string suitable for sprintf for defining the
 | |
| #                               name of the output file
 | |
| #               $phase          an integer to be incorporated into the output
 | |
| #                               file name
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Closes any existing file and opens a new one with the same
 | |
| #               file handle. The name of the file is derived from the template
 | |
| #               and the phase number.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub newOutput {
 | |
|     my ( $fh, $template, $phase ) = @_;
 | |
| 
 | |
|     if ($$fh) {
 | |
|         close($$fh) if ( tell($$fh) > -1 );
 | |
|     }
 | |
| 
 | |
|     my $outfile = sprintf( $template, $phase );
 | |
|     open( $$fh, '>:encoding(UTF-8)', $outfile )
 | |
|         or die "Unable to open $outfile\n";
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 |