forked from HPR/hpr-tools
Moved project directories and files to an empty local repo
This commit is contained in:
397
Database/find_series
Executable file
397
Database/find_series
Executable file
@@ -0,0 +1,397 @@
|
||||
#!/usr/bin/env perl
|
||||
#===============================================================================
|
||||
#
|
||||
# FILE: find_series
|
||||
#
|
||||
# USAGE: ./find_series
|
||||
#
|
||||
# DESCRIPTION: Gathers information from the HPR database to assist with the
|
||||
# process of placing episodes into series groups
|
||||
#
|
||||
# OPTIONS: ---
|
||||
# REQUIREMENTS: ---
|
||||
# BUGS: ---
|
||||
# NOTES: ---
|
||||
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||||
# VERSION: 0.0.5
|
||||
# CREATED: 2014-04-25 17:11:21
|
||||
# REVISION: 2016-06-15 15:51:55
|
||||
#
|
||||
#===============================================================================
|
||||
|
||||
use v5.16;
|
||||
use strict;
|
||||
use warnings;
|
||||
use utf8;
|
||||
|
||||
use Config::General;
|
||||
use List::MoreUtils qw(uniq);
|
||||
use YAML::XS qw{LoadFile};
|
||||
|
||||
use DBI;
|
||||
|
||||
use Data::Dumper;
|
||||
|
||||
#
|
||||
# Version number (manually incremented)
|
||||
#
|
||||
our $VERSION = '0.0.5';
|
||||
|
||||
#
|
||||
# Various constants
|
||||
#
|
||||
( my $PROG = $0 ) =~ s|.*/||mx;
|
||||
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
||||
$DIR = '.' unless $DIR;
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Declarations
|
||||
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Constants and other declarations
|
||||
#
|
||||
my $basedir = "$ENV{HOME}/HPR/Database";
|
||||
my $configfile = "$basedir/.hpr_db.cfg";
|
||||
my $ignorefile = "$basedir/.$PROG.yml";
|
||||
|
||||
my $file_template = "${PROG}_%d.out";
|
||||
|
||||
my ( $dbh, $sth1, $h1 );
|
||||
my ( @ignore, $title, @words, $pair );
|
||||
my ( %eps, %tags, @taglist, %single_words, %double_words );
|
||||
my ( $phase, $outfile, $outfh );
|
||||
|
||||
#
|
||||
# Enable Unicode mode
|
||||
#
|
||||
binmode STDOUT, ":encoding(UTF-8)";
|
||||
binmode STDERR, ":encoding(UTF-8)";
|
||||
|
||||
#
|
||||
# Load YAML ignore list
|
||||
#
|
||||
my $yaml = LoadFile($ignorefile);
|
||||
@ignore = uniq( @{ $yaml->{ignore} } );
|
||||
|
||||
#
|
||||
# Load database configuration data
|
||||
#
|
||||
my $conf = Config::General->new(
|
||||
-ConfigFile => $configfile,
|
||||
-InterPolateVars => 1,
|
||||
-ExtendedAccess => 1
|
||||
);
|
||||
my %config = $conf->getall();
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Connect to the database
|
||||
#-------------------------------------------------------------------------------
|
||||
my $dbhost = $config{database}->{host} // '127.0.0.1';
|
||||
my $dbport = $config{database}->{port} // 3306;
|
||||
my $dbname = $config{database}->{name};
|
||||
my $dbuser = $config{database}->{user};
|
||||
my $dbpwd = $config{database}->{password};
|
||||
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
|
||||
$dbuser, $dbpwd, { AutoCommit => 1 } )
|
||||
or die $DBI::errstr;
|
||||
|
||||
#
|
||||
# Enable client-side UTF8
|
||||
#
|
||||
$dbh->{mysql_enable_utf8} = 1;
|
||||
|
||||
#
|
||||
# Prepare to collect episode titles with the series it's related to
|
||||
#
|
||||
$sth1 = $dbh->prepare(
|
||||
q{SELECT
|
||||
eps.id AS eps_id,
|
||||
eps.date,
|
||||
eps.title,
|
||||
eps.duration,
|
||||
eps.summary,
|
||||
eps.notes,
|
||||
eps.hostid,
|
||||
eps.series,
|
||||
eps.explicit,
|
||||
eps.license,
|
||||
eps.tags,
|
||||
eps.version,
|
||||
eps.downloads,
|
||||
eps.valid AS eps_valid,
|
||||
ms.id AS ms_id,
|
||||
ms.name,
|
||||
ms.description,
|
||||
ms.private,
|
||||
ms.image,
|
||||
ms.valid AS ms_valid
|
||||
FROM eps
|
||||
JOIN miniseries ms ON eps.series = ms.id
|
||||
WHERE eps.valid = 1}
|
||||
);
|
||||
$sth1->execute;
|
||||
if ( $dbh->err ) {
|
||||
warn $dbh->errstr;
|
||||
}
|
||||
|
||||
#
|
||||
# Generate an output file for phase 1
|
||||
#
|
||||
$phase = 1;
|
||||
newOutput( \$outfh, $file_template, $phase++ );
|
||||
print $outfh "Show (Series) Title\n";
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Walk through the episode/series list from the database and build structures
|
||||
# from them.
|
||||
#-------------------------------------------------------------------------------
|
||||
# The structures built are:
|
||||
#
|
||||
# %eps - a hash indexed by the episode number with an arrayref as the
|
||||
# value. The arrayref starts with the episode title and is followed by the
|
||||
# important words from the title in lower case. The term "important" means
|
||||
# that the word must consist of the allowed characters and not be in the
|
||||
# 'ignore' list.
|
||||
#
|
||||
# %tags - a hash indexed by the episode number with a CSV string of tags
|
||||
# associated with the episode as the value.
|
||||
#
|
||||
# %single_words - a hash indexed by one of the words collected from the title
|
||||
# (see %eps above). The value of each element is complex. It consists of an
|
||||
# arrayref, the first element of which is a count of the succeeding elements.
|
||||
# The next N elements are arrayrefs each of which contains two elements: an
|
||||
# episode number and an episode title. Here is an example in Data::Dumper
|
||||
# format:
|
||||
#
|
||||
# 'pre-ibm' => [
|
||||
# 2,
|
||||
# [
|
||||
# 687,
|
||||
# 'pre-IBM PC computer history 1'
|
||||
# ],
|
||||
# [
|
||||
# 691,
|
||||
# 'pre-IBM PC computer history 2'
|
||||
# ]
|
||||
# ],
|
||||
#
|
||||
#-------------------------------------------------------------------------------
|
||||
while ( $h1 = $sth1->fetchrow_hashref ) {
|
||||
#
|
||||
# Report what we found
|
||||
#
|
||||
printf $outfh "%4d: (%-2d) %s\n", $h1->{eps_id}, $h1->{ms_id},
|
||||
$h1->{title};
|
||||
|
||||
#
|
||||
# Skip this episode if it already has a series
|
||||
#
|
||||
next if $h1->{series} > 0;
|
||||
|
||||
#
|
||||
# Save this episode for later
|
||||
#
|
||||
$eps{ $h1->{eps_id} } = [ $h1->{title} ];
|
||||
|
||||
#
|
||||
# Save tag details
|
||||
#
|
||||
$tags{ $h1->{eps_id} } = $h1->{tags};
|
||||
|
||||
#
|
||||
# Strip any trailing full stop, and chop the title into words
|
||||
#
|
||||
( $title = $h1->{title} ) =~ s/\.$//;
|
||||
@words = split( /\s+/, $title );
|
||||
|
||||
#
|
||||
# Remove all unwanted characters and force to lowercase (use 'fc' for case
|
||||
# folding since it's aware of character sets)
|
||||
#
|
||||
foreach my $word (@words) {
|
||||
$word =~ s{[^a-zA-Z./_-]}{}g;
|
||||
$word = fc($word);
|
||||
}
|
||||
|
||||
#
|
||||
# Clean up the word list after removing unwanted characters
|
||||
#
|
||||
@words = grep { $_ !~ /^-*$/ } @words;
|
||||
|
||||
#
|
||||
# Extract pairs of words before they're made unique and make a hash
|
||||
# pointing to the episodes they originated from
|
||||
#
|
||||
for (my $i = 0; $i < $#words; $i++) {
|
||||
$pair = "$words[$i] $words[$i+1]";
|
||||
|
||||
if ( exists( $double_words{$pair} ) ) {
|
||||
$double_words{$pair}->[0] += 1;
|
||||
push( @{ $double_words{$pair} },
|
||||
[ $h1->{eps_id}, $h1->{title} ] );
|
||||
}
|
||||
else {
|
||||
$double_words{$pair} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# Make the word list unique
|
||||
#
|
||||
@words = uniq(@words);
|
||||
|
||||
#
|
||||
# Walk the tidied single word list
|
||||
#
|
||||
foreach my $word (@words) {
|
||||
#
|
||||
# Ignore very short words and words in the ignore list
|
||||
#
|
||||
next if length($word) < 2;
|
||||
next if grep( /^$word$/, @ignore );
|
||||
|
||||
#
|
||||
# Save this word in the episodes hash
|
||||
#
|
||||
push( @{ $eps{ $h1->{eps_id} } }, $word );
|
||||
|
||||
#
|
||||
# If the word is not known initialise the entry containing an arrayref
|
||||
# with a counter and another arrayref with the saved episode number
|
||||
# and title. If it's known, increment the counter and stash the
|
||||
# episode details as another arrayref.
|
||||
#
|
||||
if ( exists( $single_words{$word} ) ) {
|
||||
$single_words{$word}->[0] += 1;
|
||||
push( @{ $single_words{$word} },
|
||||
[ $h1->{eps_id}, $h1->{title} ] );
|
||||
}
|
||||
else {
|
||||
$single_words{$word} = [ 1, [ $h1->{eps_id}, $h1->{title} ] ];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# We've finished with the database
|
||||
#
|
||||
$dbh->disconnect;
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Done the first pass, prepare for the next
|
||||
#-------------------------------------------------------------------------------
|
||||
newOutput( \$outfh, $file_template, $phase++ );
|
||||
|
||||
#
|
||||
# Process the saved data in increasing order of the frequency. Print the word
|
||||
# and its frequency and follow that by the stashed episode details in the
|
||||
# order we saw them
|
||||
#
|
||||
foreach my $key (
|
||||
sort { $single_words{$a}->[0] <=> $single_words{$b}->[0] }
|
||||
sort( keys(%single_words) )
|
||||
)
|
||||
{
|
||||
if ( $single_words{$key}->[0] > 3 ) {
|
||||
printf $outfh "%15s: %s\n", $key, $single_words{$key}->[0];
|
||||
for ( my $i = 1; $i <= $single_words{$key}->[0]; $i++ ) {
|
||||
printf $outfh "%17s%4d: %s", ' ', @{ $single_words{$key}->[$i] };
|
||||
@taglist
|
||||
= split( /\s*,\s*/, $tags{ $single_words{$key}->[$i]->[0] } );
|
||||
if (@taglist) {
|
||||
print $outfh " [", join( ",", @taglist ), "]\n";
|
||||
}
|
||||
else {
|
||||
print $outfh "\n";
|
||||
}
|
||||
}
|
||||
print $outfh "\n";
|
||||
}
|
||||
}
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Done the second pass, prepare for the next
|
||||
#-------------------------------------------------------------------------------
|
||||
newOutput( \$outfh, $file_template, $phase++ );
|
||||
|
||||
#
|
||||
# Look through the collected data from the point of view of the episode, list
|
||||
# all the (relevant) words in the title in order and report their frequencies
|
||||
#
|
||||
for my $key ( sort { $a <=> $b } keys(%eps) ) {
|
||||
printf $outfh "%4d: %s\n", $key, $eps{$key}->[0];
|
||||
for ( my $i = 1; $i < scalar( @{ $eps{$key} } ); $i++ ) {
|
||||
my $word = $eps{$key}->[$i];
|
||||
printf $outfh " %15s %d\n", $word, $single_words{$word}->[0];
|
||||
}
|
||||
print $outfh "\n";
|
||||
}
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Done the third pass, prepare for the next
|
||||
#-------------------------------------------------------------------------------
|
||||
newOutput( \$outfh, $file_template, $phase++ );
|
||||
|
||||
#
|
||||
# So the pairs of words we collected earlier might show something interesting.
|
||||
# Let's see.
|
||||
#
|
||||
foreach my $key (
|
||||
sort { $double_words{$a}->[0] <=> $double_words{$b}->[0] }
|
||||
sort( keys(%double_words) )
|
||||
)
|
||||
{
|
||||
if ( $double_words{$key}->[0] > 3 ) {
|
||||
printf $outfh "%15s: %s\n", $key, $double_words{$key}->[0];
|
||||
for ( my $i = 1; $i <= $double_words{$key}->[0]; $i++ ) {
|
||||
printf $outfh "%17s%4d: %s", ' ', @{ $double_words{$key}->[$i] };
|
||||
@taglist
|
||||
= split( /\s*,\s*/, $tags{ $double_words{$key}->[$i]->[0] } );
|
||||
if (@taglist) {
|
||||
print $outfh " [", join( ",", @taglist ), "]\n";
|
||||
}
|
||||
else {
|
||||
print $outfh "\n";
|
||||
}
|
||||
}
|
||||
print $outfh "\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
close($outfh);
|
||||
|
||||
exit;
|
||||
|
||||
#=== FUNCTION ================================================================
|
||||
# NAME: newOutput
|
||||
# PURPOSE: Generate a new output file
|
||||
# PARAMETERS: $fh a scalar ref pointing to a variable to hold a
|
||||
# file handle
|
||||
# $template a string suitable for sprintf for defining the
|
||||
# name of the output file
|
||||
# $phase an integer to be incorporated into the output
|
||||
# file name
|
||||
# RETURNS: Nothing
|
||||
# DESCRIPTION: Closes any existing file and opens a new one with the same
|
||||
# file handle. The name of the file is derived from the template
|
||||
# and the phase number.
|
||||
# THROWS: No exceptions
|
||||
# COMMENTS: None
|
||||
# SEE ALSO: N/A
|
||||
#===============================================================================
|
||||
sub newOutput {
|
||||
my ( $fh, $template, $phase ) = @_;
|
||||
|
||||
if ($$fh) {
|
||||
close($$fh) if ( tell($$fh) > -1 );
|
||||
}
|
||||
|
||||
my $outfile = sprintf( $template, $phase );
|
||||
open( $$fh, '>:encoding(UTF-8)', $outfile )
|
||||
or die "Unable to open $outfile\n";
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
Reference in New Issue
Block a user