hpr-tools/Database/make_tag_index

519 lines
13 KiB
Perl
Executable File

#!/usr/bin/env perl
#===============================================================================
#
# FILE: make_tag_index
#
# USAGE: ./make_tag_index [-help] [-debug=N] [-out=FILE] [-config=FILE]
#
# DESCRIPTION: Make tag lookup pages for the HPR website
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.2
# CREATED: 2022-09-08 11:52:53
# REVISION: 2022-09-10 14:59:38
#
#===============================================================================
use v5.16;
use strict;
use warnings;
use utf8;
use feature qw{ postderef say signatures state };
no warnings qw{ experimental::postderef experimental::signatures };
use Getopt::Long;
use Pod::Usage;
use Config::General;
use Template;
use Template::Filters;
Template::Filters->use_html_entities; # Use HTML::Entities in the template
use Text::CSV_XS;
use DBI;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.2';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my $template = "$basedir/$PROG.tpl";
my ( $dbh, $sth1, $h1 );
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
my $DEF_DEBUG = 0;
#
# Process options
#
my %options;
Options( \%options );
#
# Default help
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
if ( $options{'help'} );
#
# Full documentation if requested with -doc
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
if ( $options{'doc'} );
#
# Collect options
#
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEF_DEBUG );
my $cfgfile
= ( defined( $options{config} ) ? $options{config} : $configfile );
my $templatefile = $options{template};
my $outfile = $options{out};
#-------------------------------------------------------------------------------
# Template is the default pre-defined string or a filename
#-------------------------------------------------------------------------------
if ($templatefile) {
die "Unable to find template $templatefile\n" unless ( -e $templatefile );
}
else {
$templatefile = $template;
}
#-------------------------------------------------------------------------------
# Open the output file (or STDOUT)
#-------------------------------------------------------------------------------
my $outfh;
if ($outfile) {
open( $outfh, ">:encoding(UTF-8)", $outfile )
or die "Unable to open $outfile for writing: $!";
}
else {
open( $outfh, ">&", \*STDOUT )
or die "Unable to initialise for writing: $!";
}
#
# Sanity check
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );
#
# Load configuration data
#
my $conf = new Config::General(
-ConfigFile => $cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1
);
my %config = $conf->getall();
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
$dbuser, $dbpwd, { AutoCommit => 1 } )
or die $DBI::errstr;
#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;
#-------------------------------------------------------------------------------
# Perform a scan of episodes for tags and accumulate them in a hash
#-------------------------------------------------------------------------------
$sth1 = $dbh->prepare(q{SELECT id,title,tags FROM eps WHERE length(tags) > 0})
or die $DBI::errstr;
$sth1->execute;
if ( $dbh->err ) {
warn $dbh->errstr;
}
my ( $status, @fields, %tag_ids, $lastkey, @tagindex, %showtitles );
my $csv = Text::CSV_XS->new(
{ binary => 1,
auto_diag => 1,
escape_char => "\\",
allow_loose_quotes => 1
}
);
#
# Loop through the episodes returned by the query
#
while ( $h1 = $sth1->fetchrow_hashref ) {
#
# Stash the show title with the show number
#
$showtitles{ $h1->{id} } = $h1->{title};
#
# Parse the tag list for the current episode
#
$status = $csv->parse( $h1->{tags} );
unless ($status) {
#
# Report any errors
#
print "Parse error on episode ", $h1->{id}, "\n";
print $csv->error_input(), "\n";
next;
}
@fields = $csv->fields();
#
# Not sure why there are no tags but if not ignore this episode
#
next unless (@fields);
#
# Trim and lowercase all tags
#
@fields = map {
my $t = $_;
$t =~ s/(^\s+|\s+$)//g;
lc($t)
} @fields;
#
# Loop through the tags. For each tag add the associated episode id to the
# %tag_ids hash. The key to this hash is the lower case tag and the value
# is an array of episode numbers.
#
foreach my $tag (@fields) {
if ( defined( $tag_ids{$tag} ) ) {
#
# Add to the existing array
#
push( @{ $tag_ids{$tag} }, $h1->{id} );
}
else {
#
# Create the episode array
#
$tag_ids{$tag} = [ $h1->{id} ];
}
}
}
#
# Dumps the whole tags table. Warning!
#
_debug( $DEBUG > 2, '%tag_ids: ' . Dumper( \%tag_ids ) );
#-------------------------------------------------------------------------------
# Make an alphabetic index of the tags
#-------------------------------------------------------------------------------
$lastkey = '';
foreach my $tag ( sort( keys(%tag_ids) ) ) {
if ( substr( $tag, 0, 1 ) ne $lastkey ) {
$lastkey = substr( $tag, 0, 1 );
push( @tagindex, $tag );
}
}
_debug( $DEBUG > 1, '@tagindex: ' . Dumper( \@tagindex ) );
#-------------------------------------------------------------------------------
# Fill and print the template
#-------------------------------------------------------------------------------
my $tt = Template->new(
{ ABSOLUTE => 1,
ENCODING => 'utf8',
INCLUDE_PATH => $basedir,
OUTPUT_PATH => '.',
}
);
my $vars = {
tag_ids => \%tag_ids,
tagindex => \@tagindex,
titles => \%showtitles,
};
my $document;
$tt->process( $templatefile, $vars, \$document, { binmode => ':utf8' } )
|| die $tt->error(), "\n";
print $outfh $document;
close($outfh);
$dbh->disconnect;
exit;
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $message Message to print
# RETURNS: Nothing
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
# newline and then adds one in the 'print' to the caller doesn't
# have to bother. Prepends the message with 'D> ' to show it's
# a debug message.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my ( $active, $message ) = @_;
chomp($message);
print "D> $message\n" if $active;
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options
= ( "help", "doc", "debug=i", "template=s", "out=s", "config=s", );
if ( !GetOptions( $optref, @options ) ) {
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
make_tag_index - Generate a tag index from the tags in the database
=head1 VERSION
This documentation refers to make_tag_index version 0.0.2
=head1 USAGE
./make_tag_index [-help] [-doc] [-debug=N] [-template=FILE] [-out=FILE]
[-config=FILE]
./make_tag_index -help
./make_tag_index -doc
./make_tag_index -out=tags.php
./make_tag_index -template=MTI_1.tpl -out=tags.php
./make_tag_index -config=$HOME/HPR/.hpr_livedb.cfg -out=tags.php
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message describing the usage of the program, and then exits.
=item B<-doc>
Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use:
pod2pdf make_tag_index --out=make_tag_index.pdf
=item B<-debug=N>
Causes certain debugging information to be displayed.
0 (the default) no debug output
1 N/A
2 dumps @tagindex an array containing tags and show numbers for the index
3 dumps %tag_ids the data used to build the entire tag list (warning!)
=item B<-out=FILE>
This option defines an output file to receive the report. If the option is
omitted the report is written to STDOUT, allowing it to be redirected if
required.
=item B<-config=FILE>
This option allows an alternative configuration file to be used. This file
defines the location of the database, its port, its name and the username and
password to be used to access it. This feature was added to allow the script
to access alternative databases or the live database over an SSH tunnel.
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
If the option is omitted the default file is used: B<.hpr_db.cfg>
=item B<-template=FILE>
This option defines the template used to generate the tag index. The template
is written using the B<Template> toolkit language.
If the option is omitted then the script uses the file
B<make_tag_index.tpl> in the same directory as the script. If this file
does not exist then the script will exit with an error message.
=back
=head1 DESCRIPTION
The script reads all episodes in the HPR database. Each row contains a 'tags'
field which contains tags as a comma-separated list. This list is parsed and
stored in a Perl hash. The hash is keyed by the lower-case tag and the value
part of each hash element contains a Perl arrayref containing a list of show
numbers. The tag/show hash is called B<%tag_ids>. There are over 5800 tags in
the system in September 2022.
An array called B<@tagindex> is also created which holds the first tag of each
group starting with the same character. So, with a particular tag population,
the 'a' group might start with 'aaron newcomb', 'b' with 'b+ tree' and so
forth.
A further hash called B<%showtitles> is indexed by show number and holds the
title of the show. This has been added in preparation for producing a tag
index pages which have better accessibility features.
=head1 DIAGNOSTICS
=over 4
=item B<Unable to find template ...>
Type: fatal
The template specified does not exist.
=item B<Unable to open ... for writing: ...>
Type: fatal
The nominated output file cannot be written to.
=item B<Unable to initialise for writing: ...>;
Type: fatal
Trying to write to STDOUT has failed.
=item B<Unable to find ...>
Type: fatal
The nominated configuration file cannot be found
=item B<various database errors>
Type: fatal
Failure while opening the database or preparing a query.
=item B<Errors from Template Toolkit>
Type: fatal
The template could not be processed
=back
=head1 CONFIGURATION AND ENVIRONMENT
The script obtains the credentials it requires to open the HPR database from
a configuration file. The name of the file it expects is B<.hpr_db.cfg> in the
directory holding the script. To change this will require changing the script.
The configuration file format is as follows:
<database>
host = 127.0.0.1
port = PORT
name = DATABASE
user = USERNAME
password = PASSWORD
</database>
=head1 DEPENDENCIES
Config::General
DBI
Data::Dumper
Getopt::Long
Pod::Usage
Template
Template::Filters
Text::CSV_XS
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com)
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2022 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker