forked from HPR/hpr-tools
		
	
		
			
				
	
	
		
			658 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			658 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env perl
 | 
						|
#===============================================================================
 | 
						|
#
 | 
						|
#         FILE: convert_latin1
 | 
						|
#
 | 
						|
#        USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
#               [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
 | 
						|
#
 | 
						|
#  DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR
 | 
						|
#               database
 | 
						|
#
 | 
						|
#      OPTIONS: ---
 | 
						|
# REQUIREMENTS: ---
 | 
						|
#         BUGS: ---
 | 
						|
#        NOTES: ---
 | 
						|
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | 
						|
#      VERSION: 0.1.2
 | 
						|
#      CREATED: 2023-05-04 10:07:04
 | 
						|
#     REVISION: 2023-05-08 12:15:49
 | 
						|
#
 | 
						|
#===============================================================================
 | 
						|
 | 
						|
use v5.16;
 | 
						|
use strict;
 | 
						|
use warnings;
 | 
						|
#use utf8;
 | 
						|
 | 
						|
# Using experimental features, some of which require warnings to be turned off
 | 
						|
use feature qw{ postderef say signatures state try };
 | 
						|
no warnings qw{
 | 
						|
    experimental::postderef
 | 
						|
    experimental::signatures
 | 
						|
    experimental::try
 | 
						|
};
 | 
						|
 | 
						|
use Getopt::Long;
 | 
						|
use Pod::Usage;
 | 
						|
 | 
						|
use Config::General;
 | 
						|
 | 
						|
#use Encode qw( encode decode is_utf8 );
 | 
						|
#use Try::Tiny;
 | 
						|
#use TryCatch;
 | 
						|
 | 
						|
use SQL::Abstract;
 | 
						|
use DBI;
 | 
						|
 | 
						|
use Log::Handler;
 | 
						|
use Log::Handler::Output::File;
 | 
						|
 | 
						|
use Data::Dumper;
 | 
						|
 | 
						|
#
 | 
						|
# Version number (manually incremented)
 | 
						|
#
 | 
						|
our $VERSION = '0.1.2';
 | 
						|
 | 
						|
#
 | 
						|
# Script and directory names
 | 
						|
#
 | 
						|
# $PROG is $0 with everything up to the last '/' removed.
my $PROG = $0;
$PROG =~ s{.*/}{}mx;

# $DIR is $0 with the final path component (and its leading '/') removed;
# fall back to the current directory when nothing is left.
my $DIR = $0;
$DIR =~ s{/?[^/]*$}{}mx;
$DIR = '.' if !$DIR;
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Declarations
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
#
 | 
						|
# Constants and other declarations
 | 
						|
#
 | 
						|
# Default locations: everything lives under HPR/Database in the user's home
# directory; the log file is named after the script.
my $basedir    = $ENV{HOME} . '/HPR/Database';
my $configfile = $basedir . '/.hpr_db.cfg';
my $logfile    = $basedir . '/' . $PROG . '.log';

# Database handle, statement handles, current row, and general work variables
# shared by the main part of the script.
my ( $dbh, $sth1, $sth2, $h1, $sql, $utf8, $viewed );
 | 
						|
 | 
						|
#
 | 
						|
# Map of latin1 characters with their Unicode equivalents {{{
 | 
						|
#
 | 
						|
# Commented out 2023-05-10 since no longer wanted
 | 
						|
#
 | 
						|
#my %map_latin1 = (
 | 
						|
#    q{€â‚¬}  => "\N{U+20AC}",
 | 
						|
#    q{ÀÀ}   => "\N{U+00C0}",
 | 
						|
#    q{ÁÃ}    => "\N{U+00C1}",
 | 
						|
#    q{‚‚}  => "\N{U+201A}",
 | 
						|
#    q{ÂÂ}   => "\N{U+00C2}",
 | 
						|
#    q{ƒÆ’}   => "\N{U+0192}",
 | 
						|
#    q{ÃÃ}   => "\N{U+00C3}",
 | 
						|
#    q{„„}  => "\N{U+201E}",
 | 
						|
#    q{ÄÄ}   => "\N{U+00C4}",
 | 
						|
#    q{……}  => "\N{U+2026}",
 | 
						|
#    q{ÅÃ…}   => "\N{U+00C5}",
 | 
						|
#    q{†â€}   => "\N{U+2020}",
 | 
						|
#    q{ÆÃ†}   => "\N{U+00C6}",
 | 
						|
#    q{‡â€¡}  => "\N{U+2021}",
 | 
						|
#    q{ÇÇ}   => "\N{U+00C7}",
 | 
						|
#    q{ˆË†}   => "\N{U+02C6}",
 | 
						|
#    q{ÈÈ}   => "\N{U+00C8}",
 | 
						|
#    q{‰â€°}  => "\N{U+2030}",
 | 
						|
#    q{ÉÉ}   => "\N{U+00C9}",
 | 
						|
#    q{ŠÅ}    => "\N{U+0160}",
 | 
						|
#    q{ÊÊ}   => "\N{U+00CA}",
 | 
						|
#    q{‹â€¹}  => "\N{U+2039}",
 | 
						|
#    q{ËË}   => "\N{U+00CB}",
 | 
						|
#    q{ŒÅ’}   => "\N{U+0152}",
 | 
						|
#    q{ÌÃŒ}   => "\N{U+00CC}",
 | 
						|
#    q{ÍÃ}    => "\N{U+00CD}",
 | 
						|
#    q{ŽÅ½}   => "\N{U+017D}",
 | 
						|
#    q{ÎÃŽ}   => "\N{U+00CE}",
 | 
						|
#    q{ÏÃ}    => "\N{U+00CF}",
 | 
						|
#    q{ÐÃ}    => "\N{U+00D0}",
 | 
						|
#    q{‘‘}  => "\N{U+2018}",
 | 
						|
#    q{ÑÑ}   => "\N{U+00D1}",
 | 
						|
#    q{Չ۪}  => "\N{U+2019}",
 | 
						|
#    q{ÒÃ’}   => "\N{U+00D2}",
 | 
						|
#    q{““}  => "\N{U+201C}",
 | 
						|
#    q{ÓÓ}   => "\N{U+00D3}",
 | 
						|
#    q{”â€}   => "\N{U+201D}",
 | 
						|
#    q{ÔÔ}   => "\N{U+00D4}",
 | 
						|
#    q{••}  => "\N{U+2022}",
 | 
						|
#    q{ÕÕ}   => "\N{U+00D5}",
 | 
						|
#    q{––}  => "\N{U+2013}",
 | 
						|
#    q{ÖÖ}   => "\N{U+00D6}",
 | 
						|
#    q{——}  => "\N{U+2014}",
 | 
						|
#    q{××}   => "\N{U+00D7}",
 | 
						|
#    q{˜Ëœ}   => "\N{U+02DC}",
 | 
						|
#    q{ØÃ˜}   => "\N{U+00D8}",
 | 
						|
#    q{™â„¢}  => "\N{U+2122}",
 | 
						|
#    q{ÙÙ}   => "\N{U+00D9}",
 | 
						|
#    q{šÅ¡}   => "\N{U+0161}",
 | 
						|
#    q{ÚÚ}   => "\N{U+00DA}",
 | 
						|
#    q{݉ۼ}  => "\N{U+203A}",
 | 
						|
#    q{ÛÛ}   => "\N{U+00DB}",
 | 
						|
#    q{œÅ“}   => "\N{U+0153}",
 | 
						|
#    q{ÜÜ}   => "\N{U+00DC}",
 | 
						|
#    q{ÝÃ}    => "\N{U+00DD}",
 | 
						|
#    q{žÅ¾}   => "\N{U+017E}",
 | 
						|
#    q{ÞÞ}   => "\N{U+00DE}",
 | 
						|
#    q{ŸÅ¸}   => "\N{U+0178}",
 | 
						|
#    q{ßß}   => "\N{U+00DF}",
 | 
						|
#    q{Â}     => "\N{U+00A0}",
 | 
						|
#    q{àÃ}    => "\N{U+00E0}",
 | 
						|
#    q{¡Â¡}   => "\N{U+00A1}",
 | 
						|
#    q{áá}   => "\N{U+00E1}",
 | 
						|
#    q{¢Â¢}   => "\N{U+00A2}",
 | 
						|
#    q{ââ}   => "\N{U+00E2}",
 | 
						|
#    q{£Â£}   => "\N{U+00A3}",
 | 
						|
#    q{ãã}   => "\N{U+00E3}",
 | 
						|
#    q{¤Â¤}   => "\N{U+00A4}",
 | 
						|
#    q{ää}   => "\N{U+00E4}",
 | 
						|
#    q{¥Â¥}   => "\N{U+00A5}",
 | 
						|
#    q{åÃ¥}   => "\N{U+00E5}",
 | 
						|
#    q{¦Â¦}   => "\N{U+00A6}",
 | 
						|
#    q{æÃ¦}   => "\N{U+00E6}",
 | 
						|
#    q{§Â§}   => "\N{U+00A7}",
 | 
						|
#    q{çç}   => "\N{U+00E7}",
 | 
						|
#    q{¨Â¨}   => "\N{U+00A8}",
 | 
						|
#    q{èè}   => "\N{U+00E8}",
 | 
						|
#    q{©Â©}   => "\N{U+00A9}",
 | 
						|
#    q{éé}   => "\N{U+00E9}",
 | 
						|
#    q{ªÂª}   => "\N{U+00AA}",
 | 
						|
#    q{êê}   => "\N{U+00EA}",
 | 
						|
#    q{«Â«}   => "\N{U+00AB}",
 | 
						|
#    q{ëë}   => "\N{U+00EB}",
 | 
						|
#    q{¬Â¬}   => "\N{U+00AC}",
 | 
						|
#    q{ìì}   => "\N{U+00EC}",
 | 
						|
#    q{Â}   => "\N{U+00AD}",
 | 
						|
#    q{íÃ}   => "\N{U+00ED}",
 | 
						|
#    q{®Â®}   => "\N{U+00AE}",
 | 
						|
#    q{îî}   => "\N{U+00EE}",
 | 
						|
#    q{¯Â¯}   => "\N{U+00AF}",
 | 
						|
#    q{ïï}   => "\N{U+00EF}",
 | 
						|
#    q{°Â°}   => "\N{U+00B0}",
 | 
						|
#    q{ðð}   => "\N{U+00F0}",
 | 
						|
#    q{±Â±}   => "\N{U+00B1}",
 | 
						|
#    q{ññ}   => "\N{U+00F1}",
 | 
						|
#    q{²Â²}   => "\N{U+00B2}",
 | 
						|
#    q{òò}   => "\N{U+00F2}",
 | 
						|
#    q{³Â³}   => "\N{U+00B3}",
 | 
						|
#    q{óó}   => "\N{U+00F3}",
 | 
						|
#    q{´Â´}   => "\N{U+00B4}",
 | 
						|
#    q{ôô}   => "\N{U+00F4}",
 | 
						|
#    q{µÂµ}   => "\N{U+00B5}",
 | 
						|
#    q{õõ}   => "\N{U+00F5}",
 | 
						|
#    q{¶Â¶}   => "\N{U+00B6}",
 | 
						|
#    q{öö}   => "\N{U+00F6}",
 | 
						|
#    q{·Â·}   => "\N{U+00B7}",
 | 
						|
#    q{÷÷}   => "\N{U+00F7}",
 | 
						|
#    q{¸Â¸}   => "\N{U+00B8}",
 | 
						|
#    q{øÃ¸}   => "\N{U+00F8}",
 | 
						|
#    q{¹Â¹}   => "\N{U+00B9}",
 | 
						|
#    q{ùù}   => "\N{U+00F9}",
 | 
						|
#    q{ºÂº}   => "\N{U+00BA}",
 | 
						|
#    q{úú}   => "\N{U+00FA}",
 | 
						|
#    q{»Â»}   => "\N{U+00BB}",
 | 
						|
#    q{ûû}   => "\N{U+00FB}",
 | 
						|
#    q{¼Â¼}   => "\N{U+00BC}",
 | 
						|
#    q{üü}   => "\N{U+00FC}",
 | 
						|
#    q{½Â½}   => "\N{U+00BD}",
 | 
						|
#    q{ýý}   => "\N{U+00FD}",
 | 
						|
#    q{¾Â¾}   => "\N{U+00BE}",
 | 
						|
#    q{þþ}   => "\N{U+00FE}",
 | 
						|
#    q{¿Â¿}   => "\N{U+00BF}",
 | 
						|
#    q{ÿÿ}   => "\N{U+00FF}",
 | 
						|
#);
 | 
						|
 | 
						|
#
 | 
						|
# Build a regex from all of the hash keys
 | 
						|
#
 | 
						|
#my $regex = join('|',sort(keys(%map_latin1)));
 | 
						|
#$regex=qr{$regex};
 | 
						|
 | 
						|
#}}}
 | 
						|
 | 
						|
#
 | 
						|
# Enable Unicode output mode
 | 
						|
#
 | 
						|
# Put both standard output streams into UTF-8 encoding mode
for my $stream ( \*STDOUT, \*STDERR ) {
    binmode $stream, ":encoding(UTF-8)";
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Options and arguments {{{
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
#
 | 
						|
# Process options
 | 
						|
#
 | 
						|
# Parse the command line into %options
my %options;
Options( \%options );

# -help: brief usage message, then exit with status 1
if ( $options{'help'} ) {
    pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}

# -doc: the full POD documentation, then exit with status 1
if ( $options{'doc'} ) {
    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -verbose => 2,
        -exitval => 1
    );
}

# Pick up the option values, applying defaults where an option was not given.
# $field is left undefined here if -field was omitted.
my $cfgfile = $options{config}    // $configfile;
my $dry_run = $options{'dry-run'} // 0;
my $verbose = $options{verbose}   // 0;
my $field   = $options{field};
my $skip    = $options{skip}  // 0;
my $limit   = $options{limit} // 0;
 | 
						|
 | 
						|
# }}}
 | 
						|
 | 
						|
#
 | 
						|
# Sanity checks
 | 
						|
#
 | 
						|
# The configuration file must exist before we try to load it
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );

# Validate the field name, defaulting to 'title' when none was given.
# The name is later interpolated directly into SQL text, so the match must be
# anchored: only an exact (case-insensitive) match of one of the four
# permitted column names is accepted, not a string that merely contains one.
if ($field) {
    $field = lc($field);
    die "Invalid value for -field=FIELD\n"
        unless ( $field =~ /\A(?:title|summary|tags|notes)\z/ );
}
else {
    $field = 'title';
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Load configuration data
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Parse the configuration file (with variable interpolation and extended
# access enabled) and pull the whole of its contents into %config.
# The constructor is called as a class method rather than with the indirect
# object syntax ("new Config::General"), which perlobj discourages.
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Connect to the database
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Connection settings come from the <database> section of the configuration;
# host and port have defaults, the rest are used as given.
my $dbhost = $config{database}{host} // '127.0.0.1';
my $dbport = $config{database}{port} // 3306;
my $dbname = $config{database}{name};
my $dbuser = $config{database}{user};
my $dbpwd  = $config{database}{password};

# Open the connection with AutoCommit on; give up with the DBI error text if
# that fails.
my $dsn     = "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname";
my %dbattrs = ( AutoCommit => 1 );
$dbh = DBI->connect( $dsn, $dbuser, $dbpwd, \%dbattrs );
die $DBI::errstr unless $dbh;

# Enable client-side UTF8.
# NOTE(review): the DBD::mysql documentation says that setting this attribute
# after connecting does not issue 'SET NAMES utf8' - confirm this is what is
# wanted here.
$dbh->{mysql_enable_utf8} = 1;
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Set up logging keeping the default log layout except for the date
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# File output definition: messages at levels 0 through 7 are appended to
# $logfile with a 'YYYY-MM-DD HH:MM:SS' timestamp, written as UTF-8.
my %file_output = (
    filename   => $logfile,
    timeformat => "%Y-%m-%d %H:%M:%S",
    minlevel   => 0,
    maxlevel   => 7,
    utf8       => 1,
);

my $log = Log::Handler->new();
$log->add( file => \%file_output );

# Record the settings in force for this run; the optional ones are only
# mentioned when they are active.
my @settings = (
    "---- Running version $VERSION",
    "Configuration file $cfgfile",
    "Processing field '$field'",
);
push @settings, "Skipping $skip non-ASCII rows" if $skip;
push @settings, "Update limit is $limit"        if $limit;
push @settings, "Dry-run mode"                  if $dry_run;

$log->info($_) for @settings;
 | 
						|
 | 
						|
#
 | 
						|
# Adjust limit
 | 
						|
#
 | 
						|
# The main loop counts every row it fetches, including skipped ones, so a
# requested limit must be offset by the skip count. A limit of 0 means "no
# limit" and has to stay 0: adding the skip count to it would make the loop
# stop as soon as the skipped rows had been passed, processing nothing.
$limit += $skip if $limit;
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Perform a scan of episodes for the chosen field which contains non-ASCII
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Query returning the id and the chosen field of every episode where the
# field differs from its own conversion to ASCII, in id order.
$sql = "SELECT id,$field FROM eps"
    . " WHERE $field <> CONVERT($field USING ASCII) ORDER BY id";

$sth1 = $dbh->prepare($sql);
die $DBI::errstr unless $sth1;

# Run the query; a failure here is only reported as a warning
$sth1->execute;
warn $dbh->errstr if $dbh->err;

# SQL::Abstract is used to generate the 'WHERE' clause of each update
my $sqla = SQL::Abstract->new;

# First part of the per-row UPDATE statement; the 'WHERE' clause is appended
# to it for each row.
my $stmt1
    = "UPDATE eps SET $field = CONVERT(BINARY CONVERT($field USING latin1) USING utf8)";
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Loop through what we get from the main query, attempting to convert each field
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Walk through the rows returned by the main query. Every fetched row is
# counted in $viewed; the first $skip rows are passed over, and the loop stops
# once $viewed reaches a non-zero $limit.
$viewed = 0;

while ( $h1 = $sth1->fetchrow_hashref ) {
    $viewed++;

    if ( $viewed > $skip ) {

        # Build the complete UPDATE for this row, selecting it by its id
        my ( $stmt2, @bind ) = $sqla->where( { id => $h1->{id} } );
        my $stmt = $stmt1 . $stmt2;

        if ( !$dry_run ) {
            $sth2 = $dbh->prepare($stmt) or die $DBI::errstr;

            # A failing update is trapped and logged so that the remaining
            # rows can still be processed.
            try {
                $sth2->execute(@bind)
                    or die $DBI::errstr;
                $log->info("Updated $field field for row $h1->{id}");
            }
            catch ($e) {
                $log->info("Failed to update $field field for row $h1->{id}");
                $log->info("Error: $e");
            }
        }
        else {
            # Dry-run: only report what would have been done. In verbose mode
            # the row is shown too, but the contents of 'notes' are not
            # printed.
            if ($verbose) {
                my $shown = $field eq 'notes' ? '' : $h1->{$field};
                printf "[%04d] %s\n", $h1->{id}, $shown;
            }

            say "SQL: ${stmt}";
            say "Arguments: ", join( ',', @bind );
        }
    }

    # The limit check applies to skipped rows as well as processed ones
    if ( $limit && $viewed >= $limit ) {
        $log->info("Update limit reached");
        last;
    }
}

exit;
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: Options
 | 
						|
#      PURPOSE: Processes command-line options
 | 
						|
#   PARAMETERS: $optref     Hash reference to hold the options
 | 
						|
#      RETURNS: Undef
 | 
						|
#  DESCRIPTION:
 | 
						|
#       THROWS: no exceptions
 | 
						|
#     COMMENTS: none
 | 
						|
#     SEE ALSO: n/a
 | 
						|
#===============================================================================
 | 
						|
sub Options {
    my $opts = shift;

    # Getopt::Long specifications: two plain flags, two negatable flags, two
    # string-valued options and two integer-valued options.
    my @specs = qw{
        help     doc     dry-run! verbose!
        config=s field=s skip=i   limit=i
    };

    # Store the parsed values in the caller's hash; on a parsing failure show
    # the usage message and exit with status 1.
    GetOptions( $opts, @specs )
        or pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );

    return;
}
 | 
						|
 | 
						|
__END__
 | 
						|
 | 
						|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | 
						|
#  Application Documentation
 | 
						|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | 
						|
#{{{
 | 
						|
 | 
						|
=head1 NAME
 | 
						|
 | 
						|
convert_latin1 - a script to convert fields in the HPR database to UTF-8
 | 
						|
 | 
						|
=head1 VERSION
 | 
						|
 | 
						|
This documentation refers to convert_latin1 version 0.1.2
 | 
						|
 | 
						|
 | 
						|
=head1 USAGE
 | 
						|
 | 
						|
    ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
 | 
						|
        [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
 | 
						|
 | 
						|
    ./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title
 | 
						|
    ./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes
 | 
						|
        -limit=10
 | 
						|
 | 
						|
 | 
						|
=head1 OPTIONS
 | 
						|
 | 
						|
=over 8
 | 
						|
 | 
						|
=item B<-help>
 | 
						|
 | 
						|
Prints a brief help message describing the usage of the program, and then exits.
 | 
						|
 | 
						|
=item B<-doc>
 | 
						|
 | 
						|
Displays the entirety of the documentation (using a pager), and then exits. To
 | 
						|
generate a PDF version use:
 | 
						|
 | 
						|
    pod2pdf convert_latin1 --out=convert_latin1.pdf
 | 
						|
 | 
						|
=item B<-config=FILE>
 | 
						|
 | 
						|
This option allows an alternative configuration file to be used. This file
 | 
						|
defines the location of the database, its port, its name and the username and
 | 
						|
password to be used to access it. This feature was added to allow the script
 | 
						|
to access alternative databases or the live database over an SSH tunnel.
 | 
						|
 | 
						|
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
 | 
						|
 | 
						|
If the option is omitted the default file is used: B<.hpr_db.cfg>
 | 
						|
 | 
						|
=item B<-[no]dry-run>
 | 
						|
 | 
						|
Controls whether the program runs in a mode where it performs database
 | 
						|
updates. When enabled the details of the updates to be performed are shown,
 | 
						|
otherwise the updates are applied. The default B<-nodry-run> allows the
 | 
						|
program to perform the changes.
 | 
						|
 | 
						|
=item B<-[no]verbose>
 | 
						|
 | 
						|
Normally very little is reported by the script, although details of errors
 | 
						|
are reported. When B<-verbose> is selected more information
 | 
						|
about the number of rows needing work, the updates performed (or which would
 | 
						|
have been performed) and how many changes were made is reported.
 | 
						|
 | 
						|
=item B<-field=FIELDNAME>
 | 
						|
 | 
						|
This option defines the database field name to be converted. The permitted
 | 
						|
names are B<title>, B<summary>, B<tags> and B<notes> and the table is assumed
 | 
						|
to be B<eps>. If the option is not provided the default field B<title> will be
 | 
						|
used.
 | 
						|
 | 
						|
=item B<-skip=N>
 | 
						|
 | 
						|
This option defines the number of database rows to skip when processing the
 | 
						|
selected field. If omitted then no rows are skipped. The option is useful to
 | 
						|
allow the work to be split into manageable batches, in conjunction with the
 | 
						|
B<-limit=N> option below.
 | 
						|
 | 
						|
=item B<-limit=N>
 | 
						|
 | 
						|
This option defines the number of database rows to work on when processing the
 | 
						|
selected field. If omitted then all rows are processed (after any skip defined
 | 
						|
with the B<-skip=N> option). The option is useful to allow the work to be split
 | 
						|
into manageable batches, in conjunction with the B<-skip=N> option above.
 | 
						|
 | 
						|
=back
 | 
						|
 | 
						|
=head1 DESCRIPTION
 | 
						|
 | 
						|
=head2 OVERVIEW
 | 
						|
 | 
						|
The script is designed to repair the HPR MySQL (MariaDB) database which holds
 | 
						|
show metadata. The database was created with 'latin1' encoding, and was later
 | 
						|
changed to use UTF-8. However, no action was taken to ensure the PHP software
 | 
						|
managing the database also used UTF-8. This meant that the 'latin1' encoded data
 | 
						|
was still being generated as Unicode UTF-8 data was being added, and was being
 | 
						|
rendered in the expected way, while there was little or no UTF-8 data being
 | 
						|
stored.
 | 
						|
 | 
						|
The PHP deficiencies were rectified in April 2023 but this meant that all
 | 
						|
non-ASCII characters stored in the database before that were rendered
 | 
						|
incorrectly. The solution was to convert all 'latin1' non-ASCII data into
 | 
						|
UTF-8, and that is what this script does.
 | 
						|
 | 
						|
Detecting non ASCII in database fields was performed with the following SQL:
 | 
						|
 | 
						|
    SELECT id,field FROM eps WHERE field <> CONVERT(field USING ASCII) ORDER BY id
 | 
						|
 | 
						|
This is used to generate a list of all rows which might need conversion to
 | 
						|
UTF-8. However, the test is only whether there is non-ASCII data in the row.
 | 
						|
 | 
						|
Ideally, the conversion could have been performed entirely within the database
 | 
						|
with SQL such as the following (for each field):
 | 
						|
 | 
						|
    UPDATE eps SET field = CONVERT(binary CONVERT(field USING latin1) USING utf8)
 | 
						|
    WHERE field <> CONVERT(field USING ASCII);
 | 
						|
 | 
						|
However, the conversion to UTF-8 fails when the field already contains such
 | 
						|
characters, stopping the query.
 | 
						|
 | 
						|
MySQL and MariaDB are capable of trapping errors (like using B<try/catch> in
 | 
						|
various programming languages), but only in stored procedures. It was felt to
 | 
						|
be undesirable to create stored procedures on the HPR database since this was
 | 
						|
only possible through B<phpMyAdmin> which is due to be phased out.
 | 
						|
 | 
						|
This script was written to enable the catching of errors instead.
 | 
						|
 | 
						|
=head2 SCRIPT DESIGN
 | 
						|
 | 
						|
The main loop returns all rows with non-ASCII characters in the field being
 | 
						|
processed. For each row an 'UPDATE' query is performed using the 'id' field
 | 
						|
(episode number) to select it:
 | 
						|
 | 
						|
    UPDATE eps SET field = CONVERT(BINARY CONVERT(field USING latin1) USING utf8)
 | 
						|
    WHERE id = value
 | 
						|
 | 
						|
This is performed inside a B<try/catch> statement so that if the query fails
 | 
						|
it does not stop the script. Successes and failures are logged.
 | 
						|
 | 
						|
This algorithm is fairly slow, particularly for the 'notes' field which has
 | 
						|
the most (nearly 600) non-ASCII rows. However, it seems to work as desired.
 | 
						|
 | 
						|
The B<-skip=N> and B<-limit=N> options allow control over the conversion
 | 
						|
process such that the work can be done in batches.
 | 
						|
 | 
						|
Note that the log file used by the script is called B<convert_latin1.log>. It
 | 
						|
is appended to on every run. The file name can only be changed by editing the
 | 
						|
script.
 | 
						|
 | 
						|
=head1 DIAGNOSTICS
 | 
						|
 | 
						|
A list of every error and warning message that the application can generate
 | 
						|
(even the ones that will "never happen"), with a full explanation of each
 | 
						|
problem, one or more likely causes, and any suggested remedies. If the
 | 
						|
application generates exit status codes (e.g. under Unix) then list the exit
 | 
						|
status associated with each error.
 | 
						|
 | 
						|
 | 
						|
=head1 CONFIGURATION AND ENVIRONMENT
 | 
						|
 | 
						|
The script obtains the credentials it requires to open the HPR database from
 | 
						|
a configuration file. The name of the file it expects is B<.hpr_db.cfg> in the
 | 
						|
directory B<$HOME/HPR/Database>. This can be changed by use of the
 | 
						|
B<-config=FILE> option as described above.
 | 
						|
 | 
						|
The configuration file format is as follows:
 | 
						|
 | 
						|
 <database>
 | 
						|
     host = 127.0.0.1
 | 
						|
     port = PORT
 | 
						|
     name = DATABASE
 | 
						|
     user = USERNAME
 | 
						|
     password = PASSWORD
 | 
						|
 </database>
 | 
						|
 | 
						|
=head1 DEPENDENCIES
 | 
						|
 | 
						|
    Config::General
 | 
						|
    DBI
 | 
						|
    Data::Dumper
 | 
						|
    Getopt::Long
 | 
						|
    Log::Handler
 | 
						|
    Log::Handler::Output::File
 | 
						|
    Pod::Usage
 | 
						|
    SQL::Abstract
 | 
						|
 | 
						|
The script uses the experimental B<try> feature and disables the warning that
 | 
						|
this feature generates. Note that this feature is only available in Perl
 | 
						|
versions at 5.34.0 or above (the script was developed under v5.36.0).
 | 
						|
 | 
						|
=head1 BUGS AND LIMITATIONS
 | 
						|
 | 
						|
There are no known bugs in this module.
 | 
						|
Please report problems to Dave Morriss  (Dave.Morriss@gmail.com). Patches are
 | 
						|
welcome.
 | 
						|
 | 
						|
=head1 AUTHOR
 | 
						|
 | 
						|
Dave Morriss  (Dave.Morriss@gmail.com)
 | 
						|
 | 
						|
=head1 LICENCE AND COPYRIGHT
 | 
						|
 | 
						|
Copyright (c) 2023 Dave Morriss  (Dave.Morriss@gmail.com). All rights reserved.
 | 
						|
 | 
						|
This module is free software; you can redistribute it and/or
 | 
						|
modify it under the same terms as Perl itself. See perldoc perlartistic.
 | 
						|
 | 
						|
This program is distributed in the hope that it will be useful,
 | 
						|
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | 
						|
 | 
						|
=cut
 | 
						|
 | 
						|
#}}}
 | 
						|
 | 
						|
# [zo to open fold, zc to close]
 | 
						|
 | 
						|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 | 
						|
 |