hpr-tools/Database/convert_latin1

658 lines
19 KiB
Perl
Executable File

#!/usr/bin/env perl
#===============================================================================
#
# FILE: convert_latin1
#
# USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
#        [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
#
# DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR
# database
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.1.2
# CREATED: 2023-05-04 10:07:04
# REVISION: 2023-05-08 12:15:49
#
#===============================================================================
use v5.16;
use strict;
use warnings;
#use utf8;
# Using experimental features, some of which require warnings to be turned off
use feature qw{ postderef say signatures state try };
no warnings qw{
experimental::postderef
experimental::signatures
experimental::try
};
use Getopt::Long;
use Pod::Usage;
use Config::General;
#use Encode qw( encode decode is_utf8 );
#use Try::Tiny;
#use TryCatch;
use SQL::Abstract;
use DBI;
use Log::Handler;
use Log::Handler::Output::File;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.1.2';

#
# Script name (the last path component of $0) and the directory part of $0,
# which falls back to '.' when $0 carries no directory
#
my $PROG = $0 =~ s|.*/||mxr;
my $DIR  = ( $0 =~ s|/?[^/]*$||mxr ) || '.';

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Default file locations, all below $HOME/HPR/Database. The log file is named
# after the script.
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my $logfile    = "$basedir/${PROG}.log";

#
# Database handle, the two statement handles (query and update) and the
# current result row
#
my $dbh;
my $sth1;
my $sth2;
my $h1;

#
# Work variables
#
my $sql;
my $utf8;
my $viewed;
#
# Map of latin1 characters with their Unicode equivalents {{{
#
# Commented out 2023-05-10 since no longer wanted
#
#my %map_latin1 = (
# q{€â‚¬} => "\N{U+20AC}",
# q{ÀÀ} => "\N{U+00C0}",
# q{ÁÃ} => "\N{U+00C1}",
# q{‚‚} => "\N{U+201A}",
# q{ÂÂ} => "\N{U+00C2}",
# q{ƒÆ’} => "\N{U+0192}",
# q{ÃÃ} => "\N{U+00C3}",
# q{„„} => "\N{U+201E}",
# q{ÄÄ} => "\N{U+00C4}",
# q{……} => "\N{U+2026}",
# q{ÅÃ…} => "\N{U+00C5}",
# q{†â€} => "\N{U+2020}",
# q{ÆÆ} => "\N{U+00C6}",
# q{‡â€¡} => "\N{U+2021}",
# q{ÇÇ} => "\N{U+00C7}",
# q{ˆË†} => "\N{U+02C6}",
# q{ÈÈ} => "\N{U+00C8}",
# q{‰â€°} => "\N{U+2030}",
# q{ÉÉ} => "\N{U+00C9}",
# q{ŠÅ} => "\N{U+0160}",
# q{ÊÊ} => "\N{U+00CA}",
# q{‹â€¹} => "\N{U+2039}",
# q{ËË} => "\N{U+00CB}",
# q{ŒÅ’} => "\N{U+0152}",
# q{ÌÃŒ} => "\N{U+00CC}",
# q{ÍÃ} => "\N{U+00CD}",
# q{ŽÅ½} => "\N{U+017D}",
# q{ÎÃŽ} => "\N{U+00CE}",
# q{ÏÃ} => "\N{U+00CF}",
# q{ÐÃ} => "\N{U+00D0}",
# q{‘‘} => "\N{U+2018}",
# q{ÑÑ} => "\N{U+00D1}",
# q{Չ۪} => "\N{U+2019}",
# q{ÒÃ’} => "\N{U+00D2}",
# q{““} => "\N{U+201C}",
# q{ÓÓ} => "\N{U+00D3}",
# q{”â€} => "\N{U+201D}",
# q{ÔÔ} => "\N{U+00D4}",
# q{•â€¢} => "\N{U+2022}",
# q{ÕÕ} => "\N{U+00D5}",
# q{––} => "\N{U+2013}",
# q{ÖÖ} => "\N{U+00D6}",
# q{——} => "\N{U+2014}",
# q{××} => "\N{U+00D7}",
# q{˜Ëœ} => "\N{U+02DC}",
# q{ØØ} => "\N{U+00D8}",
# q{™â„¢} => "\N{U+2122}",
# q{ÙÙ} => "\N{U+00D9}",
# q{šÅ¡} => "\N{U+0161}",
# q{ÚÚ} => "\N{U+00DA}",
# q{݉ۼ} => "\N{U+203A}",
# q{ÛÛ} => "\N{U+00DB}",
# q{œÅ“} => "\N{U+0153}",
# q{ÜÃœ} => "\N{U+00DC}",
# q{ÝÃ} => "\N{U+00DD}",
# q{žÅ¾} => "\N{U+017E}",
# q{ÞÞ} => "\N{U+00DE}",
# q{ŸÅ¸} => "\N{U+0178}",
# q{ßß} => "\N{U+00DF}",
# q{Â} => "\N{U+00A0}",
# q{àÃ} => "\N{U+00E0}",
# q{¡Â¡} => "\N{U+00A1}",
# q{áá} => "\N{U+00E1}",
# q{¢Â¢} => "\N{U+00A2}",
# q{ââ} => "\N{U+00E2}",
# q{£Â£} => "\N{U+00A3}",
# q{ãã} => "\N{U+00E3}",
# q{¤Â¤} => "\N{U+00A4}",
# q{ää} => "\N{U+00E4}",
# q{¥Â¥} => "\N{U+00A5}",
# q{åÃ¥} => "\N{U+00E5}",
# q{¦Â¦} => "\N{U+00A6}",
# q{ææ} => "\N{U+00E6}",
# q{§Â§} => "\N{U+00A7}",
# q{çç} => "\N{U+00E7}",
# q{¨Â¨} => "\N{U+00A8}",
# q{èè} => "\N{U+00E8}",
# q{©Â©} => "\N{U+00A9}",
# q{éé} => "\N{U+00E9}",
# q{ªÂª} => "\N{U+00AA}",
# q{êê} => "\N{U+00EA}",
# q{«Â«} => "\N{U+00AB}",
# q{ëë} => "\N{U+00EB}",
# q{¬Â¬} => "\N{U+00AC}",
# q{ìì} => "\N{U+00EC}",
# q{­Â­} => "\N{U+00AD}",
# q{íí} => "\N{U+00ED}",
# q{®Â®} => "\N{U+00AE}",
# q{îî} => "\N{U+00EE}",
# q{¯Â¯} => "\N{U+00AF}",
# q{ïï} => "\N{U+00EF}",
# q{°Â°} => "\N{U+00B0}",
# q{ðð} => "\N{U+00F0}",
# q{±Â±} => "\N{U+00B1}",
# q{ññ} => "\N{U+00F1}",
# q{²Â²} => "\N{U+00B2}",
# q{òò} => "\N{U+00F2}",
# q{³Â³} => "\N{U+00B3}",
# q{óó} => "\N{U+00F3}",
# q{´Â´} => "\N{U+00B4}",
# q{ôô} => "\N{U+00F4}",
# q{µÂµ} => "\N{U+00B5}",
# q{õõ} => "\N{U+00F5}",
# q{¶Â¶} => "\N{U+00B6}",
# q{öö} => "\N{U+00F6}",
# q{·Â·} => "\N{U+00B7}",
# q{÷÷} => "\N{U+00F7}",
# q{¸Â¸} => "\N{U+00B8}",
# q{øø} => "\N{U+00F8}",
# q{¹Â¹} => "\N{U+00B9}",
# q{ùù} => "\N{U+00F9}",
# q{ºÂº} => "\N{U+00BA}",
# q{úú} => "\N{U+00FA}",
# q{»Â»} => "\N{U+00BB}",
# q{ûû} => "\N{U+00FB}",
# q{¼Â¼} => "\N{U+00BC}",
# q{üü} => "\N{U+00FC}",
# q{½Â½} => "\N{U+00BD}",
# q{ýý} => "\N{U+00FD}",
# q{¾Â¾} => "\N{U+00BE}",
# q{þþ} => "\N{U+00FE}",
# q{¿Â¿} => "\N{U+00BF}",
# q{ÿÿ} => "\N{U+00FF}",
#);
#
# Build a regex from all of the hash keys
#
#my $regex = join('|',sort(keys(%map_latin1)));
#$regex=qr{$regex};
#}}}
#
# Enable Unicode output mode
#
binmode $_, ":encoding(UTF-8)" for ( \*STDOUT, \*STDERR );
#-------------------------------------------------------------------------------
# Options and arguments {{{
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

#
# -help gives the brief usage message and -doc the full documentation; both
# leave the script with exit status 1
#
if ( $options{'help'} ) {
    pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}
if ( $options{'doc'} ) {
    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -verbose => 2,
        -exitval => 1
    );
}

#
# Collect options, falling back to a default for any that were not given.
# $field is left undefined here; its default is applied by the sanity checks.
#
my $cfgfile = $options{config}    // $configfile;
my $dry_run = $options{'dry-run'} // 0;
my $verbose = $options{verbose}   // 0;
my $field   = $options{field};
my $skip    = $options{skip}  // 0;
my $limit   = $options{limit} // 0;
# }}}
#
# Sanity checks
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );

#
# The field name is interpolated directly into the SQL built further down, so
# it must be exactly one of the permitted column names. The pattern is
# anchored at both ends; an unanchored match would accept any string that
# merely contains one of the names.
#
if ($field) {
    $field = lc($field);
    die "Invalid value for -field=FIELD\n"
        unless ( $field =~ /\A(?:title|summary|tags|notes)\z/ );
}
else {
    # No -field option given: work on the 'title' field
    $field = 'title';
}
#-------------------------------------------------------------------------------
# Load configuration data
#-------------------------------------------------------------------------------
#
# Read the configuration file with variable interpolation and extended access
# switched on, then pull everything into %config. The constructor is invoked
# as a class method (Config::General->new) to avoid indirect object syntax.
#
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
#
# Connection details come from the <database> section of the configuration;
# only the host and port have defaults
#
my $dbhost = $config{database}{host} // '127.0.0.1';
my $dbport = $config{database}{port} // 3306;
my $dbname = $config{database}{name};
my $dbuser = $config{database}{user};
my $dbpwd  = $config{database}{password};

my $dsn = "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname";
$dbh = DBI->connect( $dsn, $dbuser, $dbpwd, { AutoCommit => 1 } )
    or die $DBI::errstr;

#
# Enable client-side UTF8
#
# NOTE(review): the flag is set after the connection has been made; check the
# DBD::mysql documentation for how this differs from passing it to connect().
#
$dbh->{mysql_enable_utf8} = 1;
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

# A single file output which accepts every level from 0 to 7
$log->add(
    file => {
        filename   => $logfile,
        timeformat => "%Y-%m-%d %H:%M:%S",
        minlevel   => 0,
        maxlevel   => 7,
        utf8       => 1,
    }
);

#
# Record the settings in force for this run; the optional ones are only
# mentioned when they are active
#
$log->info($_) for (
    "---- Running version $VERSION",
    "Configuration file $cfgfile",
    "Processing field '$field'",
);
if ($skip)    { $log->info("Skipping $skip non-ASCII rows"); }
if ($limit)   { $log->info("Update limit is $limit"); }
if ($dry_run) { $log->info("Dry-run mode"); }
#
# Adjust limit. The main loop counts every row it sees (skipped ones included)
# in $viewed and stops once $viewed reaches $limit, so a real limit has to be
# extended by the number of skipped rows. A limit of 0 means "no limit" and
# must stay 0: adding $skip to it would end the loop as soon as the skipped
# rows had been passed, so -skip=N on its own would update nothing.
#
$limit += $skip if $limit;
#-------------------------------------------------------------------------------
# Perform a scan of episodes for the chosen field which contains non-ASCII
#-------------------------------------------------------------------------------
#
# Select the id and chosen field of every row where the field differs from
# its own conversion to ASCII. The field name appears three times in the
# query.
#
$sql = sprintf(
    q{SELECT id,%s FROM eps WHERE %s <> CONVERT(%s USING ASCII) ORDER BY id},
    ($field) x 3
);
$sth1 = $dbh->prepare($sql) or die $DBI::errstr;
$sth1->execute;
warn $dbh->errstr if $dbh->err;

#
# SQL::Abstract is used to build the 'WHERE' clause for each update; the fixed
# front part of the 'UPDATE' statement is built once here
#
my $sqla  = SQL::Abstract->new;
my $stmt1 = sprintf(
    q{UPDATE eps SET %s = CONVERT(BINARY CONVERT(%s USING latin1) USING utf8)},
    ($field) x 2
);
#-------------------------------------------------------------------------------
# Loop through what we get from the main query, attempting to convert each field
#-------------------------------------------------------------------------------
$viewed = 0;

ROW:
while ( $h1 = $sth1->fetchrow_hashref ) {
    #
    # Count every row seen, but do nothing with the first $skip of them
    #
    $viewed++;
    next ROW if $viewed <= $skip;

    #
    # Complete the 'UPDATE' statement with a 'WHERE' clause selecting this
    # row by its id
    #
    my ( $where_sql, @bind ) = $sqla->where( { id => $h1->{id} } );
    my $stmt = $stmt1 . $where_sql;

    #
    # In dry-run mode just report what would have been done and move on. The
    # content of the 'notes' field is never shown.
    #
    if ($dry_run) {
        if ($verbose) {
            my $shown = $field eq 'notes' ? '' : $h1->{$field};
            printf "[%04d] %s\n", $h1->{id}, $shown;
        }
        say "SQL: ${stmt}";
        say "Arguments: ", join( ',', @bind );
        next ROW;
    }

    #
    # Otherwise try to make the change. A failure of the update is caught
    # and logged so that the remaining rows are still processed.
    #
    $sth2 = $dbh->prepare($stmt) or die $DBI::errstr;

    try {
        $sth2->execute(@bind) or die $DBI::errstr;
        $log->info("Updated $field field for row $h1->{id}");
    }
    catch ($e) {
        $log->info("Failed to update $field field for row $h1->{id}");
        $log->info("Error: $e");
    }
}
continue {
    #
    # This block also runs after each 'next', so the limit is checked for
    # skipped and dry-run rows as well. A limit of zero means no limit.
    #
    if ( $limit && $viewed >= $limit ) {
        $log->info("Update limit reached");
        last ROW;
    }
}

exit;
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    # Switches ('!' marks the negatable ones) and options taking a value
    # ('=s' for a string, '=i' for an integer)
    my @switches = ( "help", "doc", "dry-run!", "verbose!" );
    my @valued   = ( "config=s", "field=s", "skip=i", "limit=i" );

    # Successful parse: the results are in the caller's hash, nothing to
    # return
    return if GetOptions( $optref, @switches, @valued );

    # Bad command line: show the usage message and leave with status 1
    pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );

    return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
convert_latin1 - a script to convert fields in the HPR database to UTF-8
=head1 VERSION
This documentation refers to convert_latin1 version 0.1.2
=head1 USAGE
./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
[-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title
./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes
-limit=10
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message describing the usage of the program, and then exits.
=item B<-doc>
Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use:
pod2pdf convert_latin1 --out=convert_latin1.pdf
=item B<-config=FILE>
This option allows an alternative configuration file to be used. This file
defines the location of the database, its port, its name and the username and
password to be used to access it. This feature was added to allow the script
to access alternative databases or the live database over an SSH tunnel.
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
If the option is omitted the default file is used: B<.hpr_db.cfg>
=item B<-[no]dry-run>
Controls whether the program runs in dry-run mode, where no database updates
are performed. When enabled the details of the updates that would have been
performed are shown, otherwise the updates are applied. The default
B<-nodry-run> allows the program to perform the changes.
=item B<-[no]verbose>
Normally very little is reported by the script, although details of errors
are reported. When B<-verbose> is selected more information
about the number of rows needing work, the updates performed (or which would
have been performed) and how many changes were made is reported.
=item B<-field=FIELDNAME>
This option defines the database field name to be converted. The permitted
names are B<title>, B<summary>, B<tags> and B<notes> and the table is assumed
to be B<eps>. If the option is not provided the default field B<title> will be
used.
=item B<-skip=N>
This option defines the number of database rows to skip when processing the
selected field. If omitted then no rows are skipped. The option is useful to
allow the work to be split into manageable batches, in conjunction with the
B<-limit=N> option below.
=item B<-limit=N>
This option defines the number of database rows to work on when processing the
selected field. If omitted then all rows are processed (after any skip defined
with the B<-skip=N> option). The option is useful to allow the work to be split
into manageable batches, in conjunction with the B<-skip=N> option above.
=back
=head1 DESCRIPTION
=head2 OVERVIEW
The script is designed to repair the HPR MySQL (MariaDB) database which holds
show metadata. The database was created with 'latin1' encoding, and was later
changed to use UTF-8. However, no action was taken to ensure the PHP software
managing the database also used UTF-8. This meant that the 'latin1' encoded data
was still being generated as Unicode UTF-8 data was being added, and was being
rendered in the expected way, while there was little or no UTF-8 data being
stored.
The PHP deficiencies were rectified in April 2023 but this meant that all
non-ASCII characters stored in the database before that were rendered
incorrectly. The solution was to convert all 'latin1' non-ASCII data into
UTF-8, and that is what this script does.
Detecting non ASCII in database fields was performed with the following SQL:
SELECT id,field FROM eps WHERE field <> CONVERT(field USING ASCII) ORDER BY id
This is used to generate a list of all rows which might need conversion to
UTF-8. However, the test is only whether there is non-ASCII data in the row.
Ideally, the conversion could have been performed entirely within the database
with SQL such as the following (for each field):
UPDATE eps SET field = CONVERT(binary CONVERT(field USING latin1) USING utf8)
WHERE field <> CONVERT(field USING ASCII);
However, the conversion to UTF-8 fails when the field already contains such
characters, stopping the query.
MySQL and MariaDB are capable of trapping errors (like using B<try/catch> in
various programming languages), but only in stored procedures. It was felt to
be undesirable to create stored procedures on the HPR database since this was
only possible through B<phpMyAdmin> which is due to be phased out.
This script was written to enable the catching of errors instead.
=head2 SCRIPT DESIGN
The main loop returns all rows with non-ASCII characters in the field being
processed. For each row an 'UPDATE' query is performed using the 'id' field
(episode number) to select it:
UPDATE eps SET field = CONVERT(BINARY CONVERT(field USING latin1) USING utf8)
WHERE id = value
This is performed inside a B<try/catch> statement so that if the query fails
it does not stop the script. Successes and failures are logged.
This algorithm is fairly slow, particularly for the 'notes' field which has
the most (nearly 600) non-ASCII rows. However, it seems to work as desired.
The B<-skip=N> and B<-limit=N> options allow control over the conversion
process such that the work can be done in batches.
Note that the log file used by the script is called B<convert_latin1.log>. It
is appended to on every run. The file name can only be changed by editing the
script.
=head1 DIAGNOSTICS
A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.
=head1 CONFIGURATION AND ENVIRONMENT
The script obtains the credentials it requires to open the HPR database from
a configuration file. The default file is B<.hpr_db.cfg> in the directory
B<$HOME/HPR/Database>. This can be changed by use of the B<-config=FILE>
option as described above.
The configuration file format is as follows:
<database>
host = 127.0.0.1
port = PORT
name = DATABASE
user = USERNAME
password = PASSWORD
</database>
=head1 DEPENDENCIES
Config::General
DBI
Data::Dumper
Getopt::Long
Log::Handler
Log::Handler::Output::File
Pod::Usage
SQL::Abstract
The script uses the experimental B<try> feature and disables the warning that
this feature generates. Note that this feature is only available in Perl
versions at 5.34.0 or above (the script was developed under v5.36.0).
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com). Patches are
welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com)
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2023 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker