658 lines
19 KiB
Plaintext
658 lines
19 KiB
Plaintext
|
#!/usr/bin/env perl
|
||
|
#===============================================================================
|
||
|
#
|
||
|
# FILE: convert_latin1
|
||
|
#
|
||
|
# USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run] [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
|
||
|
#
|
||
|
# DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR
|
||
|
# database
|
||
|
#
|
||
|
# OPTIONS: ---
|
||
|
# REQUIREMENTS: ---
|
||
|
# BUGS: ---
|
||
|
# NOTES: ---
|
||
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||
|
# VERSION: 0.1.2
|
||
|
# CREATED: 2023-05-04 10:07:04
|
||
|
# REVISION: 2023-05-08 12:15:49
|
||
|
#
|
||
|
#===============================================================================
|
||
|
|
||
|
use v5.16;
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
#use utf8;
|
||
|
|
||
|
# Using experimental features, some of which require warnings to be turned off
|
||
|
use feature qw{ postderef say signatures state try };
|
||
|
no warnings qw{
|
||
|
experimental::postderef
|
||
|
experimental::signatures
|
||
|
experimental::try
|
||
|
};
|
||
|
|
||
|
use Getopt::Long;
|
||
|
use Pod::Usage;
|
||
|
|
||
|
use Config::General;
|
||
|
|
||
|
#use Encode qw( encode decode is_utf8 );
|
||
|
#use Try::Tiny;
|
||
|
#use TryCatch;
|
||
|
|
||
|
use SQL::Abstract;
|
||
|
use DBI;
|
||
|
|
||
|
use Log::Handler;
|
||
|
use Log::Handler::Output::File;
|
||
|
|
||
|
use Data::Dumper;
|
||
|
|
||
|
#
# Version number (manually incremented)
#
our $VERSION = '0.1.2';

#
# Script name (the last path component of $0) and the directory part of $0,
# falling back to '.' when $0 has no directory component
#
my $PROG = $0 =~ s|.*/||mxr;
my $DIR  = $0 =~ s|/?[^/]*$||mxr;
$DIR ||= '.';

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Fixed locations: the default configuration file and the log file both live
# in the base directory under the user's home directory
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my $logfile    = "$basedir/${PROG}.log";

#
# File-scoped work variables: database and statement handles, the current
# row, and the scan query text and row counter
#
my ( $dbh, $sth1, $sth2, $h1 );
my ( $sql, $utf8, $viewed );
|
||
|
|
||
|
#
|
||
|
# Map of latin1 characters with their Unicode equivalents {{{
|
||
|
#
|
||
|
# Commented out 2023-05-10 since no longer wanted
|
||
|
#
|
||
|
#my %map_latin1 = (
|
||
|
# q{€â‚¬} => "\N{U+20AC}",
|
||
|
# q{ÀÀ} => "\N{U+00C0}",
|
||
|
# q{ÁÃ} => "\N{U+00C1}",
|
||
|
# q{‚‚} => "\N{U+201A}",
|
||
|
# q{ÂÂ} => "\N{U+00C2}",
|
||
|
# q{ƒÆ’} => "\N{U+0192}",
|
||
|
# q{ÃÃ} => "\N{U+00C3}",
|
||
|
# q{„„} => "\N{U+201E}",
|
||
|
# q{ÄÄ} => "\N{U+00C4}",
|
||
|
# q{……} => "\N{U+2026}",
|
||
|
# q{ÅÃ…} => "\N{U+00C5}",
|
||
|
# q{†â€} => "\N{U+2020}",
|
||
|
# q{ÆÆ} => "\N{U+00C6}",
|
||
|
# q{‡â€¡} => "\N{U+2021}",
|
||
|
# q{ÇÇ} => "\N{U+00C7}",
|
||
|
# q{ˆË†} => "\N{U+02C6}",
|
||
|
# q{ÈÈ} => "\N{U+00C8}",
|
||
|
# q{‰â€°} => "\N{U+2030}",
|
||
|
# q{ÉÉ} => "\N{U+00C9}",
|
||
|
# q{ŠÅ} => "\N{U+0160}",
|
||
|
# q{ÊÊ} => "\N{U+00CA}",
|
||
|
# q{‹â€¹} => "\N{U+2039}",
|
||
|
# q{ËË} => "\N{U+00CB}",
|
||
|
# q{ŒÅ’} => "\N{U+0152}",
|
||
|
# q{ÌÃŒ} => "\N{U+00CC}",
|
||
|
# q{ÍÃ} => "\N{U+00CD}",
|
||
|
# q{ŽÅ½} => "\N{U+017D}",
|
||
|
# q{ÎÃŽ} => "\N{U+00CE}",
|
||
|
# q{ÏÃ} => "\N{U+00CF}",
|
||
|
# q{ÐÃ} => "\N{U+00D0}",
|
||
|
# q{‘‘} => "\N{U+2018}",
|
||
|
# q{ÑÑ} => "\N{U+00D1}",
|
||
|
# q{Չ۪} => "\N{U+2019}",
|
||
|
# q{ÒÃ’} => "\N{U+00D2}",
|
||
|
# q{““} => "\N{U+201C}",
|
||
|
# q{ÓÓ} => "\N{U+00D3}",
|
||
|
# q{”â€} => "\N{U+201D}",
|
||
|
# q{ÔÔ} => "\N{U+00D4}",
|
||
|
# q{•â€¢} => "\N{U+2022}",
|
||
|
# q{ÕÕ} => "\N{U+00D5}",
|
||
|
# q{––} => "\N{U+2013}",
|
||
|
# q{ÖÖ} => "\N{U+00D6}",
|
||
|
# q{——} => "\N{U+2014}",
|
||
|
# q{××} => "\N{U+00D7}",
|
||
|
# q{˜Ëœ} => "\N{U+02DC}",
|
||
|
# q{ØØ} => "\N{U+00D8}",
|
||
|
# q{™â„¢} => "\N{U+2122}",
|
||
|
# q{ÙÙ} => "\N{U+00D9}",
|
||
|
# q{šÅ¡} => "\N{U+0161}",
|
||
|
# q{ÚÚ} => "\N{U+00DA}",
|
||
|
# q{݉ۼ} => "\N{U+203A}",
|
||
|
# q{ÛÛ} => "\N{U+00DB}",
|
||
|
# q{œÅ“} => "\N{U+0153}",
|
||
|
# q{ÜÃœ} => "\N{U+00DC}",
|
||
|
# q{ÝÃ} => "\N{U+00DD}",
|
||
|
# q{žÅ¾} => "\N{U+017E}",
|
||
|
# q{ÞÞ} => "\N{U+00DE}",
|
||
|
# q{ŸÅ¸} => "\N{U+0178}",
|
||
|
# q{ßß} => "\N{U+00DF}",
|
||
|
# q{Â} => "\N{U+00A0}",
|
||
|
# q{àÃ} => "\N{U+00E0}",
|
||
|
# q{¡Â¡} => "\N{U+00A1}",
|
||
|
# q{áá} => "\N{U+00E1}",
|
||
|
# q{¢Â¢} => "\N{U+00A2}",
|
||
|
# q{ââ} => "\N{U+00E2}",
|
||
|
# q{£Â£} => "\N{U+00A3}",
|
||
|
# q{ãã} => "\N{U+00E3}",
|
||
|
# q{¤Â¤} => "\N{U+00A4}",
|
||
|
# q{ää} => "\N{U+00E4}",
|
||
|
# q{¥Â¥} => "\N{U+00A5}",
|
||
|
# q{åÃ¥} => "\N{U+00E5}",
|
||
|
# q{¦Â¦} => "\N{U+00A6}",
|
||
|
# q{ææ} => "\N{U+00E6}",
|
||
|
# q{§Â§} => "\N{U+00A7}",
|
||
|
# q{çç} => "\N{U+00E7}",
|
||
|
# q{¨Â¨} => "\N{U+00A8}",
|
||
|
# q{èè} => "\N{U+00E8}",
|
||
|
# q{©Â©} => "\N{U+00A9}",
|
||
|
# q{éé} => "\N{U+00E9}",
|
||
|
# q{ªÂª} => "\N{U+00AA}",
|
||
|
# q{êê} => "\N{U+00EA}",
|
||
|
# q{«Â«} => "\N{U+00AB}",
|
||
|
# q{ëë} => "\N{U+00EB}",
|
||
|
# q{¬Â¬} => "\N{U+00AC}",
|
||
|
# q{ìì} => "\N{U+00EC}",
|
||
|
# q{Â} => "\N{U+00AD}",
|
||
|
# q{íÃ} => "\N{U+00ED}",
|
||
|
# q{®Â®} => "\N{U+00AE}",
|
||
|
# q{îî} => "\N{U+00EE}",
|
||
|
# q{¯Â¯} => "\N{U+00AF}",
|
||
|
# q{ïï} => "\N{U+00EF}",
|
||
|
# q{°Â°} => "\N{U+00B0}",
|
||
|
# q{ðð} => "\N{U+00F0}",
|
||
|
# q{±Â±} => "\N{U+00B1}",
|
||
|
# q{ññ} => "\N{U+00F1}",
|
||
|
# q{²Â²} => "\N{U+00B2}",
|
||
|
# q{òò} => "\N{U+00F2}",
|
||
|
# q{³Â³} => "\N{U+00B3}",
|
||
|
# q{óó} => "\N{U+00F3}",
|
||
|
# q{´Â´} => "\N{U+00B4}",
|
||
|
# q{ôô} => "\N{U+00F4}",
|
||
|
# q{µÂµ} => "\N{U+00B5}",
|
||
|
# q{õõ} => "\N{U+00F5}",
|
||
|
# q{¶Â¶} => "\N{U+00B6}",
|
||
|
# q{öö} => "\N{U+00F6}",
|
||
|
# q{·Â·} => "\N{U+00B7}",
|
||
|
# q{÷÷} => "\N{U+00F7}",
|
||
|
# q{¸Â¸} => "\N{U+00B8}",
|
||
|
# q{øø} => "\N{U+00F8}",
|
||
|
# q{¹Â¹} => "\N{U+00B9}",
|
||
|
# q{ùù} => "\N{U+00F9}",
|
||
|
# q{ºÂº} => "\N{U+00BA}",
|
||
|
# q{úú} => "\N{U+00FA}",
|
||
|
# q{»Â»} => "\N{U+00BB}",
|
||
|
# q{ûû} => "\N{U+00FB}",
|
||
|
# q{¼Â¼} => "\N{U+00BC}",
|
||
|
# q{üü} => "\N{U+00FC}",
|
||
|
# q{½Â½} => "\N{U+00BD}",
|
||
|
# q{ýý} => "\N{U+00FD}",
|
||
|
# q{¾Â¾} => "\N{U+00BE}",
|
||
|
# q{þþ} => "\N{U+00FE}",
|
||
|
# q{¿Â¿} => "\N{U+00BF}",
|
||
|
# q{ÿÿ} => "\N{U+00FF}",
|
||
|
#);
|
||
|
|
||
|
#
|
||
|
# Build a regex from all of the hash keys
|
||
|
#
|
||
|
#my $regex = join('|',sort(keys(%map_latin1)));
|
||
|
#$regex=qr{$regex};
|
||
|
|
||
|
#}}}
|
||
|
|
||
|
#
# Enable Unicode output mode on both standard output streams
#
for my $stream ( \*STDOUT, \*STDERR ) {
    binmode $stream, ":encoding(UTF-8)";
}

#-------------------------------------------------------------------------------
# Options and arguments {{{
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

#
# Default help (brief usage message, then exit)
#
if ( $options{'help'} ) {
    pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}

#
# Full documentation if requested with -doc
#
if ( $options{'doc'} ) {
    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -verbose => 2,
        -exitval => 1
    );
}

#
# Collect options, applying the defaults for any that were not given. The
# field name is left undefined here; it is checked and defaulted later.
#
my $cfgfile = $options{config}    // $configfile;
my $dry_run = $options{'dry-run'} // 0;
my $verbose = $options{verbose}   // 0;
my $field   = $options{field};
my $skip    = $options{skip}  // 0;
my $limit   = $options{limit} // 0;
|
||
|
|
||
|
# }}}
|
||
|
|
||
|
#
# Sanity checks
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );

#
# The field name is interpolated directly into the SQL built later in the
# script, so it must be exactly one of the permitted column names. The regex
# is anchored at both ends; without the anchors any string which merely
# contained one of the names (e.g. 'title,notes' or 'notes; ...') would be
# accepted and end up in the queries.
#
if ($field) {
    $field = lc($field);
    die "Invalid value for -field=FIELD\n"
        unless ( $field =~ /\A(?:title|summary|tags|notes)\z/ );
}
else {
    # No -field option given: fall back to the default field
    $field = 'title';
}
|
||
|
|
||
|
#-------------------------------------------------------------------------------
# Load configuration data
#-------------------------------------------------------------------------------
#
# The constructor is called with the usual 'Class->new' method syntax rather
# than the indirect object form ('new Class(...)'), which is ambiguous to
# the parser and is not available under newer 'use vN' feature bundles.
#
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);

#
# Flatten the parsed configuration into a hash; the script reads the
# 'database' section from it when connecting
#
my %config = $conf->getall();
|
||
|
|
||
|
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
#
# Connection details come from the 'database' section of the configuration;
# only the host and port have defaults
#
my $dbhost = $config{database}{host} // '127.0.0.1';
my $dbport = $config{database}{port} // 3306;
my $dbname = $config{database}{name};
my $dbuser = $config{database}{user};
my $dbpwd  = $config{database}{password};

my $dsn       = "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname";
my %dbi_attrs = ( AutoCommit => 1 );

$dbh = DBI->connect( $dsn, $dbuser, $dbpwd, \%dbi_attrs )
    or die $DBI::errstr;

#
# Enable client-side UTF8
#
# NOTE(review): the DBD::mysql documentation suggests this attribute only
# has its full effect when passed to connect(); confirm that setting it
# after connecting is what is wanted here.
#
$dbh->{mysql_enable_utf8} = 1;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

#
# A single file output which accepts every level (0 to 7) and writes UTF-8
#
my %file_output = (
    timeformat => "%Y-%m-%d %H:%M:%S",
    filename   => $logfile,
    maxlevel   => 7,
    minlevel   => 0,
    utf8       => 1,
);
$log->add( file => \%file_output );

#
# Log the settings being used; the optional settings are only mentioned when
# they have been given a true value
#
$log->info("---- Running version $VERSION");
$log->info("Configuration file $cfgfile");
$log->info("Processing field '$field'");
if ($skip) {
    $log->info("Skipping $skip non-ASCII rows");
}
if ($limit) {
    $log->info("Update limit is $limit");
}
if ($dry_run) {
    $log->info("Dry-run mode");
}
|
||
|
|
||
|
#
# Adjust limit
#
# The main loop counts every row it fetches, including the skipped ones, so
# a limit has to be expressed as 'rows skipped + rows to process'. The
# adjustment is only made when a limit was actually requested: a limit of
# zero means 'no limit', and adding the skip count to it would turn it into
# a limit which is reached on the last skipped row, so that no rows at all
# would be processed when -skip=N is used without -limit=N.
#
$limit += $skip if $limit;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
# Perform a scan of episodes for the chosen field which contains non-ASCII
#-------------------------------------------------------------------------------
#
# A row is selected when its field differs from the same field converted to
# ASCII. The field name appears three times in the query.
#
$sql = sprintf(
    q{SELECT id,%s FROM eps WHERE %s <> CONVERT(%s USING ASCII) ORDER BY id},
    $field, $field, $field
);

$sth1 = $dbh->prepare($sql) or die $DBI::errstr;

#
# Nothing useful can be done if the scan itself fails, so treat a failure
# here in the same way as a failure to prepare the query rather than warning
# and going on to fetch from a statement which never ran.
#
$sth1->execute or die $sth1->errstr;
|
||
|
|
||
|
#
# Prepare SQL::Abstract and the SQL template for the updates
#
my $sqla = SQL::Abstract->new;

#
# The 'UPDATE' without its 'WHERE' clause. The field is re-read as latin1,
# treated as binary and then interpreted as utf8.
#
my $stmt1 = sprintf(
    q{UPDATE eps SET %s = CONVERT(BINARY CONVERT(%s USING latin1) USING utf8)},
    $field, $field
);

#-------------------------------------------------------------------------------
# Loop through what we get from the main query, attempting to convert each field
#-------------------------------------------------------------------------------
$viewed = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
    #
    # Count every row fetched; the first $skip rows are counted but not
    # processed. Note that 'next' still runs the 'continue' block below.
    #
    $viewed++;
    next if $viewed <= $skip;

    #
    # Prepare the 'WHERE' part of the SQL, selecting the row by its id, and
    # append it to the 'UPDATE' template
    #
    my %where = ( id => $h1->{id} );
    my ( $stmt2, @bind ) = $sqla->where( \%where );
    my $stmt = "${stmt1}${stmt2}";

    #
    # In dry-run mode just report what would have been done, otherwise try and
    # make the change.
    #
    if ($dry_run) {
        if ($verbose) {
            # Show the current field contents, except for 'notes' where
            # only the episode number is shown
            printf "[%04d] %s\n", $h1->{id},
                ( $field eq 'notes' ? '' : $h1->{$field} );
        }

        say "SQL: ${stmt}";
        say "Arguments: ", join( ',', @bind );
    }
    else {
        #
        # The statement text is the same for every row (only the bound id
        # changes) so let DBI hand back the handle it prepared earlier
        # instead of preparing the same SQL again on each iteration.
        #
        $sth2 = $dbh->prepare_cached($stmt) or die $DBI::errstr;

        #
        # The SQL could generate an error which we'll try and intercept so
        # that one failing row does not stop the whole run
        #
        try {
            $sth2->execute(@bind)
                or die $DBI::errstr;
            $log->info("Updated $field field for row $h1->{id}");
        }
        catch ($e) {
            $log->info("Failed to update $field field for row $h1->{id}");
            $log->info("Error: $e");
        }
    }

}
continue {
    #
    # Stop once the number of rows seen reaches the limit (a limit of zero
    # means there is no limit)
    #
    if ( $limit && $viewed >= $limit ) {
        $log->info("Update limit reached");
        last;
    }
}

#
# Release the scan statement (it may still have unfetched rows if the loop
# stopped at the limit) and close the database connection cleanly
#
$sth1->finish;
$dbh->disconnect;

exit;
|
||
|
|
||
|
#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Nothing (undef in scalar context)
#  DESCRIPTION: Hands the option specifications to Getopt::Long, which
#               removes the options it recognises from @ARGV and stores their
#               values in the hash, keyed by option name. If the command line
#               cannot be parsed a usage message is shown via 'pod2usage',
#               called with an exit value of 1.
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my $optref = shift;

    # Flags, negatable switches, string options and integer options
    my @specs = qw{
        help        doc
        dry-run!    verbose!
        config=s    field=s
        skip=i      limit=i
    };

    unless ( GetOptions( $optref, @specs ) ) {
        pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
    }

    return;
}
|
||
|
|
||
|
__END__
|
||
|
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
# Application Documentation
|
||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||
|
#{{{
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
convert_latin1 - a script to convert fields in the HPR database to UTF-8
|
||
|
|
||
|
=head1 VERSION
|
||
|
|
||
|
This documentation refers to convert_latin1 version 0.1.2
|
||
|
|
||
|
|
||
|
=head1 USAGE
|
||
|
|
||
|
./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
|
||
|
[-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
|
||
|
|
||
|
./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title
|
||
|
./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes
|
||
|
-limit=10
|
||
|
|
||
|
|
||
|
=head1 OPTIONS
|
||
|
|
||
|
=over 8
|
||
|
|
||
|
=item B<-help>
|
||
|
|
||
|
Prints a brief help message describing the usage of the program, and then exits.
|
||
|
|
||
|
=item B<-doc>
|
||
|
|
||
|
Displays the entirety of the documentation (using a pager), and then exits. To
|
||
|
generate a PDF version use:
|
||
|
|
||
|
pod2pdf convert_latin1 --out=convert_latin1.pdf
|
||
|
|
||
|
=item B<-config=FILE>
|
||
|
|
||
|
This option allows an alternative configuration file to be used. This file
|
||
|
defines the location of the database, its port, its name and the username and
|
||
|
password to be used to access it. This feature was added to allow the script
|
||
|
to access alternative databases or the live database over an SSH tunnel.
|
||
|
|
||
|
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
|
||
|
|
||
|
If the option is omitted the default file is used: B<.hpr_db.cfg>
|
||
|
|
||
|
=item B<-[no]dry-run>
|
||
|
|
||
|
Controls whether the program runs in a mode where it performs database
|
||
|
updates. When enabled the details of the updates to be performed are shown,
|
||
|
otherwise the updates are applied. The default B<-nodry-run> allows the
|
||
|
program to perform the changes.
|
||
|
|
||
|
=item B<-[no]verbose>
|
||
|
|
||
|
Normally very little is reported by the script; successes and failures of the
updates are written to the log file. When B<-verbose> is selected along with
B<-dry-run> the episode number and the current contents of the field are shown
before the SQL which would have been performed (the contents of the B<notes>
field are not shown).
|
||
|
|
||
|
=item B<-field=FIELDNAME>
|
||
|
|
||
|
This option defines the database field name to be converted. The permitted
|
||
|
names are B<title>, B<summary>, B<tags> and B<notes> and the table is assumed
|
||
|
to be B<eps>. If the option is not provided the default field B<title> will be
|
||
|
used.
|
||
|
|
||
|
=item B<-skip=N>
|
||
|
|
||
|
This option defines the number of database rows to skip when processing the
|
||
|
selected field. If omitted then no rows are skipped. The option is useful to
|
||
|
allow the work to be split into manageable batches, in conjunction with the
|
||
|
B<-limit=N> option below.
|
||
|
|
||
|
=item B<-limit=N>
|
||
|
|
||
|
This option defines the number of database rows to work on when processing the
|
||
|
selected field. If omitted then all rows are processed (after any skip defined
|
||
|
with the B<-skip=N> option). The option is useful to allow the work to be split
|
||
|
into manageable batches, in conjunction with the B<-skip=N> option above.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head1 DESCRIPTION
|
||
|
|
||
|
=head2 OVERVIEW
|
||
|
|
||
|
The script is designed to repair the HPR MySQL (MariaDB) database which holds
|
||
|
show metadata. The database was created with 'latin1' encoding, and was later
|
||
|
changed to use UTF-8. However, no action was taken to ensure the PHP software
|
||
|
managing the database also used UTF-8. This meant that the 'latin1' encoded data
|
||
|
was still being generated as Unicode UTF-8 data was being added, and was being
|
||
|
rendered in the expected way, while there was little or no UTF-8 data being
|
||
|
stored.
|
||
|
|
||
|
The PHP deficiencies were rectified in April 2023 but this meant that all
|
||
|
non-ASCII characters stored in the database before that were rendered
|
||
|
incorrectly. The solution was to convert all 'latin1' non-ASCII data into
|
||
|
UTF-8, and that is what this script does.
|
||
|
|
||
|
Detecting non-ASCII data in database fields was performed with the following SQL:
|
||
|
|
||
|
SELECT id,field FROM eps WHERE field <> CONVERT(field USING ASCII) ORDER BY id
|
||
|
|
||
|
This is used to generate a list of all rows which might need conversion to
|
||
|
UTF-8. However, the test is only whether there is non-ASCII data in the row.
|
||
|
|
||
|
Ideally, the conversion could have been performed entirely within the database
|
||
|
with SQL such as the following (for each field):
|
||
|
|
||
|
UPDATE eps SET field = CONVERT(binary CONVERT(field USING latin1) USING utf8)
|
||
|
WHERE field <> CONVERT(field USING ASCII);
|
||
|
|
||
|
However, the conversion to UTF-8 fails when the field already contains such
|
||
|
characters, stopping the query.
|
||
|
|
||
|
MySQL and MariaDB are capable of trapping errors (like using B<try/catch> in
|
||
|
various programming languages), but only in stored procedures. It was felt to
|
||
|
be undesirable to create stored procedures on the HPR database since this was
|
||
|
only possible through B<phpMyAdmin> which is due to be phased out.
|
||
|
|
||
|
This script was written to enable the catching of errors instead.
|
||
|
|
||
|
=head2 SCRIPT DESIGN
|
||
|
|
||
|
The main loop returns all rows with non-ASCII characters in the field being
|
||
|
processed. For each row an 'UPDATE' query is performed using the 'id' field
|
||
|
(episode number) to select it:
|
||
|
|
||
|
UPDATE eps SET field = CONVERT(BINARY CONVERT(field USING latin1) USING utf8)
|
||
|
WHERE id = value
|
||
|
|
||
|
This is performed inside a B<try/catch> statement so that if the query fails
|
||
|
it does not stop the script. Successes and failures are logged.
|
||
|
|
||
|
This algorithm is fairly slow, particularly for the 'notes' field which has
|
||
|
the most (nearly 600) non-ASCII rows. However, it seems to work as desired.
|
||
|
|
||
|
The B<-skip=N> and B<-limit=N> options allow control over the conversion
|
||
|
process such that the work can be done in batches.
|
||
|
|
||
|
Note that the log file used by the script is called B<convert_latin1.log>. It
|
||
|
is appended to on every run. The file name can only be changed by editing the
|
||
|
script.
|
||
|
|
||
|
=head1 DIAGNOSTICS
|
||
|
|
||
|
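=over 4

=item B<Unable to find ...>

Type: fatal

The configuration file, either the default one or the one named with the
B<-config=FILE> option, does not exist.

=item B<Invalid value for -field=FIELD>

Type: fatal

The name given to the B<-field=FIELDNAME> option is not one of the permitted
field names.

=item B<Database error messages>

Type: fatal

If the script cannot connect to the database or cannot prepare a query it
stops with the error message returned by the database interface.

=item B<Failed to update ... field for row ...>

Type: log message

Written to the log file, followed by the text of the error, when the 'UPDATE'
query for a row fails. The script continues with the next row.

=back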
|
||
|
|
||
|
|
||
|
=head1 CONFIGURATION AND ENVIRONMENT
|
||
|
|
||
|
The script obtains the credentials it requires to open the HPR database from
|
||
|
a configuration file. The name of the file it expects is B<.hpr_db.cfg> in the
directory B<HPR/Database> under the user's home directory. This can be changed
by use of the B<-config=FILE> option as described above.
|
||
|
|
||
|
The configuration file format is as follows:
|
||
|
|
||
|
<database>
|
||
|
host = 127.0.0.1
|
||
|
port = PORT
|
||
|
name = DATABASE
|
||
|
user = USERNAME
|
||
|
password = PASSWORD
|
||
|
</database>
|
||
|
|
||
|
=head1 DEPENDENCIES
|
||
|
|
||
|
Config::General
|
||
|
DBI
|
||
|
Data::Dumper
|
||
|
Getopt::Long
|
||
|
Log::Handler
|
||
|
Log::Handler::Output::File
|
||
|
Pod::Usage
|
||
|
SQL::Abstract
|
||
|
|
||
|
The script uses the experimental B<try> feature and disables the warning that
|
||
|
this feature generates. Note that this feature is only available in Perl
|
||
|
versions 5.34.0 or above (the script was developed under v5.36.0).
|
||
|
|
||
|
=head1 BUGS AND LIMITATIONS
|
||
|
|
||
|
There are no known bugs in this module.
|
||
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com). Patches are
|
||
|
welcome.
|
||
|
|
||
|
=head1 AUTHOR
|
||
|
|
||
|
Dave Morriss (Dave.Morriss@gmail.com)
|
||
|
|
||
|
=head1 LICENCE AND COPYRIGHT
|
||
|
|
||
|
Copyright (c) 2023 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
|
||
|
|
||
|
This module is free software; you can redistribute it and/or
|
||
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
||
|
|
||
|
This program is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||
|
|
||
|
=cut
|
||
|
|
||
|
#}}}
|
||
|
|
||
|
# [zo to open fold, zc to close]
|
||
|
|
||
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
||
|
|