forked from HPR/hpr-tools
Moved project directories and files to an empty local repo
This commit is contained in:
657
Database/convert_latin1
Executable file
657
Database/convert_latin1
Executable file
@@ -0,0 +1,657 @@
|
||||
#!/usr/bin/env perl
|
||||
#===============================================================================
|
||||
#
|
||||
# FILE: convert_latin1
|
||||
#
|
||||
# USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run] [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
|
||||
#
|
||||
# DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR
|
||||
# database
|
||||
#
|
||||
# OPTIONS: ---
|
||||
# REQUIREMENTS: ---
|
||||
# BUGS: ---
|
||||
# NOTES: ---
|
||||
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||||
# VERSION: 0.1.2
|
||||
# CREATED: 2023-05-04 10:07:04
|
||||
# REVISION: 2023-05-08 12:15:49
|
||||
#
|
||||
#===============================================================================
|
||||
|
||||
use v5.16;
|
||||
use strict;
|
||||
use warnings;
|
||||
#use utf8;
|
||||
|
||||
# Using experimental features, some of which require warnings to be turned off
|
||||
use feature qw{ postderef say signatures state try };
|
||||
no warnings qw{
|
||||
experimental::postderef
|
||||
experimental::signatures
|
||||
experimental::try
|
||||
};
|
||||
|
||||
use Getopt::Long;
|
||||
use Pod::Usage;
|
||||
|
||||
use Config::General;
|
||||
|
||||
#use Encode qw( encode decode is_utf8 );
|
||||
#use Try::Tiny;
|
||||
#use TryCatch;
|
||||
|
||||
use SQL::Abstract;
|
||||
use DBI;
|
||||
|
||||
use Log::Handler;
|
||||
use Log::Handler::Output::File;
|
||||
|
||||
use Data::Dumper;
|
||||
|
||||
#
|
||||
# Version number (manually incremented)
|
||||
#
|
||||
our $VERSION = '0.1.2';
|
||||
|
||||
#
# Work out the name this script was invoked under (path removed) and the
# directory part of that path, falling back to the current directory when the
# script was invoked without one
#
my $PROG = $0 =~ s{.*/}{}mxr;
my $DIR  = ( $0 =~ s{/?[^/]*$}{}mxr ) || '.';
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Declarations
|
||||
#-------------------------------------------------------------------------------
|
||||
#
# Default locations: the configuration and log files both live in the
# HPR/Database directory under the user's home directory
#
my $basedir    = $ENV{HOME} . '/HPR/Database';
my $configfile = $basedir . '/.hpr_db.cfg';
my $logfile    = $basedir . "/${PROG}.log";

#
# Database handle, statement handles and the current row
#
my $dbh;
my $sth1;
my $sth2;
my $h1;

#
# Work variables used by the main code
#
my $sql;
my $utf8;
my $viewed;
|
||||
|
||||
#
|
||||
# Map of latin1 characters with their Unicode equivalents {{{
|
||||
#
|
||||
# Commented out 2023-05-10 since no longer wanted
|
||||
#
|
||||
#my %map_latin1 = (
|
||||
# q{€â‚¬} => "\N{U+20AC}",
|
||||
# q{ÀÀ} => "\N{U+00C0}",
|
||||
# q{ÁÃ} => "\N{U+00C1}",
|
||||
# q{‚‚} => "\N{U+201A}",
|
||||
# q{ÂÂ} => "\N{U+00C2}",
|
||||
# q{ƒÆ’} => "\N{U+0192}",
|
||||
# q{ÃÃ} => "\N{U+00C3}",
|
||||
# q{„„} => "\N{U+201E}",
|
||||
# q{ÄÄ} => "\N{U+00C4}",
|
||||
# q{……} => "\N{U+2026}",
|
||||
# q{ÅÃ…} => "\N{U+00C5}",
|
||||
# q{†â€} => "\N{U+2020}",
|
||||
# q{ÆÃ†} => "\N{U+00C6}",
|
||||
# q{‡â€¡} => "\N{U+2021}",
|
||||
# q{ÇÇ} => "\N{U+00C7}",
|
||||
# q{ˆË†} => "\N{U+02C6}",
|
||||
# q{ÈÈ} => "\N{U+00C8}",
|
||||
# q{‰â€°} => "\N{U+2030}",
|
||||
# q{ÉÉ} => "\N{U+00C9}",
|
||||
# q{ŠÅ} => "\N{U+0160}",
|
||||
# q{ÊÊ} => "\N{U+00CA}",
|
||||
# q{‹â€¹} => "\N{U+2039}",
|
||||
# q{ËË} => "\N{U+00CB}",
|
||||
# q{ŒÅ’} => "\N{U+0152}",
|
||||
# q{ÌÃŒ} => "\N{U+00CC}",
|
||||
# q{ÍÃ} => "\N{U+00CD}",
|
||||
# q{ŽÅ½} => "\N{U+017D}",
|
||||
# q{ÎÃŽ} => "\N{U+00CE}",
|
||||
# q{ÏÃ} => "\N{U+00CF}",
|
||||
# q{ÐÃ} => "\N{U+00D0}",
|
||||
# q{‘‘} => "\N{U+2018}",
|
||||
# q{ÑÑ} => "\N{U+00D1}",
|
||||
# q{Չ۪} => "\N{U+2019}",
|
||||
# q{ÒÃ’} => "\N{U+00D2}",
|
||||
# q{““} => "\N{U+201C}",
|
||||
# q{ÓÓ} => "\N{U+00D3}",
|
||||
# q{”â€} => "\N{U+201D}",
|
||||
# q{ÔÔ} => "\N{U+00D4}",
|
||||
# q{••} => "\N{U+2022}",
|
||||
# q{ÕÕ} => "\N{U+00D5}",
|
||||
# q{––} => "\N{U+2013}",
|
||||
# q{ÖÖ} => "\N{U+00D6}",
|
||||
# q{——} => "\N{U+2014}",
|
||||
# q{××} => "\N{U+00D7}",
|
||||
# q{˜Ëœ} => "\N{U+02DC}",
|
||||
# q{ØÃ˜} => "\N{U+00D8}",
|
||||
# q{™â„¢} => "\N{U+2122}",
|
||||
# q{ÙÙ} => "\N{U+00D9}",
|
||||
# q{šÅ¡} => "\N{U+0161}",
|
||||
# q{ÚÚ} => "\N{U+00DA}",
|
||||
# q{݉ۼ} => "\N{U+203A}",
|
||||
# q{ÛÛ} => "\N{U+00DB}",
|
||||
# q{œÅ“} => "\N{U+0153}",
|
||||
# q{ÜÜ} => "\N{U+00DC}",
|
||||
# q{ÝÃ} => "\N{U+00DD}",
|
||||
# q{žÅ¾} => "\N{U+017E}",
|
||||
# q{ÞÞ} => "\N{U+00DE}",
|
||||
# q{ŸÅ¸} => "\N{U+0178}",
|
||||
# q{ßß} => "\N{U+00DF}",
|
||||
# q{Â} => "\N{U+00A0}",
|
||||
# q{àÃ} => "\N{U+00E0}",
|
||||
# q{¡Â¡} => "\N{U+00A1}",
|
||||
# q{áá} => "\N{U+00E1}",
|
||||
# q{¢Â¢} => "\N{U+00A2}",
|
||||
# q{ââ} => "\N{U+00E2}",
|
||||
# q{£Â£} => "\N{U+00A3}",
|
||||
# q{ãã} => "\N{U+00E3}",
|
||||
# q{¤Â¤} => "\N{U+00A4}",
|
||||
# q{ää} => "\N{U+00E4}",
|
||||
# q{¥Â¥} => "\N{U+00A5}",
|
||||
# q{åÃ¥} => "\N{U+00E5}",
|
||||
# q{¦Â¦} => "\N{U+00A6}",
|
||||
# q{æÃ¦} => "\N{U+00E6}",
|
||||
# q{§Â§} => "\N{U+00A7}",
|
||||
# q{çç} => "\N{U+00E7}",
|
||||
# q{¨Â¨} => "\N{U+00A8}",
|
||||
# q{èè} => "\N{U+00E8}",
|
||||
# q{©Â©} => "\N{U+00A9}",
|
||||
# q{éé} => "\N{U+00E9}",
|
||||
# q{ªÂª} => "\N{U+00AA}",
|
||||
# q{êê} => "\N{U+00EA}",
|
||||
# q{«Â«} => "\N{U+00AB}",
|
||||
# q{ëë} => "\N{U+00EB}",
|
||||
# q{¬Â¬} => "\N{U+00AC}",
|
||||
# q{ìì} => "\N{U+00EC}",
|
||||
# q{Â} => "\N{U+00AD}",
|
||||
# q{íÃ} => "\N{U+00ED}",
|
||||
# q{®Â®} => "\N{U+00AE}",
|
||||
# q{îî} => "\N{U+00EE}",
|
||||
# q{¯Â¯} => "\N{U+00AF}",
|
||||
# q{ïï} => "\N{U+00EF}",
|
||||
# q{°Â°} => "\N{U+00B0}",
|
||||
# q{ðð} => "\N{U+00F0}",
|
||||
# q{±Â±} => "\N{U+00B1}",
|
||||
# q{ññ} => "\N{U+00F1}",
|
||||
# q{²Â²} => "\N{U+00B2}",
|
||||
# q{òò} => "\N{U+00F2}",
|
||||
# q{³Â³} => "\N{U+00B3}",
|
||||
# q{óó} => "\N{U+00F3}",
|
||||
# q{´Â´} => "\N{U+00B4}",
|
||||
# q{ôô} => "\N{U+00F4}",
|
||||
# q{µÂµ} => "\N{U+00B5}",
|
||||
# q{õõ} => "\N{U+00F5}",
|
||||
# q{¶Â¶} => "\N{U+00B6}",
|
||||
# q{öö} => "\N{U+00F6}",
|
||||
# q{·Â·} => "\N{U+00B7}",
|
||||
# q{÷÷} => "\N{U+00F7}",
|
||||
# q{¸Â¸} => "\N{U+00B8}",
|
||||
# q{øÃ¸} => "\N{U+00F8}",
|
||||
# q{¹Â¹} => "\N{U+00B9}",
|
||||
# q{ùù} => "\N{U+00F9}",
|
||||
# q{ºÂº} => "\N{U+00BA}",
|
||||
# q{úú} => "\N{U+00FA}",
|
||||
# q{»Â»} => "\N{U+00BB}",
|
||||
# q{ûû} => "\N{U+00FB}",
|
||||
# q{¼Â¼} => "\N{U+00BC}",
|
||||
# q{üü} => "\N{U+00FC}",
|
||||
# q{½Â½} => "\N{U+00BD}",
|
||||
# q{ýý} => "\N{U+00FD}",
|
||||
# q{¾Â¾} => "\N{U+00BE}",
|
||||
# q{þþ} => "\N{U+00FE}",
|
||||
# q{¿Â¿} => "\N{U+00BF}",
|
||||
# q{ÿÿ} => "\N{U+00FF}",
|
||||
#);
|
||||
|
||||
#
|
||||
# Build a regex from all of the hash keys
|
||||
#
|
||||
#my $regex = join('|',sort(keys(%map_latin1)));
|
||||
#$regex=qr{$regex};
|
||||
|
||||
#}}}
|
||||
|
||||
#
# Make both output streams encode what is written to them as UTF-8
#
for my $stream ( \*STDOUT, \*STDERR ) {
    binmode $stream, ":encoding(UTF-8)";
}
|
||||
|
||||
#-------------------------------------------------------------------------------
# Options and arguments {{{
#-------------------------------------------------------------------------------
#
# Parse the command line into %options
#
my %options;
Options( \%options );

#
# -help: show the brief usage message and stop
#
if ( $options{'help'} ) {
    pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}

#
# -doc: show the complete documentation and stop
#
if ( $options{'doc'} ) {
    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -verbose => 2,
        -exitval => 1
    );
}

#
# Copy the options into variables, supplying a default for anything not given.
# The field name is left undefined here; its default is applied by the checks
# that follow.
#
my $cfgfile = $options{config}    // $configfile;
my $dry_run = $options{'dry-run'} // 0;
my $verbose = $options{verbose}   // 0;
my $field   = $options{field};
my $skip    = $options{skip}      // 0;
my $limit   = $options{limit}     // 0;

# }}}
|
||||
|
||||
#
# Sanity checks
#
# The configuration file must exist before we try to load it.
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );

#
# The field name is interpolated directly into the SQL built further on, so it
# must be exactly one of the permitted column names (case is ignored). The
# pattern is anchored at both ends: an unanchored match would accept any string
# that merely contains one of the names. When no field is given 'title' is used.
#
if ($field) {
    $field = lc($field);
    die "Invalid value for -field=FIELD\n"
        unless ( $field =~ /\A(?:title|summary|tags|notes)\z/ );
}
else {
    $field = 'title';
}
|
||||
|
||||
#-------------------------------------------------------------------------------
# Load configuration data
#-------------------------------------------------------------------------------
#
# Read the chosen configuration file into %config. The constructor is called
# as a class method rather than with the indirect-object form
# ('new CLASS(...)'), which Perl can mis-parse and which newer feature bundles
# switch off.
#
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();
|
||||
|
||||
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
#
# Connection details come from the <database> section of the configuration.
# Host and port have defaults; name, user and password are used as found.
#
my $dbhost = $config{database}{host} // '127.0.0.1';
my $dbport = $config{database}{port} // 3306;
my ( $dbname, $dbuser, $dbpwd )
    = @{ $config{database} }{qw( name user password )};

my $dsn = "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname";

$dbh = DBI->connect( $dsn, $dbuser, $dbpwd, { AutoCommit => 1 } );
die $DBI::errstr unless $dbh;

#
# Enable client-side UTF8
#
# NOTE(review): the DBD::mysql documentation appears to say that this attribute
# only informs the server of the client encoding when it is passed to
# connect() - confirm that setting it after connecting is sufficient here.
#
$dbh->{mysql_enable_utf8} = 1;
|
||||
|
||||
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
#
# A single file output which accepts every level from 0 to 7
#
my %file_output = (
    timeformat => "%Y-%m-%d %H:%M:%S",
    filename   => $logfile,
    maxlevel   => 7,
    minlevel   => 0,
    utf8       => 1,
);

my $log = Log::Handler->new();
$log->add( file => \%file_output );

#
# Record the settings for this run. The skip, limit and dry-run lines are only
# written when those settings are in effect.
#
my @settings = (
    "---- Running version $VERSION",
    "Configuration file $cfgfile",
    "Processing field '$field'",
    ( $skip    ? "Skipping $skip non-ASCII rows" : () ),
    ( $limit   ? "Update limit is $limit"        : () ),
    ( $dry_run ? "Dry-run mode"                  : () ),
);

for my $message (@settings) {
    $log->info($message);
}
|
||||
|
||||
#
# Adjust limit
#
# The main loop counts every row it fetches, including the skipped ones, so a
# request to work on N rows has to become a cut-off at skip+N rows viewed.
# A limit of zero means "no limit" and must stay zero: adding the skip count to
# it would create a cut-off exactly at the last skipped row, ending the loop
# before any row had been converted.
#
$limit += $skip if $limit;
|
||||
|
||||
#-------------------------------------------------------------------------------
# Query the episodes whose chosen field differs from its ASCII conversion, in
# other words the rows where that field holds non-ASCII data
#-------------------------------------------------------------------------------
$sql = sprintf(
    q{SELECT id,%s FROM eps WHERE %s <> CONVERT(%s USING ASCII) ORDER BY id},
    ($field) x 3
);

$sth1 = $dbh->prepare($sql);
die $DBI::errstr unless $sth1;

#
# A failure to run the query is only reported as a warning
#
$sth1->execute;
warn $dbh->errstr if $dbh->err;

#
# SQL::Abstract builds the 'WHERE' clause for each update; the leading part of
# the 'UPDATE' statement is the same for every row so is made once here
#
my $sqla = SQL::Abstract->new;

my $stmt1 = sprintf(
    q{UPDATE eps SET %s = CONVERT(BINARY CONVERT(%s USING latin1) USING utf8)},
    ($field) x 2
);
|
||||
|
||||
#-------------------------------------------------------------------------------
# Walk through the rows from the main query. Every row past the skipped ones
# gets its own single-row 'UPDATE', so a failure on one row does not prevent
# the others from being attempted.
#-------------------------------------------------------------------------------
$viewed = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
    $viewed++;

    # Rows within the -skip=N range are counted but not processed
    next unless $viewed > $skip;

    #
    # Build the 'WHERE' clause selecting this episode by its id and append it
    # to the 'UPDATE' template
    #
    my ( $where_sql, @params ) = $sqla->where( { id => $h1->{id} } );
    my $update_sql = $stmt1 . $where_sql;

    if ( !$dry_run ) {
        #
        # Live mode: attempt the update. An error from running it is trapped
        # and logged rather than being allowed to end the script.
        #
        $sth2 = $dbh->prepare($update_sql) or die $DBI::errstr;

        try {
            $sth2->execute(@params)
                or die $DBI::errstr;
            $log->info("Updated $field field for row $h1->{id}");
        }
        catch ($err) {
            $log->info("Failed to update $field field for row $h1->{id}");
            $log->info("Error: $err");
        }
    }
    else {
        #
        # Dry-run mode: only show what would have been run. In verbose mode
        # the current field contents are shown too, except for 'notes' where
        # an empty string is printed instead.
        #
        if ($verbose) {
            my $shown = $field eq 'notes' ? '' : $h1->{$field};
            printf "[%04d] %s\n", $h1->{id}, $shown;
        }

        say "SQL: $update_sql";
        say "Arguments: ", join( ',', @params );
    }

}
continue {
    #
    # This runs after every row, skipped ones included, and stops the loop
    # once the number of rows viewed reaches a non-zero limit
    #
    if ( $limit && $viewed >= $limit ) {
        $log->info("Update limit reached");
        last;
    }
}

exit;
|
||||
|
||||
#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Nothing (a bare 'return')
#  DESCRIPTION: Hands the option specifications to Getopt::Long's GetOptions,
#               which stores the values it finds in the hash referenced by
#               $optref. 'dry-run' and 'verbose' are negatable flags, 'config'
#               and 'field' take strings and 'skip' and 'limit' take integers.
#               If GetOptions reports a failure pod2usage is called with the
#               program name and version and an exit value of 1.
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my $optref = shift;

    my @flags  = qw( help doc dry-run! verbose! );
    my @valued = qw( config=s field=s skip=i limit=i );

    GetOptions( $optref, @flags, @valued )
        or pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );

    return;
}
|
||||
|
||||
__END__
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# Application Documentation
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
#{{{
|
||||
|
||||
=head1 NAME
|
||||
|
||||
convert_latin1 - a script to convert fields in the HPR database to UTF-8
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
This documentation refers to convert_latin1 version 0.1.2
|
||||
|
||||
|
||||
=head1 USAGE
|
||||
|
||||
./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
|
||||
[-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
|
||||
|
||||
./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title
|
||||
./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes
|
||||
-limit=10
|
||||
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
=over 8
|
||||
|
||||
=item B<-help>
|
||||
|
||||
Prints a brief help message describing the usage of the program, and then exits.
|
||||
|
||||
=item B<-doc>
|
||||
|
||||
Displays the entirety of the documentation (using a pager), and then exits. To
|
||||
generate a PDF version use:
|
||||
|
||||
pod2pdf convert_latin1 --out=convert_latin1.pdf
|
||||
|
||||
=item B<-config=FILE>
|
||||
|
||||
This option allows an alternative configuration file to be used. This file
|
||||
defines the location of the database, its port, its name and the username and
|
||||
password to be used to access it. This feature was added to allow the script
|
||||
to access alternative databases or the live database over an SSH tunnel.
|
||||
|
||||
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
|
||||
|
||||
If the option is omitted the default file is used: B<.hpr_db.cfg>
|
||||
|
||||
=item B<-[no]dry-run>
|
||||
|
||||
Controls whether the program runs in a mode where it performs database
|
||||
updates. When enabled the details of the updates to be performed are shown,
|
||||
otherwise the updates are applied. The default B<-nodry-run> allows the
|
||||
program to perform the changes.
|
||||
|
||||
=item B<-[no]verbose>
|
||||
|
||||
Normally very little is reported by the script, although details of errors
|
||||
are reported. When B<-verbose> is selected more information
|
||||
about the number of rows needing work, the updates performed (or which would
|
||||
have been performed) and how many changes were made is reported.
|
||||
|
||||
=item B<-field=FIELDNAME>
|
||||
|
||||
This option defines the database field name to be converted. The permitted
|
||||
names are B<title>, B<summary>, B<tags> and B<notes> and the table is assumed
|
||||
to be B<eps>. If the option is not provided the default field B<title> will be
|
||||
used.
|
||||
|
||||
=item B<-skip=N>
|
||||
|
||||
This option defines the number of database rows to skip when processing the
|
||||
selected field. If omitted then no rows are skipped. The option is useful to
|
||||
allow the work to be split into manageable batches, in conjunction with the
|
||||
B<-limit=N> option below.
|
||||
|
||||
=item B<-limit=N>
|
||||
|
||||
This option defines the number of database rows to work on when processing the
|
||||
selected field. If omitted then all rows are processed (after any skip defined
|
||||
with the B<-skip=N> option). The option is useful to allow the work to be split
|
||||
into manageable batches, in conjunction with the B<-skip=N> option above.
|
||||
|
||||
=back
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
=head2 OVERVIEW
|
||||
|
||||
The script is designed to repair the HPR MySQL (MariaDB) database which holds
|
||||
show metadata. The database was created with 'latin1' encoding, and was later
|
||||
changed to use UTF-8. However, no action was taken to ensure the PHP software
|
||||
managing the database also used UTF-8. This meant that the 'latin1' encoded data
|
||||
was still being generated as Unicode UTF-8 data was being added, and was being
|
||||
rendered in the expected way, while there was little or no UTF-8 data being
|
||||
stored.
|
||||
|
||||
The PHP deficiencies were rectified in April 2023 but this meant that all
|
||||
non-ASCII characters stored in the database before that were rendered
|
||||
incorrectly. The solution was to convert all 'latin1' non-ASCII data into
|
||||
UTF-8, and that is what this script does.
|
||||
|
||||
Detecting non ASCII in database fields was performed with the following SQL:
|
||||
|
||||
SELECT id,field FROM eps WHERE field <> CONVERT(field USING ASCII) ORDER BY id
|
||||
|
||||
This is used to generate a list of all rows which might need conversion to
|
||||
UTF-8. However, the test is only whether there is non-ASCII data in the row.
|
||||
|
||||
Ideally, the conversion could have been performed entirely within the database
|
||||
with SQL such as the following (for each field):
|
||||
|
||||
UPDATE eps SET field = CONVERT(binary CONVERT(field USING latin1) USING utf8)
|
||||
WHERE field <> CONVERT(field USING ASCII);
|
||||
|
||||
However, the conversion to UTF-8 fails when the field already contains such
|
||||
characters, stopping the query.
|
||||
|
||||
MySQL and MariaDB are capable of trapping errors (like using B<try/catch> in
|
||||
various programming languages), but only in stored procedures. It was felt to
|
||||
be undesirable to create stored procedures on the HPR database since this was
|
||||
only possible through B<phpMyAdmin> which is due to be phased out.
|
||||
|
||||
This script was written to enable the catching of errors instead.
|
||||
|
||||
=head2 SCRIPT DESIGN
|
||||
|
||||
The main loop returns all rows with non-ASCII characters in the field being
|
||||
processed. For each row an 'UPDATE' query is performed using the 'id' field
|
||||
(episode number) to select it:
|
||||
|
||||
UPDATE eps SET field = CONVERT(BINARY CONVERT(field USING latin1) USING utf8)
|
||||
WHERE id = value
|
||||
|
||||
This is performed inside a B<try/catch> statement so that if the query fails
|
||||
it does not stop the script. Successes and failures are logged.
|
||||
|
||||
This algorithm is fairly slow, particularly for the 'notes' field which has
|
||||
the most (nearly 600) non-ASCII rows. However, it seems to work as desired.
|
||||
|
||||
The B<-skip=N> and B<-limit=N> options allow control over the conversion
|
||||
process such that the work can be done in batches.
|
||||
|
||||
Note that the log file used by the script is called B<convert_latin1.log>. It
|
||||
is appended to on every run. The file name can only be changed by editing the
|
||||
script.
|
||||
|
||||
=head1 DIAGNOSTICS
|
||||
|
||||
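=over 4

=item B<Unable to find FILE>

The configuration file (the default one, or the one named with
B<-config=FILE>) does not exist. The script stops.

=item B<Invalid value for -field=FIELD>

The name given with B<-field=FIELDNAME> is not one of the permitted field
names. The script stops.

=back

An error reported by the database while connecting, or while preparing
a query, also stops the script with the database's error message. A failure of
the initial 'SELECT' query is reported as a warning. A failure of an individual
'UPDATE' is written to the log file and the script carries on with the next
row.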
|
||||
|
||||
|
||||
=head1 CONFIGURATION AND ENVIRONMENT
|
||||
|
||||
The script obtains the credentials it requires to open the HPR database from
|
||||
a configuration file. By default it expects a file called B<.hpr_db.cfg> in the
directory F<$HOME/HPR/Database>. This can be changed by use of the
|
||||
B<-config=FILE> option as described above.
|
||||
|
||||
The configuration file format is as follows:
|
||||
|
||||
<database>
|
||||
host = 127.0.0.1
|
||||
port = PORT
|
||||
name = DATABASE
|
||||
user = USERNAME
|
||||
password = PASSWORD
|
||||
</database>
|
||||
|
||||
=head1 DEPENDENCIES
|
||||
|
||||
Config::General
|
||||
DBI
|
||||
Data::Dumper
|
||||
Getopt::Long
|
||||
Log::Handler
|
||||
Log::Handler::Output::File
|
||||
Pod::Usage
|
||||
SQL::Abstract
|
||||
|
||||
The script uses the experimental B<try> feature and disables the warning that
|
||||
this feature generates. Note that this feature is only available in Perl
|
||||
versions at 5.34.0 or above (the script was developed under v5.36.0).
|
||||
|
||||
=head1 BUGS AND LIMITATIONS
|
||||
|
||||
There are no known bugs in this module.
|
||||
Please report problems to Dave Morriss (Dave.Morriss@gmail.com) Patches are
|
||||
welcome.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Dave Morriss (Dave.Morriss@gmail.com)
|
||||
|
||||
=head1 LICENCE AND COPYRIGHT
|
||||
|
||||
Copyright (c) 2023 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
|
||||
|
||||
This module is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
|
||||
=cut
|
||||
|
||||
#}}}
|
||||
|
||||
# [zo to open fold, zc to close]
|
||||
|
||||
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
||||
|
Reference in New Issue
Block a user