#!/usr/bin/env perl #=============================================================================== # # FILE: convert_latin1 # # USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-debug=N] # # DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR # database # # OPTIONS: --- # REQUIREMENTS: --- # BUGS: --- # NOTES: --- # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com # VERSION: 0.1.2 # CREATED: 2023-05-04 10:07:04 # REVISION: 2023-05-08 12:15:49 # #=============================================================================== use v5.16; use strict; use warnings; #use utf8; # Using experimental features, some of which require warnings to be turned off use feature qw{ postderef say signatures state try }; no warnings qw{ experimental::postderef experimental::signatures experimental::try }; use Getopt::Long; use Pod::Usage; use Config::General; #use Encode qw( encode decode is_utf8 ); #use Try::Tiny; #use TryCatch; use SQL::Abstract; use DBI; use Log::Handler; use Log::Handler::Output::File; use Data::Dumper; # # Version number (manually incremented) # our $VERSION = '0.1.2'; # # Script and directory names # ( my $PROG = $0 ) =~ s|.*/||mx; ( my $DIR = $0 ) =~ s|/?[^/]*$||mx; $DIR = '.' 
unless $DIR; #------------------------------------------------------------------------------- # Declarations #------------------------------------------------------------------------------- # # Constants and other declarations # my $basedir = "$ENV{HOME}/HPR/Database"; my $configfile = "$basedir/.hpr_db.cfg"; my $logfile = "$basedir/${PROG}.log"; my ( $dbh, $sth1, $sth2, $h1 ); my ( $sql, $utf8, $viewed ); # # Map of latin1 characters with their Unicode equivalents {{{ # # Commented out 2023-05-10 since no longer wanted # #my %map_latin1 = ( # q{€â‚¬} => "\N{U+20AC}", # q{ÀÀ} => "\N{U+00C0}", # q{ÁÃ} => "\N{U+00C1}", # q{‚‚} => "\N{U+201A}", # q{ÂÂ} => "\N{U+00C2}", # q{ƒÆ’} => "\N{U+0192}", # q{ÃÃ} => "\N{U+00C3}", # q{„„} => "\N{U+201E}", # q{ÄÄ} => "\N{U+00C4}", # q{……} => "\N{U+2026}", # q{ÅÃ…} => "\N{U+00C5}", # q{†â€} => "\N{U+2020}", # q{ÆÆ} => "\N{U+00C6}", # q{‡â€¡} => "\N{U+2021}", # q{ÇÇ} => "\N{U+00C7}", # q{ˆË†} => "\N{U+02C6}", # q{ÈÈ} => "\N{U+00C8}", # q{‰â€°} => "\N{U+2030}", # q{ÉÉ} => "\N{U+00C9}", # q{ŠÅ} => "\N{U+0160}", # q{ÊÊ} => "\N{U+00CA}", # q{‹â€¹} => "\N{U+2039}", # q{ËË} => "\N{U+00CB}", # q{ŒÅ’} => "\N{U+0152}", # q{ÌÃŒ} => "\N{U+00CC}", # q{ÍÃ} => "\N{U+00CD}", # q{ŽÅ½} => "\N{U+017D}", # q{ÎÃŽ} => "\N{U+00CE}", # q{ÏÃ} => "\N{U+00CF}", # q{ÐÃ} => "\N{U+00D0}", # q{‘‘} => "\N{U+2018}", # q{ÑÑ} => "\N{U+00D1}", # q{’’} => "\N{U+2019}", # q{ÒÃ’} => "\N{U+00D2}", # q{““} => "\N{U+201C}", # q{ÓÓ} => "\N{U+00D3}", # q{”â€} => "\N{U+201D}", # q{ÔÔ} => "\N{U+00D4}", # q{•â€¢} => "\N{U+2022}", # q{ÕÕ} => "\N{U+00D5}", # q{––} => "\N{U+2013}", # q{ÖÖ} => "\N{U+00D6}", # q{——} => "\N{U+2014}", # q{××} => "\N{U+00D7}", # q{˜Ëœ} => "\N{U+02DC}", # q{ØØ} => "\N{U+00D8}", # q{™â„¢} => "\N{U+2122}", # q{ÙÙ} => "\N{U+00D9}", # q{šÅ¡} => "\N{U+0161}", # q{ÚÚ} => "\N{U+00DA}", # q{›â€º} => "\N{U+203A}", # q{ÛÛ} => "\N{U+00DB}", # q{œÅ“} => "\N{U+0153}", # q{ÜÃœ} => "\N{U+00DC}", # q{ÝÃ} => "\N{U+00DD}", # q{žÅ¾} => "\N{U+017E}", # q{ÞÞ} => 
"\N{U+00DE}", # q{ŸÅ¸} => "\N{U+0178}", # q{ßß} => "\N{U+00DF}", # q{Â} => "\N{U+00A0}", # q{àÃ} => "\N{U+00E0}", # q{¡Â¡} => "\N{U+00A1}", # q{áá} => "\N{U+00E1}", # q{¢Â¢} => "\N{U+00A2}", # q{ââ} => "\N{U+00E2}", # q{£Â£} => "\N{U+00A3}", # q{ãã} => "\N{U+00E3}", # q{¤Â¤} => "\N{U+00A4}", # q{ää} => "\N{U+00E4}", # q{¥Â¥} => "\N{U+00A5}", # q{åÃ¥} => "\N{U+00E5}", # q{¦Â¦} => "\N{U+00A6}", # q{ææ} => "\N{U+00E6}", # q{§Â§} => "\N{U+00A7}", # q{çç} => "\N{U+00E7}", # q{¨Â¨} => "\N{U+00A8}", # q{èè} => "\N{U+00E8}", # q{©Â©} => "\N{U+00A9}", # q{éé} => "\N{U+00E9}", # q{ªÂª} => "\N{U+00AA}", # q{êê} => "\N{U+00EA}", # q{«Â«} => "\N{U+00AB}", # q{ëë} => "\N{U+00EB}", # q{¬Â¬} => "\N{U+00AC}", # q{ìì} => "\N{U+00EC}", # q{­Â­} => "\N{U+00AD}", # q{íí} => "\N{U+00ED}", # q{®Â®} => "\N{U+00AE}", # q{îî} => "\N{U+00EE}", # q{¯Â¯} => "\N{U+00AF}", # q{ïï} => "\N{U+00EF}", # q{°Â°} => "\N{U+00B0}", # q{ðð} => "\N{U+00F0}", # q{±Â±} => "\N{U+00B1}", # q{ññ} => "\N{U+00F1}", # q{²Â²} => "\N{U+00B2}", # q{òò} => "\N{U+00F2}", # q{³Â³} => "\N{U+00B3}", # q{óó} => "\N{U+00F3}", # q{´Â´} => "\N{U+00B4}", # q{ôô} => "\N{U+00F4}", # q{µÂµ} => "\N{U+00B5}", # q{õõ} => "\N{U+00F5}", # q{¶Â¶} => "\N{U+00B6}", # q{öö} => "\N{U+00F6}", # q{·Â·} => "\N{U+00B7}", # q{÷÷} => "\N{U+00F7}", # q{¸Â¸} => "\N{U+00B8}", # q{øø} => "\N{U+00F8}", # q{¹Â¹} => "\N{U+00B9}", # q{ùù} => "\N{U+00F9}", # q{ºÂº} => "\N{U+00BA}", # q{úú} => "\N{U+00FA}", # q{»Â»} => "\N{U+00BB}", # q{ûû} => "\N{U+00FB}", # q{¼Â¼} => "\N{U+00BC}", # q{üü} => "\N{U+00FC}", # q{½Â½} => "\N{U+00BD}", # q{ýý} => "\N{U+00FD}", # q{¾Â¾} => "\N{U+00BE}", # q{þþ} => "\N{U+00FE}", # q{¿Â¿} => "\N{U+00BF}", # q{ÿÿ} => "\N{U+00FF}", #); # # Build a regex from all of the hash keys # #my $regex = join('|',sort(keys(%map_latin1))); #$regex=qr{$regex}; #}}} # # Enable Unicode output mode # binmode STDOUT, ":encoding(UTF-8)"; binmode STDERR, ":encoding(UTF-8)"; 
#-------------------------------------------------------------------------------
# Options and arguments {{{
#-------------------------------------------------------------------------------
#
# Process options (Options() fills the hash from @ARGV and shows the usage
# message itself if the command line cannot be parsed)
#
my %options;
Options( \%options );

#
# Default help
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
    if ( $options{'help'} );

#
# Full documentation if requested with -doc
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
    if ( $options{'doc'} );

#
# Collect options, applying defaults for any that were not given
#
my $cfgfile = $options{config}    // $configfile;
my $dry_run = $options{'dry-run'} // 0;
my $verbose = $options{verbose}   // 0;
my $field   = $options{field};
my $skip    = $options{skip}  // 0;
my $limit   = $options{limit} // 0;

# }}}

#
# Sanity checks
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );

#
# The field name is interpolated into SQL later as a column name, so it must
# match one of the permitted names exactly. The pattern is anchored at both
# ends; an unanchored alternation would accept any string that merely
# contains one of the names.
#
if ($field) {
    $field = lc($field);
    die "Invalid value for -field=FIELD\n"
        unless ( $field =~ /\A(?:title|summary|tags|notes)\z/ );
}
else {
    $field = 'title';
}

#
# Getopt::Long's '=i' accepts negative integers, which make no sense as row
# counts
#
die "Invalid value for -skip=N\n"  if ( $skip < 0 );
die "Invalid value for -limit=N\n" if ( $limit < 0 );

#-------------------------------------------------------------------------------
# Load configuration data
#-------------------------------------------------------------------------------
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
# Host and port fall back to a local server on the default MySQL port; the
# database name, user and password are taken from the configuration as-is.
#
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or die $DBI::errstr;

#
# Enable client-side UTF8
#
# NOTE(review): the DBD::mysql documentation says that turning this flag on
# after connect() does not issue 'SET NAMES utf8' - confirm this is intended.
$dbh->{mysql_enable_utf8} = 1;

#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

$log->add(
    file => {
        timeformat => "%Y-%m-%d %H:%M:%S",
        filename   => $logfile,
        maxlevel   => 7,
        minlevel   => 0,
        utf8       => 1,
    }
);

#
# Log the settings being used
#
$log->info("---- Running version $VERSION");
$log->info("Configuration file $cfgfile");
$log->info("Processing field '$field'");
$log->info("Skipping $skip non-ASCII rows") if $skip;
$log->info("Update limit is $limit")        if $limit;
$log->info("Dry-run mode")                  if ($dry_run);

#
# Adjust limit. The loop below compares the limit with the count of rows
# viewed, which includes the skipped ones, so the limit is offset by the skip
# count. This is done only when a limit was requested: a zero limit means 'no
# limit' and has to stay zero, otherwise '-skip=N' on its own would end the
# run as soon as the skipped rows had been passed.
#
$limit += $skip if $limit;

#-------------------------------------------------------------------------------
# Perform a scan of episodes for the chosen field which contains non-ASCII
#-------------------------------------------------------------------------------
$sql = sprintf(
    q{SELECT id,%s FROM eps WHERE %s <> CONVERT(%s USING ASCII) ORDER BY id},
    $field, $field, $field
);

#
# A failure to run the main query is fatal; there is nothing to loop over
#
$sth1 = $dbh->prepare($sql) or die $DBI::errstr;
$sth1->execute or die $DBI::errstr;

#
# Prepare SQL::Abstract and the SQL template for the updates
#
my $sqla  = SQL::Abstract->new;
my $stmt1 = sprintf(
    q{UPDATE eps SET %s = CONVERT(BINARY CONVERT(%s USING latin1) USING utf8)},
    $field, $field
);

#-------------------------------------------------------------------------------
# Loop through what we get from the main query, attempting to convert each field
#-------------------------------------------------------------------------------
$viewed = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
    $viewed++;

    #
    # Pass over the rows to be skipped (the 'continue' block still runs)
    #
    next if $viewed <= $skip;

    #
    # Prepare the 'WHERE' part of the SQL
    #
    my %where = ( id => $h1->{id} );
    my ( $stmt2, @bind ) = $sqla->where( \%where );
    my $stmt = "${stmt1}${stmt2}";

    #
    # In dry-run mode just report what would have been done, otherwise try and
    # make the change.
    #
    if ($dry_run) {
        if ($verbose) {
            # The 'notes' field is not echoed, only its row id
            printf "[%04d] %s\n", $h1->{id},
                ( $field eq 'notes' ? '' : $h1->{$field} );
        }
        say "SQL: ${stmt}";
        say "Arguments: ", join( ',', @bind );
    }
    else {
        $sth2 = $dbh->prepare($stmt) or die $DBI::errstr;

        #
        # The SQL could generate an error which we'll try and intercept so
        # that one bad row does not stop the whole run
        #
        try {
            $sth2->execute(@bind) or die $DBI::errstr;
            $log->info("Updated $field field for row $h1->{id}");
        }
        catch ($e) {
            $log->info("Failed to update $field field for row $h1->{id}");
            $log->info("Error: $e");
        }
    }
}
continue {
    #
    # Stop once the (skip-adjusted) limit has been reached; zero means no limit
    #
    if ( $limit && $viewed >= $limit ) {
        $log->info("Update limit reached");
        last;
    }
}

#
# Release the query handle (it may still be active if the loop ended early)
# and close the database connection
#
$sth1->finish;
$dbh->disconnect;

exit;

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Hands the option specifications to GetOptions, which stores
#               the values it finds in the hash referenced by $optref. If
#               GetOptions reports a failure the usage message is shown via
#               pod2usage with an exit value of 1.
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",     "doc",     "dry-run!", "verbose!",
        "config=s", "field=s", "skip=i",   "limit=i",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

convert_latin1 - a script to convert fields in the HPR database to UTF-8

=head1 VERSION

This documentation refers to convert_latin1 version 0.1.2

=head1 USAGE

    ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
        [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]

    ./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title
    ./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes -limit=10

=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message describing the usage of the program, and then
exits.

=item B<-doc>

Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use:

    pod2pdf convert_latin1 --out=convert_latin1.pdf

=item B<-config=FILE>

This option allows an alternative configuration file to be used. This file
defines the location of the database, its port, its name and the username and
password to be used to access it. This feature was added to allow the script
to access alternative databases or the live database over an SSH tunnel.

See the CONFIGURATION AND ENVIRONMENT section below for the file format.

If the option is omitted the default file is used: B<.hpr_db.cfg>

=item B<-[no]dry-run>

Controls whether the program performs database updates or merely reports
them. When B<-dry-run> is enabled the details of the updates to be performed
are shown, otherwise the updates are applied. The default B<-nodry-run> allows
the program to perform the changes.

=item B<-[no]verbose>

Normally very little is reported by the script, although details of errors are
reported. When B<-verbose> is selected more information about the number of
rows needing work, the updates performed (or which would have been performed)
and how many changes were made is reported.

=item B<-field=FIELDNAME>

This option defines the database field name to be converted. The permitted
names are B<title>, B<summary>, B<tags> and B<notes> and the table is assumed
to be B<eps>.

If the option is not provided the default field B<title> will be used.

=item B<-skip=N>

This option defines the number of database rows to skip when processing the
selected field. If omitted then no rows are skipped. The option is useful to
allow the work to be split into manageable batches, in conjunction with the
B<-limit=N> option below.

=item B<-limit=N>

This option defines the number of database rows to work on when processing the
selected field. If omitted then all rows are processed (after any skip defined
with the B<-skip=N> option).
The option is useful to allow the work to be split into manageable batches, in
conjunction with the B<-skip=N> option above.

=back

=head1 DESCRIPTION

=head2 OVERVIEW

The script is designed to repair the HPR MySQL (MariaDB) database which holds
show metadata. The database was created with 'latin1' encoding, and was later
changed to use UTF-8. However, no action was taken to ensure the PHP software
managing the database also used UTF-8. This meant that the 'latin1' encoded
data was still being generated as Unicode UTF-8 data was being added, and was
being rendered in the expected way, while there was little or no UTF-8 data
being stored.

The PHP deficiencies were rectified in April 2023 but this meant that all
non-ASCII characters stored in the database before that were rendered
incorrectly. The solution was to convert all 'latin1' non-ASCII data into
UTF-8, and that is what this script does.

Detecting non-ASCII in database fields was performed with the following SQL:

    SELECT id,field FROM eps
    WHERE field <> CONVERT(field USING ASCII)
    ORDER BY id

This is used to generate a list of all rows which might need conversion to
UTF-8. However, the test is only whether there is non-ASCII data in the row.

Ideally, the conversion could have been performed entirely within the database
with SQL such as the following (for each field):

    UPDATE eps
    SET field = CONVERT(binary CONVERT(field USING latin1) USING utf8)
    WHERE field <> CONVERT(field USING ASCII);

However, the conversion to UTF-8 fails when the field already contains such
characters, stopping the query. MySQL and MariaDB are capable of trapping
errors (like using B<try/catch> in various programming languages), but only in
stored procedures. It was felt to be undesirable to create stored procedures
on the HPR database since this was only possible through B<phpMyAdmin> which
is due to be phased out.

This script was written to enable the catching of errors instead.

=head2 SCRIPT DESIGN

The main loop returns all rows with non-ASCII characters in the field being
processed. For each row an 'UPDATE' query is performed using the 'id' field
(episode number) to select it:

    UPDATE eps
    SET field = CONVERT(BINARY CONVERT(field USING latin1) USING utf8)
    WHERE id = value

This is performed inside a B<try/catch> statement so that if the query fails
it does not stop the script. Successes and failures are logged.

This algorithm is fairly slow, particularly for the 'notes' field which has
the most (nearly 600) non-ASCII rows. However, it seems to work as desired.

The B<-skip=N> and B<-limit=N> options allow control over the conversion
process such that the work can be done in batches.

Note that the log file used by the script is called B<convert_latin1.log>. It
is appended to on every run. The file name can only be changed by editing the
script.

=head1 DIAGNOSTICS

A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.

=head1 CONFIGURATION AND ENVIRONMENT

The script obtains the credentials it requires to open the HPR database from
a configuration file. The name of the file it expects is B<.hpr_db.cfg> in the
directory B<$HOME/HPR/Database> (this location is hard-coded in the script).
This can be changed by use of the B<-config=FILE> option as described above.

The configuration file format is as follows:

    <database>
        host = 127.0.0.1
        port = PORT
        name = DATABASE
        user = USERNAME
        password = PASSWORD
    </database>

=head1 DEPENDENCIES

    Config::General
    DBI
    Data::Dumper
    Getopt::Long
    Log::Handler
    Log::Handler::Output::File
    Pod::Usage
    SQL::Abstract

The script uses the experimental B<try> feature and disables the warning that
this feature generates.
Note that this feature is only available in Perl versions at 5.34.0 or above
(the script was developed under v5.36.0).

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.

=head1 AUTHOR

Dave Morriss (Dave.Morriss@gmail.com)

=head1 LICENCE AND COPYRIGHT

Copyright (c) 2023 Dave Morriss (Dave.Morriss@gmail.com).
All rights reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut

#}}}

# [zo to open fold, zc to close]

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker