#!/usr/bin/env perl
#===============================================================================
#
#         FILE: convert_latin1
#
#        USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
#               [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
#
#  DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR
#               database
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.1.2
#      CREATED: 2023-05-04 10:07:04
#     REVISION: 2023-05-08 12:15:49
#
#===============================================================================

# The 'try' feature and the 'experimental::try' warning category used below
# first appeared in Perl 5.34, so that is the real minimum version.
use v5.34;
use strict;
use warnings;
#use utf8;

# Using experimental features, some of which require warnings to be turned off
use feature qw{ postderef say signatures state try };
no warnings qw{
    experimental::postderef
    experimental::signatures
    experimental::try
};

use Getopt::Long;
use Pod::Usage;

use Config::General;
#use Encode qw( encode decode is_utf8 );

#use Try::Tiny;
#use TryCatch;

use SQL::Abstract;
use DBI;

use Log::Handler;
use Log::Handler::Output::File;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.1.2';

#
# Script and directory names
#
# $PROG is $0 with any leading directory part removed; $DIR is $0 with the
# final path component removed, falling back to '.' when nothing is left.
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir    = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my $logfile    = "$basedir/${PROG}.log";

#
# File-scoped handles and work variables used by the main part of the script
#
my ( $dbh, $sth1, $sth2, $h1 );
my ( $sql, $viewed );

#
# Map of latin1 characters with their Unicode equivalents {{{
#
# Commented out 2023-05-10 since no longer wanted
#
#my %map_latin1 = (
#    q{€â‚¬} => "\N{U+20AC}",
#    q{ÀÀ} => "\N{U+00C0}",
#    q{ÁÃ} => "\N{U+00C1}",
#    q{‚‚} => "\N{U+201A}",
#    q{ÂÂ} => "\N{U+00C2}",
#    q{ƒÆ’} => "\N{U+0192}",
#    q{ÃÃ} => "\N{U+00C3}",
#    q{„„} => "\N{U+201E}",
#    q{ÄÄ} => "\N{U+00C4}",
#    q{……} => "\N{U+2026}",
#    q{ÅÃ…} => "\N{U+00C5}",
#    q{†â€} => "\N{U+2020}",
#    q{ÆÆ} => "\N{U+00C6}",
#    q{‡â€¡} => "\N{U+2021}",
#    q{ÇÇ} => "\N{U+00C7}",
#    q{ˆË†} => "\N{U+02C6}",
#    q{ÈÈ} => "\N{U+00C8}",
#    q{‰â€°} => "\N{U+2030}",
#    q{ÉÉ} => "\N{U+00C9}",
#    q{ŠÅ} => "\N{U+0160}",
#    q{ÊÊ} => "\N{U+00CA}",
#    q{‹â€¹} => "\N{U+2039}",
#    q{ËË} => "\N{U+00CB}",
#    q{ŒÅ’} => "\N{U+0152}",
#    q{ÌÃŒ} => "\N{U+00CC}",
#    q{ÍÃ} => "\N{U+00CD}",
#    q{ŽÅ½} => "\N{U+017D}",
#    q{ÎÃŽ} => "\N{U+00CE}",
#    q{ÏÃ} => "\N{U+00CF}",
#    q{ÐÃ} => "\N{U+00D0}",
#    q{‘‘} => "\N{U+2018}",
#    q{ÑÑ} => "\N{U+00D1}",
#    q{’’} => "\N{U+2019}",
#    q{ÒÃ’} => "\N{U+00D2}",
#    q{““} => "\N{U+201C}",
#    q{ÓÓ} => "\N{U+00D3}",
#    q{”â€} => "\N{U+201D}",
#    q{ÔÔ} => "\N{U+00D4}",
#    q{•â€¢} => "\N{U+2022}",
#    q{ÕÕ} => "\N{U+00D5}",
#    q{––} => "\N{U+2013}",
#    q{ÖÖ} => "\N{U+00D6}",
#    q{——} => "\N{U+2014}",
#    q{××} => "\N{U+00D7}",
#    q{˜Ëœ} => "\N{U+02DC}",
#    q{ØØ} => "\N{U+00D8}",
#    q{™â„¢} => "\N{U+2122}",
#    q{ÙÙ} => "\N{U+00D9}",
#    q{šÅ¡} => "\N{U+0161}",
#    q{ÚÚ} => "\N{U+00DA}",
#    q{›â€º} => "\N{U+203A}",
#    q{ÛÛ} => "\N{U+00DB}",
#    q{œÅ“} => "\N{U+0153}",
#    q{ÜÃœ} => "\N{U+00DC}",
#    q{ÝÃ} => "\N{U+00DD}",
#    q{žÅ¾} => "\N{U+017E}",
#    q{ÞÞ} => "\N{U+00DE}",
#    q{ŸÅ¸} => "\N{U+0178}",
#    q{ßß} => "\N{U+00DF}",
#    q{Â} => "\N{U+00A0}",
#    q{àÃ} => "\N{U+00E0}",
#    q{¡Â¡} => "\N{U+00A1}",
#    q{áá} => "\N{U+00E1}",
#    q{¢Â¢} => "\N{U+00A2}",
#    q{ââ} => "\N{U+00E2}",
#    q{£Â£} => "\N{U+00A3}",
#    q{ãã} => "\N{U+00E3}",
#    q{¤Â¤} => "\N{U+00A4}",
#    q{ää} => "\N{U+00E4}",
#    q{¥Â¥} => "\N{U+00A5}",
#    q{åÃ¥} => "\N{U+00E5}",
#    q{¦Â¦} => "\N{U+00A6}",
#    q{ææ} => "\N{U+00E6}",
#    q{§Â§} => "\N{U+00A7}",
#    q{çç} => "\N{U+00E7}",
#    q{¨Â¨} => "\N{U+00A8}",
#    q{èè} => "\N{U+00E8}",
#    q{©Â©} => "\N{U+00A9}",
#    q{éé} => "\N{U+00E9}",
#    q{ªÂª} => "\N{U+00AA}",
#    q{êê} => "\N{U+00EA}",
#    q{«Â«} => "\N{U+00AB}",
#    q{ëë} => "\N{U+00EB}",
#    q{¬Â¬} => "\N{U+00AC}",
#    q{ìì} => "\N{U+00EC}",
#    q{Â} => "\N{U+00AD}",
#    q{íÃ} => "\N{U+00ED}",
#    q{®Â®} => "\N{U+00AE}",
#    q{îî} => "\N{U+00EE}",
#    q{¯Â¯} => "\N{U+00AF}",
#    q{ïï} => "\N{U+00EF}",
#    q{°Â°} => "\N{U+00B0}",
#    q{ðð} => "\N{U+00F0}",
#    q{±Â±} => "\N{U+00B1}",
#    q{ññ} => "\N{U+00F1}",
#    q{²Â²} => "\N{U+00B2}",
#    q{òò} => "\N{U+00F2}",
#    q{³Â³} => "\N{U+00B3}",
#    q{óó} => "\N{U+00F3}",
#    q{´Â´} => "\N{U+00B4}",
#    q{ôô} => "\N{U+00F4}",
#    q{µÂµ} => "\N{U+00B5}",
#    q{õõ} => "\N{U+00F5}",
#    q{¶Â¶} => "\N{U+00B6}",
#    q{öö} => "\N{U+00F6}",
#    q{·Â·} => "\N{U+00B7}",
#    q{÷÷} => "\N{U+00F7}",
#    q{¸Â¸} => "\N{U+00B8}",
#    q{øø} => "\N{U+00F8}",
#    q{¹Â¹} => "\N{U+00B9}",
#    q{ùù} => "\N{U+00F9}",
#    q{ºÂº} => "\N{U+00BA}",
#    q{úú} => "\N{U+00FA}",
#    q{»Â»} => "\N{U+00BB}",
#    q{ûû} => "\N{U+00FB}",
#    q{¼Â¼} => "\N{U+00BC}",
#    q{üü} => "\N{U+00FC}",
#    q{½Â½} => "\N{U+00BD}",
#    q{ýý} => "\N{U+00FD}",
#    q{¾Â¾} => "\N{U+00BE}",
#    q{þþ} => "\N{U+00FE}",
#    q{¿Â¿} => "\N{U+00BF}",
#    q{ÿÿ} => "\N{U+00FF}",
#);

#
# Build a regex from all of the hash keys
#
#my $regex = join('|',sort(keys(%map_latin1)));
#$regex=qr{$regex};
#}}}

#
# Enable Unicode output mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#-------------------------------------------------------------------------------
# Options and arguments {{{
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

#
# Default help
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
    if ( $options{'help'} );

#
# Full documentation if requested with -doc
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
    if ( $options{'doc'} );

#
# Collect options, applying defaults where an option was not given
#
my $cfgfile = $options{config}    // $configfile;
my $dry_run = $options{'dry-run'} // 0;
my $verbose = $options{verbose}   // 0;
my $field   = $options{field};
my $skip    = $options{skip}      // 0;
my $limit   = $options{limit}     // 0;

# }}}

#
# Sanity checks
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );

#
# The field name is later interpolated directly into SQL text, so it must
# match one of the permitted names *exactly*. The pattern is anchored at both
# ends so that values such as 'xtitle' or 'title; ...' are rejected.
#
if ($field) {
    $field = lc($field);
    die "Invalid value for -field=FIELD\n"
        unless ( $field =~ /\A(?:title|summary|tags|notes)\z/ );
}
else {
    $field = 'title';
}

#
# Negative counts make no sense for the skip and limit arithmetic
#
die "Invalid value for -skip=N\n"  if ( $skip < 0 );
die "Invalid value for -limit=N\n" if ( $limit < 0 );

#-------------------------------------------------------------------------------
# Load configuration data
#-------------------------------------------------------------------------------
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
#
# Host and port fall back to a local server on the standard MySQL port when
# the configuration file does not provide them.
#
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};

$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or die $DBI::errstr;

#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;
# NOTE(review): the DBD::mysql documentation says this attribute only causes
# 'SET NAMES utf8' to be issued when passed to connect(); set afterwards, as
# here, it presumably only affects how returned strings are flagged. Confirm
# this is the intended behaviour before changing it.

#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

$log->add(
    file => {
        timeformat => "%Y-%m-%d %H:%M:%S",
        filename   => $logfile,
        maxlevel   => 7,
        minlevel   => 0,
        utf8       => 1,
    }
);

#
# Log the settings being used
#
$log->info("---- Running version $VERSION");
$log->info("Configuration file $cfgfile");
$log->info("Processing field '$field'");
$log->info("Skipping $skip non-ASCII rows") if $skip;
$log->info("Update limit is $limit")        if $limit;
$log->info("Dry-run mode")                  if ($dry_run);

#
# Adjust limit. The loop below counts every row it sees (including skipped
# ones) in $viewed, so the stopping point has to include the skipped rows.
#
$limit += $skip if $skip;

#-------------------------------------------------------------------------------
# Perform a scan of episodes for the chosen field which contains non-ASCII
#-------------------------------------------------------------------------------
$sql = sprintf(
    q{SELECT id,%s FROM eps WHERE %s <> CONVERT(%s USING ASCII) ORDER BY id},
    $field, $field, $field
);

$sth1 = $dbh->prepare($sql) or die $DBI::errstr;

#
# If the scan itself fails there is nothing to iterate over, so stop here
# rather than warning and carrying on with a failed statement handle.
#
$sth1->execute or die $dbh->errstr;

#
# Prepare SQL::Abstract and the SQL template for the updates
#
my $sqla = SQL::Abstract->new;

my $stmt1 = sprintf(
    q{UPDATE eps SET %s = CONVERT(BINARY CONVERT(%s USING latin1) USING utf8)},
    $field, $field
);

#-------------------------------------------------------------------------------
# Loop through what we get from the main query, attempting to convert each field
#-------------------------------------------------------------------------------
$viewed = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
    $viewed++;

    #
    # Skipped rows still pass through the 'continue' block below
    #
    next if $viewed <= $skip;

    #
    # Prepare the 'WHERE' part of the SQL
    #
    my %where = ( id => $h1->{id} );
    my ( $stmt2, @bind ) = $sqla->where( \%where );
    my $stmt = "${stmt1}${stmt2}";

    #
    # In dry-run mode just report what would have been done, otherwise try and
    # make the change.
    #
    if ($dry_run) {
        if ($verbose) {
            # The 'notes' field is not echoed; only the row id is shown
            printf "[%04d] %s\n", $h1->{id},
                ( $field eq 'notes' ? '' : $h1->{$field} );
        }
        say "SQL: ${stmt}";
        say "Arguments: ", join( ',', @bind );
    }
    else {
        $sth2 = $dbh->prepare($stmt) or die $DBI::errstr;

        #
        # The SQL could generate an error which we'll try and intercept.
        # A failure on one row is logged (at error level) and the loop
        # moves on to the next row.
        #
        try {
            $sth2->execute(@bind)
                or die $DBI::errstr;
            $log->info("Updated $field field for row $h1->{id}");
        }
        catch ($e) {
            $log->error("Failed to update $field field for row $h1->{id}");
            $log->error("Error: $e");
        }
    }
}
continue {
    #
    # A limit of zero means 'no limit'
    #
    if ( $limit && $viewed >= $limit ) {
        $log->info("Update limit reached");
        last;
    }
}

#
# Release the scan handle (it may still have unfetched rows if the limit was
# reached) and close the database connection cleanly.
#
$sth1->finish;
$dbh->disconnect;

exit;

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Hands the option specifications to Getopt::Long's GetOptions,
#               which stores the parsed values in the hash referenced by
#               $optref. If GetOptions reports a failure, the usage message
#               is shown via pod2usage with an exit value of 1.
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",     "doc",     "dry-run!", "verbose!",
        "config=s", "field=s", "skip=i",   "limit=i",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

convert_latin1 - a script to convert fields in the HPR database to UTF-8

=head1 VERSION

This documentation refers to convert_latin1 version 0.1.2

=head1 USAGE

    ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
        [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]

    ./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title

    ./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes
        -limit=10

=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message describing the usage of the program, and then
exits.

=item B<-doc>

Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use:

    pod2pdf convert_latin1 --out=convert_latin1.pdf

=item B<-config=FILE>

This option allows an alternative configuration file to be used. This file
defines the location of the database, its port, its name and the username and
password to be used to access it. This feature was added to allow the script to
access alternative databases or the live database over an SSH tunnel.

See the CONFIGURATION AND ENVIRONMENT section below for the file format.

If the option is omitted the default file is used: B<.hpr_db.cfg>

=item B<-[no]dry-run>

Controls whether the program applies its database updates or merely reports
them. When enabled (B<-dry-run>) the details of the updates that would be
performed are shown and nothing is changed; otherwise the updates are applied.

The default B<-nodry-run> allows the program to perform the changes.

=item B<-[no]verbose>

Normally very little is reported by the script, although details of errors are
reported. When B<-verbose> is selected more information about the number of
rows needing work, the updates performed (or which would have been performed)
and how many changes were made is reported.

=item B<-field=FIELDNAME>

This option defines the database field name to be converted. The permitted
names are B