forked from HPR/hpr-tools
		
	
		
			
				
	
	
		
			658 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			658 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env perl
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: convert_latin1
 | |
| #
 | |
| #        USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-debug=N]
 | |
| #
 | |
| #  DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR
 | |
| #               database
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.1.2
 | |
| #      CREATED: 2023-05-04 10:07:04
 | |
| #     REVISION: 2023-05-08 12:15:49
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| use v5.16;
 | |
| use strict;
 | |
| use warnings;
 | |
| #use utf8;
 | |
| 
 | |
| # Using experimental features, some of which require warnings to be turned off
 | |
| use feature qw{ postderef say signatures state try };
 | |
| no warnings qw{
 | |
|     experimental::postderef
 | |
|     experimental::signatures
 | |
|     experimental::try
 | |
| };
 | |
| 
 | |
| use Getopt::Long;
 | |
| use Pod::Usage;
 | |
| 
 | |
| use Config::General;
 | |
| 
 | |
| #use Encode qw( encode decode is_utf8 );
 | |
| #use Try::Tiny;
 | |
| #use TryCatch;
 | |
| 
 | |
| use SQL::Abstract;
 | |
| use DBI;
 | |
| 
 | |
| use Log::Handler;
 | |
| use Log::Handler::Output::File;
 | |
| 
 | |
| use Data::Dumper;
 | |
| 
 | |
| #
 | |
| # Version number (manually incremented)
 | |
| #
 | |
| our $VERSION = '0.1.2';
 | |
| 
 | |
| #
 | |
| # Script and directory names
 | |
| #
 | |
| ( my $PROG = $0 ) =~ s|.*/||mx;
 | |
| ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
 | |
| $DIR = '.' unless $DIR;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Declarations
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Constants and other declarations
 | |
| #
 | |
| my $basedir    = "$ENV{HOME}/HPR/Database";
 | |
| my $configfile = "$basedir/.hpr_db.cfg";
 | |
| my $logfile    = "$basedir/${PROG}.log";
 | |
| 
 | |
| my ( $dbh, $sth1, $sth2, $h1 );
 | |
| my ( $sql, $utf8, $viewed );
 | |
| 
 | |
| #
 | |
| # Map of latin1 characters with their Unicode equivalents {{{
 | |
| #
 | |
| # Commented out 2023-05-10 since no longer wanted
 | |
| #
 | |
| #my %map_latin1 = (
 | |
| #    q{€â‚¬}  => "\N{U+20AC}",
 | |
| #    q{ÀÀ}   => "\N{U+00C0}",
 | |
| #    q{ÁÃ}    => "\N{U+00C1}",
 | |
| #    q{‚‚}  => "\N{U+201A}",
 | |
| #    q{ÂÂ}   => "\N{U+00C2}",
 | |
| #    q{ƒÆ’}   => "\N{U+0192}",
 | |
| #    q{ÃÃ}   => "\N{U+00C3}",
 | |
| #    q{„„}  => "\N{U+201E}",
 | |
| #    q{ÄÄ}   => "\N{U+00C4}",
 | |
| #    q{……}  => "\N{U+2026}",
 | |
| #    q{ÅÃ…}   => "\N{U+00C5}",
 | |
| #    q{†â€}   => "\N{U+2020}",
 | |
| #    q{ÆÃ†}   => "\N{U+00C6}",
 | |
| #    q{‡â€¡}  => "\N{U+2021}",
 | |
| #    q{ÇÇ}   => "\N{U+00C7}",
 | |
| #    q{ˆË†}   => "\N{U+02C6}",
 | |
| #    q{ÈÈ}   => "\N{U+00C8}",
 | |
| #    q{‰â€°}  => "\N{U+2030}",
 | |
| #    q{ÉÉ}   => "\N{U+00C9}",
 | |
| #    q{ŠÅ}    => "\N{U+0160}",
 | |
| #    q{ÊÊ}   => "\N{U+00CA}",
 | |
| #    q{‹â€¹}  => "\N{U+2039}",
 | |
| #    q{ËË}   => "\N{U+00CB}",
 | |
| #    q{ŒÅ’}   => "\N{U+0152}",
 | |
| #    q{ÌÃŒ}   => "\N{U+00CC}",
 | |
| #    q{ÍÃ}    => "\N{U+00CD}",
 | |
| #    q{ŽÅ½}   => "\N{U+017D}",
 | |
| #    q{ÎÃŽ}   => "\N{U+00CE}",
 | |
| #    q{ÏÃ}    => "\N{U+00CF}",
 | |
| #    q{ÐÃ}    => "\N{U+00D0}",
 | |
| #    q{‘‘}  => "\N{U+2018}",
 | |
| #    q{ÑÑ}   => "\N{U+00D1}",
 | |
| #    q{Չ۪}  => "\N{U+2019}",
 | |
| #    q{ÒÃ’}   => "\N{U+00D2}",
 | |
| #    q{““}  => "\N{U+201C}",
 | |
| #    q{ÓÓ}   => "\N{U+00D3}",
 | |
| #    q{”â€}   => "\N{U+201D}",
 | |
| #    q{ÔÔ}   => "\N{U+00D4}",
 | |
| #    q{••}  => "\N{U+2022}",
 | |
| #    q{ÕÕ}   => "\N{U+00D5}",
 | |
| #    q{––}  => "\N{U+2013}",
 | |
| #    q{ÖÖ}   => "\N{U+00D6}",
 | |
| #    q{——}  => "\N{U+2014}",
 | |
| #    q{××}   => "\N{U+00D7}",
 | |
| #    q{˜Ëœ}   => "\N{U+02DC}",
 | |
| #    q{ØÃ˜}   => "\N{U+00D8}",
 | |
| #    q{™â„¢}  => "\N{U+2122}",
 | |
| #    q{ÙÙ}   => "\N{U+00D9}",
 | |
| #    q{šÅ¡}   => "\N{U+0161}",
 | |
| #    q{ÚÚ}   => "\N{U+00DA}",
 | |
| #    q{݉ۼ}  => "\N{U+203A}",
 | |
| #    q{ÛÛ}   => "\N{U+00DB}",
 | |
| #    q{œÅ“}   => "\N{U+0153}",
 | |
| #    q{ÜÜ}   => "\N{U+00DC}",
 | |
| #    q{ÝÃ}    => "\N{U+00DD}",
 | |
| #    q{žÅ¾}   => "\N{U+017E}",
 | |
| #    q{ÞÞ}   => "\N{U+00DE}",
 | |
| #    q{ŸÅ¸}   => "\N{U+0178}",
 | |
| #    q{ßß}   => "\N{U+00DF}",
 | |
| #    q{Â}     => "\N{U+00A0}",
 | |
| #    q{àÃ}    => "\N{U+00E0}",
 | |
| #    q{¡Â¡}   => "\N{U+00A1}",
 | |
| #    q{áá}   => "\N{U+00E1}",
 | |
| #    q{¢Â¢}   => "\N{U+00A2}",
 | |
| #    q{ââ}   => "\N{U+00E2}",
 | |
| #    q{£Â£}   => "\N{U+00A3}",
 | |
| #    q{ãã}   => "\N{U+00E3}",
 | |
| #    q{¤Â¤}   => "\N{U+00A4}",
 | |
| #    q{ää}   => "\N{U+00E4}",
 | |
| #    q{¥Â¥}   => "\N{U+00A5}",
 | |
| #    q{åÃ¥}   => "\N{U+00E5}",
 | |
| #    q{¦Â¦}   => "\N{U+00A6}",
 | |
| #    q{æÃ¦}   => "\N{U+00E6}",
 | |
| #    q{§Â§}   => "\N{U+00A7}",
 | |
| #    q{çç}   => "\N{U+00E7}",
 | |
| #    q{¨Â¨}   => "\N{U+00A8}",
 | |
| #    q{èè}   => "\N{U+00E8}",
 | |
| #    q{©Â©}   => "\N{U+00A9}",
 | |
| #    q{éé}   => "\N{U+00E9}",
 | |
| #    q{ªÂª}   => "\N{U+00AA}",
 | |
| #    q{êê}   => "\N{U+00EA}",
 | |
| #    q{«Â«}   => "\N{U+00AB}",
 | |
| #    q{ëë}   => "\N{U+00EB}",
 | |
| #    q{¬Â¬}   => "\N{U+00AC}",
 | |
| #    q{ìì}   => "\N{U+00EC}",
 | |
| #    q{Â}   => "\N{U+00AD}",
 | |
| #    q{íÃ}   => "\N{U+00ED}",
 | |
| #    q{®Â®}   => "\N{U+00AE}",
 | |
| #    q{îî}   => "\N{U+00EE}",
 | |
| #    q{¯Â¯}   => "\N{U+00AF}",
 | |
| #    q{ïï}   => "\N{U+00EF}",
 | |
| #    q{°Â°}   => "\N{U+00B0}",
 | |
| #    q{ðð}   => "\N{U+00F0}",
 | |
| #    q{±Â±}   => "\N{U+00B1}",
 | |
| #    q{ññ}   => "\N{U+00F1}",
 | |
| #    q{²Â²}   => "\N{U+00B2}",
 | |
| #    q{òò}   => "\N{U+00F2}",
 | |
| #    q{³Â³}   => "\N{U+00B3}",
 | |
| #    q{óó}   => "\N{U+00F3}",
 | |
| #    q{´Â´}   => "\N{U+00B4}",
 | |
| #    q{ôô}   => "\N{U+00F4}",
 | |
| #    q{µÂµ}   => "\N{U+00B5}",
 | |
| #    q{õõ}   => "\N{U+00F5}",
 | |
| #    q{¶Â¶}   => "\N{U+00B6}",
 | |
| #    q{öö}   => "\N{U+00F6}",
 | |
| #    q{·Â·}   => "\N{U+00B7}",
 | |
| #    q{÷÷}   => "\N{U+00F7}",
 | |
| #    q{¸Â¸}   => "\N{U+00B8}",
 | |
| #    q{øÃ¸}   => "\N{U+00F8}",
 | |
| #    q{¹Â¹}   => "\N{U+00B9}",
 | |
| #    q{ùù}   => "\N{U+00F9}",
 | |
| #    q{ºÂº}   => "\N{U+00BA}",
 | |
| #    q{úú}   => "\N{U+00FA}",
 | |
| #    q{»Â»}   => "\N{U+00BB}",
 | |
| #    q{ûû}   => "\N{U+00FB}",
 | |
| #    q{¼Â¼}   => "\N{U+00BC}",
 | |
| #    q{üü}   => "\N{U+00FC}",
 | |
| #    q{½Â½}   => "\N{U+00BD}",
 | |
| #    q{ýý}   => "\N{U+00FD}",
 | |
| #    q{¾Â¾}   => "\N{U+00BE}",
 | |
| #    q{þþ}   => "\N{U+00FE}",
 | |
| #    q{¿Â¿}   => "\N{U+00BF}",
 | |
| #    q{ÿÿ}   => "\N{U+00FF}",
 | |
| #);
 | |
| 
 | |
| #
 | |
| # Build a regex from all of the hash keys
 | |
| #
 | |
| #my $regex = join('|',sort(keys(%map_latin1)));
 | |
| #$regex=qr{$regex};
 | |
| 
 | |
| #}}}
 | |
| 
 | |
| #
 | |
| # Enable Unicode output mode
 | |
| #
 | |
| binmode STDOUT, ":encoding(UTF-8)";
 | |
| binmode STDERR, ":encoding(UTF-8)";
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Options and arguments {{{
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Process options
 | |
| #
 | |
| my %options;
 | |
| Options( \%options );
 | |
| 
 | |
| #
 | |
| # Default help
 | |
| #
 | |
| pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
 | |
|     if ( $options{'help'} );
 | |
| 
 | |
| #
 | |
| # Full documentation if requested with -doc
 | |
| #
 | |
| pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
 | |
|     if ( $options{'doc'} );
 | |
| 
 | |
| #
 | |
| # Collect options
 | |
| #
 | |
| my $cfgfile
 | |
|     = ( defined( $options{config} ) ? $options{config} : $configfile );
 | |
| my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
 | |
| my $verbose = ( defined( $options{verbose} )   ? $options{verbose}   : 0 );
 | |
| my $field   = $options{field};
 | |
| my $skip    = $options{skip} // 0;
 | |
| my $limit   = $options{limit} // 0;
 | |
| 
 | |
| # }}}
 | |
| 
 | |
| #
 | |
| # Sanity checks
 | |
| #
 | |
| die "Unable to find $cfgfile\n" unless ( -e $cfgfile );
 | |
| if ($field) {
 | |
|     $field = lc($field);
 | |
|     die "Invalid value for -field=FIELD\n"
 | |
|         unless ( $field =~ /title|summary|tags|notes/ );
 | |
| }
 | |
| else {
 | |
|     $field = 'title';
 | |
| }
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Load configuration data
 | |
| #-------------------------------------------------------------------------------
 | |
| my $conf = new Config::General(
 | |
|     -ConfigFile      => $cfgfile,
 | |
|     -InterPolateVars => 1,
 | |
|     -ExtendedAccess  => 1
 | |
| );
 | |
| my %config = $conf->getall();
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Connect to the database
 | |
| #-------------------------------------------------------------------------------
 | |
| my $dbhost = $config{database}->{host} // '127.0.0.1';
 | |
| my $dbport = $config{database}->{port} // 3306;
 | |
| my $dbname = $config{database}->{name};
 | |
| my $dbuser = $config{database}->{user};
 | |
| my $dbpwd  = $config{database}->{password};
 | |
| $dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
 | |
|     $dbuser, $dbpwd, { AutoCommit => 1 } )
 | |
|     or die $DBI::errstr;
 | |
| 
 | |
| #
 | |
| # Enable client-side UTF8
 | |
| #
 | |
| $dbh->{mysql_enable_utf8} = 1;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Set up logging keeping the default log layout except for the date
 | |
| #-------------------------------------------------------------------------------
 | |
| my $log = Log::Handler->new();
 | |
| 
 | |
| $log->add(
 | |
|     file => {
 | |
|         timeformat => "%Y-%m-%d %H:%M:%S",
 | |
|         filename   => $logfile,
 | |
|         maxlevel   => 7,
 | |
|         minlevel   => 0,
 | |
|         utf8       => 1,
 | |
|     }
 | |
| );
 | |
| 
 | |
| #
 | |
| # Log the settings being used
 | |
| #
 | |
| $log->info("---- Running version $VERSION");
 | |
| $log->info("Configuration file $cfgfile");
 | |
| $log->info("Processing field '$field'");
 | |
| $log->info("Skipping $skip non-ASCII rows") if $skip;
 | |
| $log->info("Update limit is $limit") if $limit;
 | |
| $log->info("Dry-run mode") if ($dry_run);
 | |
| 
 | |
| #
 | |
| # Adjust limit
 | |
| #
 | |
| $limit += $skip if $skip;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Perform a scan of episodes for the chosen field which contains non-ASCII
 | |
| #-------------------------------------------------------------------------------
 | |
| $sql = sprintf(
 | |
|     q{SELECT id,%s FROM eps WHERE %s <> CONVERT(%s USING ASCII) ORDER BY id},
 | |
|     $field, $field, $field
 | |
| );
 | |
| 
 | |
| $sth1 = $dbh->prepare($sql) or die $DBI::errstr;
 | |
| 
 | |
| $sth1->execute;
 | |
| if ( $dbh->err ) {
 | |
|     warn $dbh->errstr;
 | |
| }
 | |
| 
 | |
| #
 | |
| # Prepare SQL::Abstract and the SQL template for the updates
 | |
| #
 | |
| my $sqla = SQL::Abstract->new;
 | |
| 
 | |
| my $stmt1 = sprintf(
 | |
|     q{UPDATE eps SET %s = CONVERT(BINARY CONVERT(%s USING latin1) USING utf8)},
 | |
|     $field, $field
 | |
| );
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Loop through what we get from the main query, attempting to convert each field
 | |
| #-------------------------------------------------------------------------------
 | |
| $viewed = 0;
 | |
| while ( $h1 = $sth1->fetchrow_hashref ) {
 | |
|     $viewed++;
 | |
|     next if $viewed <= $skip;
 | |
| 
 | |
|     #
 | |
|     # Prepare the 'WHERE' part of the SQL
 | |
|     #
 | |
|     my %where = ( id => $h1->{id} );
 | |
|     my ( $stmt2, @bind ) = $sqla->where( \%where );
 | |
|     my $stmt = "${stmt1}${stmt2}";
 | |
| 
 | |
|     #
 | |
|     # In dry-run mode just report what would have been done, otherwise try and
 | |
|     # make the change.
 | |
|     #
 | |
|     if ($dry_run) {
 | |
|         if ($verbose) {
 | |
|             printf "[%04d] %s\n", $h1->{id},
 | |
|                 (
 | |
|                   $field eq 'notes'
 | |
|                 ? ''
 | |
|                 : $h1->{$field}
 | |
|                 );
 | |
|         }
 | |
| 
 | |
|         say "SQL: ${stmt}";
 | |
|         say "Arguments: ",join( ',', @bind );
 | |
|     }
 | |
|     else {
 | |
|         $sth2 = $dbh->prepare($stmt) or die $DBI::errstr;
 | |
| 
 | |
|         #
 | |
|         # The SQL could generate an error which we'll try and intercept
 | |
|         #
 | |
|         try {
 | |
|             $sth2->execute(@bind)
 | |
|                 or die $DBI::errstr;
 | |
|             $log->info("Updated $field field for row $h1->{id}");
 | |
|         }
 | |
|         catch ($e) {
 | |
|             $log->info("Failed to update $field field for row $h1->{id}");
 | |
|             $log->info("Error: $e");
 | |
|         }
 | |
|     }
 | |
| 
 | |
| }
 | |
| continue {
 | |
|     if ($limit) {
 | |
|         if ($viewed >= $limit) {
 | |
|             $log->info("Update limit reached");
 | |
|             last;
 | |
|         };
 | |
|     }
 | |
| }
 | |
| 
 | |
| exit;
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Options
 | |
| #      PURPOSE: Processes command-line options
 | |
| #   PARAMETERS: $optref     Hash reference to hold the options
 | |
| #      RETURNS: Undef
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: no exceptions
 | |
| #     COMMENTS: none
 | |
| #     SEE ALSO: n/a
 | |
| #===============================================================================
 | |
| sub Options {
 | |
|     my ($optref) = @_;
 | |
| 
 | |
|     my @options = (
 | |
|         "help",     "doc",     "dry-run!", "verbose!",
 | |
|         "config=s", "field=s", "skip=i",   "limit=i",
 | |
|     );
 | |
| 
 | |
|     if ( !GetOptions( $optref, @options ) ) {
 | |
|         pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| __END__
 | |
| 
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #  Application Documentation
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #{{{
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| convert_latin1 - a script to convert fields in the HPR database to UTF-8
 | |
| 
 | |
| =head1 VERSION
 | |
| 
 | |
| This documentation refers to convert_latin1 version 0.1.2
 | |
| 
 | |
| 
 | |
| =head1 USAGE
 | |
| 
 | |
|     ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
 | |
|         [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
 | |
| 
 | |
|     ./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title
 | |
|     ./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes
 | |
|         -limit=10
 | |
| 
 | |
| 
 | |
| =head1 OPTIONS
 | |
| 
 | |
| =over 8
 | |
| 
 | |
| =item B<-help>
 | |
| 
 | |
| Prints a brief help message describing the usage of the program, and then exits.
 | |
| 
 | |
| =item B<-doc>
 | |
| 
 | |
| Displays the entirety of the documentation (using a pager), and then exits. To
 | |
| generate a PDF version use:
 | |
| 
 | |
|     pod2pdf convert_latin1 --out=convert_latin1.pdf
 | |
| 
 | |
| =item B<-config=FILE>
 | |
| 
 | |
| This option allows an alternative configuration file to be used. This file
 | |
| defines the location of the database, its port, its name and the username and
 | |
| password to be used to access it. This feature was added to allow the script
 | |
| to access alternative databases or the live database over an SSH tunnel.
 | |
| 
 | |
| See the CONFIGURATION AND ENVIRONMENT section below for the file format.
 | |
| 
 | |
| If the option is omitted the default file is used: B<.hpr_db.cfg>
 | |
| 
 | |
| =item B<-[no]dry-run>
 | |
| 
 | |
| Controls whether the program runs in a mode where it performs database
 | |
| updates. When enabled the details of the updates to be performed are shown,
 | |
| otherwise the updates are applied. The default B<-nodry-run> allows the
 | |
| program to perform the changes.
 | |
| 
 | |
| =item B<-[no]verbose>
 | |
| 
 | |
| Normally very little is reported by the script, although details of errors
 | |
| are reported. When B<-verbose> is selected more information
 | |
| about the number of rows needing work, the updates performed (or which would
 | |
| have been performed) and how many changes were made is reported.
 | |
| 
 | |
| =item B<-field=FIELDNAME>
 | |
| 
 | |
| This option defines the database field name to be converted. The permitted
 | |
| names are B<title>, B<summary>, B<tags> and B<notes> and the table is asumed
 | |
| to be B<eps>. If the option is not provided the default field B<title> will be
 | |
| used.
 | |
| 
 | |
| =item B<-skip=N>
 | |
| 
 | |
| This option defines the number of database rows to skip when processing the
 | |
| selected field. If omitted then no rows are skipped. The option is useful to
 | |
| allow the work to be split into manageable batches, in conjunction with the
 | |
| B<-limit=N> option below.
 | |
| 
 | |
| =item B<-limit=N>
 | |
| 
 | |
| This option defines the number of database rows to work on when processing the
 | |
| selected field. If omitted then all rows are processed (after any skip defined
 | |
| with te B<-skip=N> option). The option is useful to allow the work to split
 | |
| into manageable batches, in conjunction with the B<-skip=N> option above.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| =head2 OVERVIEW
 | |
| 
 | |
| The script is designed to repair the HPR MySQL (MariaDB) database which holds
 | |
| show metadata. The database was created with 'latin1' encoding, and was later
 | |
| changed to use UTF-8. However, no action was taken to ensure the PHP software
 | |
| managing the database also used UTF-8. This meant that the 'latin1' encoded data
 | |
| was still being generated as Unicode UTF-8 data was being added, and was being
 | |
| rendered in the expected way, while there was little or no UTF-8 data being
 | |
| stored.
 | |
| 
 | |
| The PHP deficiencies were rectified in April 2023 but this meant that all
 | |
| non-ASCII characters stored in the database before that were rendered
 | |
| incorrectly. The solution was to convert all 'latin1' non-ASCII data into
 | |
| UTF-8, and that is what this script does.
 | |
| 
 | |
| Detecting non ASCII in database fields was performed with the following SQL:
 | |
| 
 | |
|     SELECT id,field FROM eps WHERE field <> CONVERT(field USING ASCII) ORDER BY id
 | |
| 
 | |
| This is used to generate a list of all rows which might need conversion to
 | |
| UTF-8. However, the test is only whether there is non-ASCII data in the row.
 | |
| 
 | |
| Ideally, the conversion could have been performed entirely within the database
 | |
| with SQL such as the following (for each field):
 | |
| 
 | |
|     UPDATE eps SET field = CONVERT(binary CONVERT(field USING latin1) USING utf8)
 | |
|     WHERE field <> CONVERT(field USING ASCII);
 | |
| 
 | |
| However, the conversion to UTF-8 fails when the field already contains such
 | |
| characters, stopping the query.
 | |
| 
 | |
| MySQL and MariaDB are capable of trapping errors (like using B<try/catch> in
 | |
| various programming languages), but only in stored procedures. It was felt to
 | |
| be undesirable to create stored procedures on the HPR database since this was
 | |
| only possible through B<phpMyAdmin> which is due to be phased out.
 | |
| 
 | |
| This script was written to enable the catching of errors instead.
 | |
| 
 | |
| =head2 SCRIPT DESIGN
 | |
| 
 | |
| The main loop returns all rows with non-ASCII characters in the field being
 | |
| processed. For each row an 'UPDATE' query is performed using the 'id' field
 | |
| (episode number) to select it:
 | |
| 
 | |
|     UPDATE eps SET field = CONVERT(BINARY CONVERT(field USING latin1) USING utf8)
 | |
|     WHERE id = value
 | |
| 
 | |
| This is performed inside a B<try/catch> statement so that if the query fails
 | |
| it does not stop the script. Successes and failures are logged.
 | |
| 
 | |
| This algorithm is fairly slow, particularly for the 'notes' field which has
 | |
| the most (nearly 600) non-ASCII rows. However, it seems to work as desired.
 | |
| 
 | |
| The B<-skip=N> and B<-limit=N> options allow control over the conversion
 | |
| process such that the work can be done in batches.
 | |
| 
 | |
| Note that the log file used by the script is called B<convert_latin1.log>. It
 | |
| is appended to on every run. The file name can only be changed by editing the
 | |
| script.
 | |
| 
 | |
| =head1 DIAGNOSTICS
 | |
| 
 | |
| A list of every error and warning message that the application can generate
 | |
| (even the ones that will "never happen"), with a full explanation of each
 | |
| problem, one or more likely causes, and any suggested remedies. If the
 | |
| application generates exit status codes (e.g. under Unix) then list the exit
 | |
| status associated with each error.
 | |
| 
 | |
| 
 | |
| =head1 CONFIGURATION AND ENVIRONMENT
 | |
| 
 | |
| The script obtains the credentials it requires to open the HPR database from
 | |
| a configuration file. The name of the file it expects is B<.hpr_db.cfg> in the
 | |
| directory holding the script. This can be changed by use of the
 | |
| B<-configuration=FILE> option as described above.
 | |
| 
 | |
| The configuration file format is as follows:
 | |
| 
 | |
|  <database>
 | |
|      host = 127.0.0.1
 | |
|      port = PORT
 | |
|      name = DATABASE
 | |
|      user = USERNAME
 | |
|      password = PASSWORD
 | |
|  </database>
 | |
| 
 | |
| =head1 DEPENDENCIES
 | |
| 
 | |
|     Config::General
 | |
|     DBI
 | |
|     Data::Dumper
 | |
|     Getopt::Long
 | |
|     Log::Handler
 | |
|     Log::Handler::Output::File
 | |
|     Pod::Usage
 | |
|     SQL::Abstract
 | |
| 
 | |
| The script uses the experimental B<try> feature and disables the warning that
 | |
| this feature generates. Note that this feature is only available in Perl
 | |
| versions at 5.34.0 or above (the script was developed under v5.36.0).
 | |
| 
 | |
| =head1 BUGS AND LIMITATIONS
 | |
| 
 | |
| There are no known bugs in this module.
 | |
| Please report problems to Dave Morriss  (Dave.Morriss@gmail.com) Patches are
 | |
| welcome.
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| Dave Morriss  (Dave.Morriss@gmail.com)
 | |
| 
 | |
| =head1 LICENCE AND COPYRIGHT
 | |
| 
 | |
| Copyright (c) 2023 Dave Morriss  (Dave.Morriss@gmail.com). All rights reserved.
 | |
| 
 | |
| This module is free software; you can redistribute it and/or
 | |
| modify it under the same terms as Perl itself. See perldoc perlartistic.
 | |
| 
 | |
| This program is distributed in the hope that it will be useful,
 | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| #}}}
 | |
| 
 | |
| # [zo to open fold, zc to close]
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 | |
| 
 |