forked from HPR/hpr-tools
		
	
		
			
	
	
		
			658 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			658 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | #!/usr/bin/env perl | ||
|  | #=============================================================================== | ||
|  | # | ||
|  | #         FILE: convert_latin1 | ||
|  | # | ||
#        USAGE: ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run]
#               [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N]
|  | # | ||
|  | #  DESCRIPTION: Find and convert 'latin1' characters to 'utf8' in the HPR | ||
|  | #               database | ||
|  | # | ||
|  | #      OPTIONS: --- | ||
|  | # REQUIREMENTS: --- | ||
|  | #         BUGS: --- | ||
|  | #        NOTES: --- | ||
|  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||
|  | #      VERSION: 0.1.2 | ||
|  | #      CREATED: 2023-05-04 10:07:04 | ||
|  | #     REVISION: 2023-05-08 12:15:49 | ||
|  | # | ||
|  | #=============================================================================== | ||
|  | 
 | ||
|  | use v5.16; | ||
|  | use strict; | ||
|  | use warnings; | ||
|  | #use utf8; | ||
|  | 
 | ||
|  | # Using experimental features, some of which require warnings to be turned off | ||
|  | use feature qw{ postderef say signatures state try }; | ||
|  | no warnings qw{ | ||
|  |     experimental::postderef | ||
|  |     experimental::signatures | ||
|  |     experimental::try | ||
|  | }; | ||
|  | 
 | ||
|  | use Getopt::Long; | ||
|  | use Pod::Usage; | ||
|  | 
 | ||
|  | use Config::General; | ||
|  | 
 | ||
|  | #use Encode qw( encode decode is_utf8 ); | ||
|  | #use Try::Tiny; | ||
|  | #use TryCatch; | ||
|  | 
 | ||
|  | use SQL::Abstract; | ||
|  | use DBI; | ||
|  | 
 | ||
|  | use Log::Handler; | ||
|  | use Log::Handler::Output::File; | ||
|  | 
 | ||
|  | use Data::Dumper; | ||
|  | 
 | ||
#
# Release identifier; it is bumped by hand whenever the script changes
#
our $VERSION = '0.1.2';

#
# Split the invocation path into the script's own name and its directory,
# falling back to the current directory when no directory part is present
#
my $PROG = $0;
$PROG =~ s|.*/||mx;
my $DIR = $0;
$DIR =~ s|/?[^/]*$||mx;
$DIR ||= '.';

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Default locations of the configuration file and the log file
#
my $basedir    = $ENV{HOME} . '/HPR/Database';
my $configfile = $basedir . '/.hpr_db.cfg';
my $logfile    = $basedir . '/' . $PROG . '.log';

#
# File-wide working variables: database and statement handles, the current
# row, the SQL text and the count of rows seen so far
#
my $dbh;
my $sth1;
my $sth2;
my $h1;
my $sql;
my $utf8;
my $viewed;
|  | 
 | ||
|  | # | ||
|  | # Map of latin1 characters with their Unicode equivalents {{{ | ||
|  | # | ||
|  | # Commented out 2023-05-10 since no longer wanted | ||
|  | # | ||
|  | #my %map_latin1 = ( | ||
|  | #    q{€â‚¬}  => "\N{U+20AC}", | ||
|  | #    q{ÀÀ}   => "\N{U+00C0}", | ||
|  | #    q{ÁÃ}    => "\N{U+00C1}", | ||
|  | #    q{‚‚}  => "\N{U+201A}", | ||
|  | #    q{ÂÂ}   => "\N{U+00C2}", | ||
|  | #    q{ƒÆ’}   => "\N{U+0192}", | ||
|  | #    q{ÃÃ}   => "\N{U+00C3}", | ||
|  | #    q{„„}  => "\N{U+201E}", | ||
|  | #    q{ÄÄ}   => "\N{U+00C4}", | ||
|  | #    q{……}  => "\N{U+2026}", | ||
|  | #    q{ÅÃ…}   => "\N{U+00C5}", | ||
|  | #    q{†â€}   => "\N{U+2020}", | ||
|  | #    q{ÆÃ†}   => "\N{U+00C6}", | ||
|  | #    q{‡â€¡}  => "\N{U+2021}", | ||
|  | #    q{ÇÇ}   => "\N{U+00C7}", | ||
|  | #    q{ˆË†}   => "\N{U+02C6}", | ||
|  | #    q{ÈÈ}   => "\N{U+00C8}", | ||
|  | #    q{‰â€°}  => "\N{U+2030}", | ||
|  | #    q{ÉÉ}   => "\N{U+00C9}", | ||
|  | #    q{ŠÅ}    => "\N{U+0160}", | ||
|  | #    q{ÊÊ}   => "\N{U+00CA}", | ||
|  | #    q{‹â€¹}  => "\N{U+2039}", | ||
|  | #    q{ËË}   => "\N{U+00CB}", | ||
|  | #    q{ŒÅ’}   => "\N{U+0152}", | ||
|  | #    q{ÌÃŒ}   => "\N{U+00CC}", | ||
|  | #    q{ÍÃ}    => "\N{U+00CD}", | ||
|  | #    q{ŽÅ½}   => "\N{U+017D}", | ||
|  | #    q{ÎÃŽ}   => "\N{U+00CE}", | ||
|  | #    q{ÏÃ}    => "\N{U+00CF}", | ||
|  | #    q{ÐÃ}    => "\N{U+00D0}", | ||
|  | #    q{‘‘}  => "\N{U+2018}", | ||
|  | #    q{ÑÑ}   => "\N{U+00D1}", | ||
|  | #    q{Չ۪}  => "\N{U+2019}", | ||
|  | #    q{ÒÃ’}   => "\N{U+00D2}", | ||
|  | #    q{““}  => "\N{U+201C}", | ||
|  | #    q{ÓÓ}   => "\N{U+00D3}", | ||
|  | #    q{”â€}   => "\N{U+201D}", | ||
|  | #    q{ÔÔ}   => "\N{U+00D4}", | ||
|  | #    q{••}  => "\N{U+2022}", | ||
|  | #    q{ÕÕ}   => "\N{U+00D5}", | ||
|  | #    q{––}  => "\N{U+2013}", | ||
|  | #    q{ÖÖ}   => "\N{U+00D6}", | ||
|  | #    q{——}  => "\N{U+2014}", | ||
|  | #    q{××}   => "\N{U+00D7}", | ||
|  | #    q{˜Ëœ}   => "\N{U+02DC}", | ||
|  | #    q{ØÃ˜}   => "\N{U+00D8}", | ||
|  | #    q{™â„¢}  => "\N{U+2122}", | ||
|  | #    q{ÙÙ}   => "\N{U+00D9}", | ||
|  | #    q{šÅ¡}   => "\N{U+0161}", | ||
|  | #    q{ÚÚ}   => "\N{U+00DA}", | ||
|  | #    q{݉ۼ}  => "\N{U+203A}", | ||
|  | #    q{ÛÛ}   => "\N{U+00DB}", | ||
|  | #    q{œÅ“}   => "\N{U+0153}", | ||
|  | #    q{ÜÜ}   => "\N{U+00DC}", | ||
|  | #    q{ÝÃ}    => "\N{U+00DD}", | ||
|  | #    q{žÅ¾}   => "\N{U+017E}", | ||
|  | #    q{ÞÞ}   => "\N{U+00DE}", | ||
|  | #    q{ŸÅ¸}   => "\N{U+0178}", | ||
|  | #    q{ßß}   => "\N{U+00DF}", | ||
|  | #    q{Â}     => "\N{U+00A0}", | ||
|  | #    q{àÃ}    => "\N{U+00E0}", | ||
|  | #    q{¡Â¡}   => "\N{U+00A1}", | ||
|  | #    q{áá}   => "\N{U+00E1}", | ||
|  | #    q{¢Â¢}   => "\N{U+00A2}", | ||
|  | #    q{ââ}   => "\N{U+00E2}", | ||
|  | #    q{£Â£}   => "\N{U+00A3}", | ||
|  | #    q{ãã}   => "\N{U+00E3}", | ||
|  | #    q{¤Â¤}   => "\N{U+00A4}", | ||
|  | #    q{ää}   => "\N{U+00E4}", | ||
|  | #    q{¥Â¥}   => "\N{U+00A5}", | ||
|  | #    q{åÃ¥}   => "\N{U+00E5}", | ||
|  | #    q{¦Â¦}   => "\N{U+00A6}", | ||
|  | #    q{æÃ¦}   => "\N{U+00E6}", | ||
|  | #    q{§Â§}   => "\N{U+00A7}", | ||
|  | #    q{çç}   => "\N{U+00E7}", | ||
|  | #    q{¨Â¨}   => "\N{U+00A8}", | ||
|  | #    q{èè}   => "\N{U+00E8}", | ||
|  | #    q{©Â©}   => "\N{U+00A9}", | ||
|  | #    q{éé}   => "\N{U+00E9}", | ||
|  | #    q{ªÂª}   => "\N{U+00AA}", | ||
|  | #    q{êê}   => "\N{U+00EA}", | ||
|  | #    q{«Â«}   => "\N{U+00AB}", | ||
|  | #    q{ëë}   => "\N{U+00EB}", | ||
|  | #    q{¬Â¬}   => "\N{U+00AC}", | ||
|  | #    q{ìì}   => "\N{U+00EC}", | ||
|  | #    q{Â}   => "\N{U+00AD}", | ||
|  | #    q{íÃ}   => "\N{U+00ED}", | ||
|  | #    q{®Â®}   => "\N{U+00AE}", | ||
|  | #    q{îî}   => "\N{U+00EE}", | ||
|  | #    q{¯Â¯}   => "\N{U+00AF}", | ||
|  | #    q{ïï}   => "\N{U+00EF}", | ||
|  | #    q{°Â°}   => "\N{U+00B0}", | ||
|  | #    q{ðð}   => "\N{U+00F0}", | ||
|  | #    q{±Â±}   => "\N{U+00B1}", | ||
|  | #    q{ññ}   => "\N{U+00F1}", | ||
|  | #    q{²Â²}   => "\N{U+00B2}", | ||
|  | #    q{òò}   => "\N{U+00F2}", | ||
|  | #    q{³Â³}   => "\N{U+00B3}", | ||
|  | #    q{óó}   => "\N{U+00F3}", | ||
|  | #    q{´Â´}   => "\N{U+00B4}", | ||
|  | #    q{ôô}   => "\N{U+00F4}", | ||
|  | #    q{µÂµ}   => "\N{U+00B5}", | ||
|  | #    q{õõ}   => "\N{U+00F5}", | ||
|  | #    q{¶Â¶}   => "\N{U+00B6}", | ||
|  | #    q{öö}   => "\N{U+00F6}", | ||
|  | #    q{·Â·}   => "\N{U+00B7}", | ||
|  | #    q{÷÷}   => "\N{U+00F7}", | ||
|  | #    q{¸Â¸}   => "\N{U+00B8}", | ||
|  | #    q{øÃ¸}   => "\N{U+00F8}", | ||
|  | #    q{¹Â¹}   => "\N{U+00B9}", | ||
|  | #    q{ùù}   => "\N{U+00F9}", | ||
|  | #    q{ºÂº}   => "\N{U+00BA}", | ||
|  | #    q{úú}   => "\N{U+00FA}", | ||
|  | #    q{»Â»}   => "\N{U+00BB}", | ||
|  | #    q{ûû}   => "\N{U+00FB}", | ||
|  | #    q{¼Â¼}   => "\N{U+00BC}", | ||
|  | #    q{üü}   => "\N{U+00FC}", | ||
|  | #    q{½Â½}   => "\N{U+00BD}", | ||
|  | #    q{ýý}   => "\N{U+00FD}", | ||
|  | #    q{¾Â¾}   => "\N{U+00BE}", | ||
|  | #    q{þþ}   => "\N{U+00FE}", | ||
|  | #    q{¿Â¿}   => "\N{U+00BF}", | ||
|  | #    q{ÿÿ}   => "\N{U+00FF}", | ||
|  | #); | ||
|  | 
 | ||
|  | # | ||
|  | # Build a regex from all of the hash keys | ||
|  | # | ||
|  | #my $regex = join('|',sort(keys(%map_latin1))); | ||
|  | #$regex=qr{$regex}; | ||
|  | 
 | ||
|  | #}}} | ||
|  | 
 | ||
#
# Emit UTF-8 on both standard output streams
#
for my $stream ( \*STDOUT, \*STDERR ) {
    binmode $stream, ":encoding(UTF-8)";
}

#-------------------------------------------------------------------------------
# Options and arguments {{{
#-------------------------------------------------------------------------------
#
# Parse the command line into %options
#
my %options;
Options( \%options );

#
# -help: brief usage message, then exit
#
if ( $options{'help'} ) {
    pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}

#
# -doc: the complete documentation, then exit
#
if ( $options{'doc'} ) {
    pod2usage(
        -msg     => "$PROG version $VERSION\n",
        -verbose => 2,
        -exitval => 1
    );
}

#
# Copy the option values into variables, applying defaults for any that were
# not given
#
my $cfgfile = $options{config}    // $configfile;
my $dry_run = $options{'dry-run'} // 0;
my $verbose = $options{verbose}   // 0;
my $field   = $options{field};
my $skip    = $options{skip}  // 0;
my $limit   = $options{limit} // 0;
 | ||
|  | # }}} | ||
|  | 
 | ||
#
# Sanity checks
#
# The configuration file must exist. The field name, if given, is lower-cased
# and must be exactly one of the permitted column names; without the option
# the 'title' field is processed.
#
# The field name is interpolated directly into the SQL built later, so the
# match is anchored at both ends: an unanchored match would accept any value
# that merely contains one of the names (e.g. 'subtitle' or 'title; DROP ...').
#
die "Unable to find $cfgfile\n" unless ( -e $cfgfile );
if ($field) {
    $field = lc($field);
    die "Invalid value for -field=FIELD\n"
        unless ( $field =~ /\A(?:title|summary|tags|notes)\z/ );
}
else {
    $field = 'title';
}
|  | 
 | ||
#-------------------------------------------------------------------------------
# Load configuration data
#-------------------------------------------------------------------------------
#
# The constructor is called as a class method; the indirect-object form
# ('new Config::General(...)') is ambiguous to the parser and is rejected
# outright once the 'indirect' feature is disabled (e.g. by 'use v5.36').
#
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
#
# Connection details come from the <database> section of the configuration
# file; host and port have defaults, the rest must be supplied there.
#
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
    $dbuser, $dbpwd, { AutoCommit => 1 } )
    or die $DBI::errstr;

#
# Enable client-side UTF8
#
# NOTE(review): the DBD::mysql documentation says this attribute only tells
# the server to treat the connection as UTF-8 when it is passed to connect();
# set afterwards, as here, a 'SET NAMES utf8' would be needed for the same
# effect. Left unchanged - confirm which behaviour is wanted.
#
$dbh->{mysql_enable_utf8} = 1;
|  | 
 | ||
#-------------------------------------------------------------------------------
# Set up logging to the log file; the layout is the default one apart from the
# date format
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();

$log->add(
    file => {
        filename   => $logfile,
        timeformat => "%Y-%m-%d %H:%M:%S",
        minlevel   => 0,
        maxlevel   => 7,
        utf8       => 1,
    }
);

#
# Record the settings in force for this run; the optional ones are only
# mentioned when they are switched on
#
my @settings = (
    "---- Running version $VERSION",
    "Configuration file $cfgfile",
    "Processing field '$field'",
);
push( @settings, "Skipping $skip non-ASCII rows" ) if $skip;
push( @settings, "Update limit is $limit" )         if $limit;
push( @settings, "Dry-run mode" )                   if $dry_run;

for my $message (@settings) {
    $log->info($message);
}
|  | 
 | ||
#
# Adjust limit
#
# The main loop counts every row it fetches, including the ones it skips, and
# stops when that count reaches $limit, so a requested limit has to be
# extended by the skip count. This is only done when a limit was actually
# requested: zero means "no limit", and adding the skip count to it would make
# the loop stop as soon as the skipped rows had been passed, without updating
# anything.
#
$limit += $skip if $limit;
|  | 
 | ||
#-------------------------------------------------------------------------------
# Query the episodes whose chosen field differs from its own conversion to
# ASCII (the test used here for "contains non-ASCII"), in episode order
#-------------------------------------------------------------------------------
$sql = sprintf(
    q{SELECT id,%s FROM eps WHERE %s <> CONVERT(%s USING ASCII) ORDER BY id},
    ($field) x 3
);

$sth1 = $dbh->prepare($sql) or die $DBI::errstr;

# A failure to run the query is reported as a warning only
$sth1->execute;
warn $dbh->errstr if $dbh->err;

#
# SQL::Abstract builds the 'WHERE' clause for each update; the rest of the
# 'UPDATE' statement is the same for every row and is built once here
#
my $sqla = SQL::Abstract->new;

my $stmt1 = sprintf(
    q{UPDATE eps SET %s = CONVERT(BINARY CONVERT(%s USING latin1) USING utf8)},
    ($field) x 2
);
|  | 
 | ||
#-------------------------------------------------------------------------------
# Walk through the rows returned by the main query and try to convert the
# field in each one
#-------------------------------------------------------------------------------
$viewed = 0;
while ( $h1 = $sth1->fetchrow_hashref ) {
    $viewed++;

    # Rows inside the skipped part are counted but otherwise ignored
    next if $viewed <= $skip;

    #
    # Complete the update statement with a 'WHERE' clause selecting this
    # episode by its id
    #
    my $id = $h1->{id};
    my ( $where_sql, @bind ) = $sqla->where( { id => $id } );
    my $stmt = $stmt1 . $where_sql;

    #
    # Normal mode: run the update. A failing statement is trapped and logged
    # so that the remaining rows are still processed.
    #
    unless ($dry_run) {
        $sth2 = $dbh->prepare($stmt) or die $DBI::errstr;

        try {
            $sth2->execute(@bind)
                or die $DBI::errstr;
            $log->info("Updated $field field for row $id");
        }
        catch ($e) {
            $log->info("Failed to update $field field for row $id");
            $log->info("Error: $e");
        }

        next;
    }

    #
    # Dry-run mode: only report what would have been done. With -verbose the
    # episode number and the field contents come first (the contents are left
    # out for the 'notes' field).
    #
    if ($verbose) {
        my $shown = $field eq 'notes' ? '' : $h1->{$field};
        printf "[%04d] %s\n", $id, $shown;
    }

    say "SQL: ${stmt}";
    say "Arguments: ", join( ',', @bind );
}
continue {
    # Runs after every row, skipped or not: stop once the limit is reached
    if ( $limit && $viewed >= $limit ) {
        $log->info("Update limit reached");
        last;
    }
}

exit;
|  | 
 | ||
#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Parses the command line with Getopt::Long
#   PARAMETERS: $optref     Reference to the hash which receives the options
#      RETURNS: Nothing (undef in scalar context)
#  DESCRIPTION: Recognises the flags 'help' and 'doc', the negatable flags
#               'dry-run' and 'verbose', the string options 'config' and
#               'field' and the integer options 'skip' and 'limit'. If the
#               command line cannot be parsed the usage message is shown via
#               pod2usage with an exit value of 1.
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = qw(
        help     doc     dry-run! verbose!
        config=s field=s skip=i   limit=i
    );

    GetOptions( $optref, @options )
        or pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );

    return;
}
|  | 
 | ||
|  | __END__ | ||
|  | 
 | ||
|  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
|  | #  Application Documentation | ||
|  | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
|  | #{{{ | ||
|  | 
 | ||
|  | =head1 NAME | ||
|  | 
 | ||
|  | convert_latin1 - a script to convert fields in the HPR database to UTF-8 | ||
|  | 
 | ||
|  | =head1 VERSION | ||
|  | 
 | ||
|  | This documentation refers to convert_latin1 version 0.1.2 | ||
|  | 
 | ||
|  | 
 | ||
|  | =head1 USAGE | ||
|  | 
 | ||
|  |     ./convert_latin1 [-help] [-doc] [-config=FILE] [-[no]dry-run] | ||
|  |         [-[no]verbose] [-field=FIELDNAME] [-skip=N] [-limit=N] | ||
|  | 
 | ||
|  |     ./convert_latin1 -config=.hpr_livedb.cfg -verb -field=title | ||
|  |     ./convert_latin1 -config=.hpr_livedb.cfg -verb -dry-run -field=notes | ||
|  |         -limit=10 | ||
|  | 
 | ||
|  | 
 | ||
|  | =head1 OPTIONS | ||
|  | 
 | ||
|  | =over 8 | ||
|  | 
 | ||
|  | =item B<-help> | ||
|  | 
 | ||
|  | Prints a brief help message describing the usage of the program, and then exits. | ||
|  | 
 | ||
|  | =item B<-doc> | ||
|  | 
 | ||
|  | Displays the entirety of the documentation (using a pager), and then exits. To | ||
|  | generate a PDF version use: | ||
|  | 
 | ||
|  |     pod2pdf convert_latin1 --out=convert_latin1.pdf | ||
|  | 
 | ||
|  | =item B<-config=FILE> | ||
|  | 
 | ||
|  | This option allows an alternative configuration file to be used. This file | ||
|  | defines the location of the database, its port, its name and the username and | ||
|  | password to be used to access it. This feature was added to allow the script | ||
|  | to access alternative databases or the live database over an SSH tunnel. | ||
|  | 
 | ||
|  | See the CONFIGURATION AND ENVIRONMENT section below for the file format. | ||
|  | 
 | ||
|  | If the option is omitted the default file is used: B<.hpr_db.cfg> | ||
|  | 
 | ||
|  | =item B<-[no]dry-run> | ||
|  | 
 | ||
Controls whether the program runs in dry-run mode, in which no database
updates are performed. When enabled the details of the updates which would
have been made are shown, otherwise the updates are applied. The default
B<-nodry-run> allows the program to perform the changes.
|  | 
 | ||
|  | =item B<-[no]verbose> | ||
|  | 
 | ||
Normally very little is reported by the script, although details of errors
are reported. In this version B<-verbose> only has an effect in dry-run mode,
where the episode number and the current contents of the field are shown
before the SQL which would have been run (for the B<notes> field only the
episode number is shown).
|  | 
 | ||
|  | =item B<-field=FIELDNAME> | ||
|  | 
 | ||
This option defines the database field name to be converted. The permitted
names are B<title>, B<summary>, B<tags> and B<notes> and the table is assumed
to be B<eps>. If the option is not provided the default field B<title> will be
used.
|  | 
 | ||
|  | =item B<-skip=N> | ||
|  | 
 | ||
|  | This option defines the number of database rows to skip when processing the | ||
|  | selected field. If omitted then no rows are skipped. The option is useful to | ||
|  | allow the work to be split into manageable batches, in conjunction with the | ||
|  | B<-limit=N> option below. | ||
|  | 
 | ||
|  | =item B<-limit=N> | ||
|  | 
 | ||
This option defines the number of database rows to work on when processing the
selected field. If omitted then all rows are processed (after any skip defined
with the B<-skip=N> option). The option is useful to allow the work to be split
into manageable batches, in conjunction with the B<-skip=N> option above.
|  | 
 | ||
|  | =back | ||
|  | 
 | ||
|  | =head1 DESCRIPTION | ||
|  | 
 | ||
|  | =head2 OVERVIEW | ||
|  | 
 | ||
|  | The script is designed to repair the HPR MySQL (MariaDB) database which holds | ||
|  | show metadata. The database was created with 'latin1' encoding, and was later | ||
|  | changed to use UTF-8. However, no action was taken to ensure the PHP software | ||
|  | managing the database also used UTF-8. This meant that the 'latin1' encoded data | ||
|  | was still being generated as Unicode UTF-8 data was being added, and was being | ||
|  | rendered in the expected way, while there was little or no UTF-8 data being | ||
|  | stored. | ||
|  | 
 | ||
|  | The PHP deficiencies were rectified in April 2023 but this meant that all | ||
|  | non-ASCII characters stored in the database before that were rendered | ||
|  | incorrectly. The solution was to convert all 'latin1' non-ASCII data into | ||
|  | UTF-8, and that is what this script does. | ||
|  | 
 | ||
|  | Detecting non ASCII in database fields was performed with the following SQL: | ||
|  | 
 | ||
|  |     SELECT id,field FROM eps WHERE field <> CONVERT(field USING ASCII) ORDER BY id | ||
|  | 
 | ||
|  | This is used to generate a list of all rows which might need conversion to | ||
|  | UTF-8. However, the test is only whether there is non-ASCII data in the row. | ||
|  | 
 | ||
|  | Ideally, the conversion could have been performed entirely within the database | ||
|  | with SQL such as the following (for each field): | ||
|  | 
 | ||
|  |     UPDATE eps SET field = CONVERT(binary CONVERT(field USING latin1) USING utf8) | ||
|  |     WHERE field <> CONVERT(field USING ASCII); | ||
|  | 
 | ||
|  | However, the conversion to UTF-8 fails when the field already contains such | ||
|  | characters, stopping the query. | ||
|  | 
 | ||
|  | MySQL and MariaDB are capable of trapping errors (like using B<try/catch> in | ||
|  | various programming languages), but only in stored procedures. It was felt to | ||
|  | be undesirable to create stored procedures on the HPR database since this was | ||
|  | only possible through B<phpMyAdmin> which is due to be phased out. | ||
|  | 
 | ||
|  | This script was written to enable the catching of errors instead. | ||
|  | 
 | ||
|  | =head2 SCRIPT DESIGN | ||
|  | 
 | ||
|  | The main loop returns all rows with non-ASCII characters in the field being | ||
|  | processed. For each row an 'UPDATE' query is performed using the 'id' field | ||
|  | (episode number) to select it: | ||
|  | 
 | ||
|  |     UPDATE eps SET field = CONVERT(BINARY CONVERT(field USING latin1) USING utf8) | ||
|  |     WHERE id = value | ||
|  | 
 | ||
|  | This is performed inside a B<try/catch> statement so that if the query fails | ||
|  | it does not stop the script. Successes and failures are logged. | ||
|  | 
 | ||
|  | This algorithm is fairly slow, particularly for the 'notes' field which has | ||
|  | the most (nearly 600) non-ASCII rows. However, it seems to work as desired. | ||
|  | 
 | ||
|  | The B<-skip=N> and B<-limit=N> options allow control over the conversion | ||
|  | process such that the work can be done in batches. | ||
|  | 
 | ||
|  | Note that the log file used by the script is called B<convert_latin1.log>. It | ||
|  | is appended to on every run. The file name can only be changed by editing the | ||
|  | script. | ||
|  | 
 | ||
|  | =head1 DIAGNOSTICS | ||
|  | 
 | ||
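=over 8

=item B<Unable to find FILE>

The configuration file (the default one, or the one named with
B<-config=FILE>) does not exist. The script stops.

=item B<Invalid value for -field=FIELD>

The name given with B<-field=FIELDNAME> is not one of B<title>, B<summary>,
B<tags> or B<notes>. The script stops.

=item B<Failed to update FIELD field for row N>

Written to the log file, followed by a line beginning B<Error:> which holds
the message from the database, when the update of a row fails. The script
carries on with the next row.

=back

If the database connection cannot be made, or a statement cannot be prepared,
the script stops with the error message provided by DBI. An error when running
the main query is reported as a warning.

Invalid options, and the options B<-help> and B<-doc>, cause the script to
display the relevant documentation and exit with status 1.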
|  | 
 | ||
|  | 
 | ||
|  | =head1 CONFIGURATION AND ENVIRONMENT | ||
|  | 
 | ||
The script obtains the credentials it requires to open the HPR database from
a configuration file. The name of the file it expects is B<.hpr_db.cfg> in the
directory B<$HOME/HPR/Database>. This can be changed by use of the
B<-config=FILE> option as described above.
|  | 
 | ||
|  | The configuration file format is as follows: | ||
|  | 
 | ||
|  |  <database> | ||
|  |      host = 127.0.0.1 | ||
|  |      port = PORT | ||
|  |      name = DATABASE | ||
|  |      user = USERNAME | ||
|  |      password = PASSWORD | ||
|  |  </database> | ||
|  | 
 | ||
|  | =head1 DEPENDENCIES | ||
|  | 
 | ||
|  |     Config::General | ||
|  |     DBI | ||
|  |     Data::Dumper | ||
|  |     Getopt::Long | ||
|  |     Log::Handler | ||
|  |     Log::Handler::Output::File | ||
|  |     Pod::Usage | ||
|  |     SQL::Abstract | ||
|  | 
 | ||
|  | The script uses the experimental B<try> feature and disables the warning that | ||
|  | this feature generates. Note that this feature is only available in Perl | ||
|  | versions at 5.34.0 or above (the script was developed under v5.36.0). | ||
|  | 
 | ||
|  | =head1 BUGS AND LIMITATIONS | ||
|  | 
 | ||
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com). Patches are
welcome.
|  | 
 | ||
|  | =head1 AUTHOR | ||
|  | 
 | ||
|  | Dave Morriss  (Dave.Morriss@gmail.com) | ||
|  | 
 | ||
|  | =head1 LICENCE AND COPYRIGHT | ||
|  | 
 | ||
|  | Copyright (c) 2023 Dave Morriss  (Dave.Morriss@gmail.com). All rights reserved. | ||
|  | 
 | ||
|  | This module is free software; you can redistribute it and/or | ||
|  | modify it under the same terms as Perl itself. See perldoc perlartistic. | ||
|  | 
 | ||
|  | This program is distributed in the hope that it will be useful, | ||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
|  | 
 | ||
|  | =cut | ||
|  | 
 | ||
|  | #}}} | ||
|  | 
 | ||
|  | # [zo to open fold, zc to close] | ||
|  | 
 | ||
|  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | ||
|  | 
 |