hpr-tools/Database/fix_urls

709 lines
20 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env perl
#===============================================================================
#
# FILE: fix_urls
#
# USAGE: ./fix_urls [-help] [-doc] [-debug=N] [-dry-run] [-config=FILE]
# [-limit=N] tablename
#
# DESCRIPTION: Scans the HPR database to find URLs which do not have the
# 'https:' scheme and correct them.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: Had to revert to MySQL because of a problem with DBD::MariaDB
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.5
# CREATED: 2021-12-29 13:57:28
# REVISION: 2022-02-28 10:51:27
#
#===============================================================================
use v5.16;
use strict;
use warnings;
use utf8;
use feature qw{ postderef say signatures state };
no warnings qw{ experimental::postderef experimental::signatures };
use Getopt::Long;
use Pod::Usage;
use Config::General;
use DBI;
use SQL::Abstract;
use Log::Handler;
use Log::Handler::Output::File;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.5';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Database";
my $configfile = "$basedir/.hpr_db.cfg";
my $logfile = "$basedir/${PROG}.log";
my $dbh;
#<<< do not let perltidy change formatting
#
# The database tables we'll search and how to do what we need.
#
# main_key - the name of a table (with a hashref as the value)
# index the database field we're using to find a row
# fields fields we may need to edit (an arrayref)
# logfields fields we will log (an arrayref)
# mainQ the query that finds all rows needing work
# rowQ a query for finding a row using the index value
#
my %tables = (
'comments' => {
index => 'id',
fields => [qw{comment_title comment_text}],
logfields => [qw{eps_id comment_title}],
mainQ => q{
select id
from comments
where comment_text regexp 'http://[^[:space:]]'
or comment_title regexp 'http://[^[:space:]]'
order by id},
rowQ => q{select * from comments where id = ?},
},
'eps' => {
index => 'id',
fields => [qw{title summary notes}],
logfields => [qw{title date}],
mainQ => q{
select id
from eps
where title regexp 'http://[^[:space:]]'
or summary regexp 'http://[^[:space:]]'
or notes regexp 'http://[^[:space:]]'
order by id
},
rowQ => q{select * from eps where id = ?},
},
'hosts' => {
index => 'hostid',
fields => [qw{profile}],
logfields => [qw{host email}],
mainQ => q{
select hostid
from hosts
where profile regexp 'http://[^[:space:]]'
order by hostid},
rowQ => q{select * from hosts where hostid = ?},
},
'miniseries' => {
index => 'id',
fields => [qw{description}],
logfields => [qw{name}],
mainQ => q{
select id
from miniseries
where description regexp 'http://[^[:space:]]'
order by id},
rowQ => q{select * from miniseries where id = ?},
},
);
#>>>
my @table_names = keys(%tables);
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
my $DEF_DEBUG = 0;
my $DEF_LIMIT = 0;
#
# Process options
#
my %options;
Options( \%options );
#
# Default help
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 0, -exitval => 1 )
if ( $options{'help'} );
#
# Full documentation if requested with -doc
#
pod2usage(
-msg => "$PROG version $VERSION\n",
-verbose => 2,
-exitval => 1,
-noperldoc => 0,
) if ( $options{'doc'} );
#
# Collect options
#
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEF_DEBUG );
my $cfgfile
= ( defined( $options{config} ) ? $options{config} : $configfile );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
my $verbose = ( defined( $options{'verbose'} ) ? $options{'verbose'} : 0 );
my $limit = ( defined( $options{limit} ) ? $options{limit} : $DEF_LIMIT );
$limit = abs($limit);
#
# Sanity check
#
die "Unable to find config file '$cfgfile'\n" unless ( -e $cfgfile );
#
# Table choice
#
my $table = shift;
die "Database table not specified\n" unless $table;
die "Not a valid table name: $table\n"
unless ( grep { $_ =~ /^$table$/ } @table_names );
#
# Load configuration data
#
my $conf = new Config::General(
-ConfigFile => $cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1
);
my %config = $conf->getall();
#-------------------------------------------------------------------------------
# Connect to the database
#-------------------------------------------------------------------------------
my $dbhost = $config{database}->{host} // '127.0.0.1';
my $dbport = $config{database}->{port} // 3306;
my $dbname = $config{database}->{name};
my $dbuser = $config{database}->{user};
my $dbpwd = $config{database}->{password};
#$dbh = DBI->connect( "DBI:MariaDB:host=$dbhost;port=$dbport;database=$dbname",
# $dbuser, $dbpwd, { AutoCommit => 1, RaiseError => 1 } )
# or die $DBI::errstr;
$dbh = DBI->connect( "dbi:mysql:host=$dbhost;port=$dbport;database=$dbname",
$dbuser, $dbpwd, { AutoCommit => 1 } )
or die $DBI::errstr;
#
# Enable client-side UTF8
#
$dbh->{mysql_enable_utf8} = 1;
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $log = Log::Handler->new();
$log->add(
file => {
timeformat => "%Y-%m-%d %H:%M:%S",
filename => $logfile,
maxlevel => 7,
minlevel => 0,
utf8 => 1,
}
);
$log->info("Configuration file $cfgfile");
$log->info("Processing table $table");
$log->info("Dry-run mode") if ($dry_run);
process_table( $dbh, \%tables, $table, $dry_run, $limit );
exit;
#=== FUNCTION ================================================================
# NAME: process_table
# PURPOSE: Processes a table to change any instances of 'http://' to
# 'https://'
# PARAMETERS: $dbh open database handle
# $rtables reference to the %tables hash
# $table name of the table being processed
# $dry_run Boolean showing whether this is dry run or not
# $limit number of updates to apply, 0 = no limit
# RETURNS:
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub process_table {
my ( $dbh, $rtables, $table, $dry_run, $limit ) = @_;
my ( $sth1, $sth2, $h );
my ( $work_query, @work, $row_query, @fields, @logfields, $new, $index );
my ( $sql, $stmt, @bind, %fieldvals, %where );
my ( $workcount, $count, $updates, $logfmt );
#
# Prepare to build SQL
#
$sql = SQL::Abstract->new;
$count = 0;
$updates = 0;
#
# Find any rows in need of work as an array/list of the index values. The
# selectall_arrayref returns a reference to an array containing arrayrefs,
# so the 'map' flattens that structure.
#
$work_query = $rtables->{$table}->{mainQ};
@work = map { $_->[0] } @{ $dbh->selectall_arrayref($work_query) };
$workcount = scalar(@work);
printf "Number of rows requiring work: %d\n", $workcount if $verbose;
_debug( $DEBUG >= 1, "Number of rows requiring work: $workcount" )
unless $verbose;
_debug( $DEBUG >= 2,
"Rows requiring work: " . join( ",", @work ) . "\n" );
#
# If there's nothing to do say so and leave
#
unless (@work) {
print "Nothing to do to table '$table'!\n";
return;
}
#
# Pull configuration values from the hash
#
$row_query = $rtables->{$table}->{rowQ};
@fields = @{ $rtables->{$table}->{fields} };
@logfields = @{ $rtables->{$table}->{logfields} };
$index = $rtables->{$table}->{index};
_debug( $DEBUG >= 3, "\$row_query = $row_query" );
_debug( $DEBUG >= 3, "\@fields = " . join( ",", @fields ) );
_debug( $DEBUG >= 3, "\$index = $index" );
#
# Prepare for logging by making a format string for sprintf
#
$logfmt = 'Updated row with ';
$logfmt .= join( ", ", map {"$_ = '%s'"} $index, @logfields );
_debug( $DEBUG >= 3, "\$logfmt = $logfmt" );
#
# Set up query for the next eligible row
#
$sth1 = $dbh->prepare($row_query) or die $DBI::errstr;
if ( $dbh->err ) {
die $dbh->errstr;
}
#
# Loop through rows needing work
#
foreach my $pkey (@work) {
#
# The row is indexed by the per-table key
#
$sth1->execute($pkey) or die $DBI::errstr;
if ( $dbh->err ) {
die $dbh->errstr;
}
#
# Grab the row (there must be only one with this index)
#
if ( $h = $sth1->fetchrow_hashref ) {
#
# Set up the 'where' options for SQL::Abstract
#
%where = ( $index => { '=', $pkey } );
_debug( $DEBUG >= 3, Dumper( \%where ) );
#
# Work on the fields we know might contain HTML
#
for my $field (@fields) {
#
# Perform the change
#
( $new = $h->{$field} ) =~ s{\bhttp://(\S)}{https://$1}ig;
#
# Set up SQL::Abstract parameters
#
$fieldvals{$field} = $new;
}
#
# Use SQL::Abstract to make the statement and the bind parameters
#
( $stmt, @bind ) = $sql->update( $table, \%fieldvals, \%where );
#
# Do the change or report it depending on dry-run mode
#
unless ($dry_run) {
$sth2 = $dbh->prepare($stmt) or die $DBI::errstr;
$sth2->execute(@bind) or die $DBI::errstr;
$log->info( sprintf( $logfmt, $pkey, @{$h}{@logfields} ) );
printf $logfmt. "\n", $pkey, @{$h}{@logfields} if $verbose;
$updates++;
}
else {
print "No change made in dry-run mode\n";
if ($verbose) {
print "SQL: $stmt\n";
print "Bind> ", join( "\nBind> ", @bind ), "\n";
print '-' x 80, "\n";
}
}
}
#
# Apply the limit if appropriate
#
$count++;
unless ( $limit == 0 ) {
last if ( $count >= $limit );
}
}
unless ($dry_run) {
$log->info("Number of updates = $updates");
if ($verbose) {
print "Number of updates = $updates\n";
printf "Remaining rows needing attention: %d\n",
$workcount - $updates;
}
}
}
#=== FUNCTION ================================================================
# NAME: concat
# PURPOSE: Reimplementation of join but with any undefined or empty
# arguments removed
# PARAMETERS: $sep The string to be used to separate elements in
# the result
# [variable args] Any number of arguments to be joined together
# with the separator
# RETURNS: The concatenated arguments
# DESCRIPTION: Giving 'join' an array that may contain undefined elements will
# result in empty results in the output string and error
# messages as the undefined elements are processed. Giving it
# empty string elements will result in dangling separators in
# the output. This routine removes the undefined and empty
# elements before joining the rest.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub concat {
my $sep = shift;
my @args = grep { defined($_) && length($_) > 0 } @_;
return join( $sep, @args );
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $message Message to print
# RETURNS: Nothing
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
# newline and then adds one in the 'print' to the caller doesn't
# have to bother. Prepends the message with 'D> ' to show it's
# a debug message.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my ( $active, $message ) = @_;
chomp($message);
print "D> $message\n" if $active;
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "doc", "debug=i", "verbose!",
"dry-run!", "config=s", "limit=i",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
fix_urls - Fixes 'http://' urls in the HPR database
=head1 VERSION
This documentation refers to fix_urls version 0.0.5
=head1 USAGE
./fix_urls [-help] [-doc] [-debug=N] [-[no]dry-run] [-[no]verbose]
[-config=FILE] [-limit=N] tablename
fix_urls -help
fix_urls -doc
fix_urls -limit=10 -dry-run comments
fix_urls -limit=10 comments
fix_urls -limit=10 -verbose comments
fix_urls -config=.hpr_livedb.cfg -debug=1 -dry-run -limit=1 comments
=head1 REQUIRED ARGUMENTS
=over 4
=item B<tablename>
The mandatory argument required by the script is the name of the table to
process. The choices are:
comments
eps
hosts
miniseries
=back
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message describing the usage of the program, and then exits.
=item B<-doc>
Displays the entirety of the documentation (using a pager), and then exits. To
generate a PDF version use:
pod2pdf fix_urls --out=fix_urls.pdf
=item B<-debug=N>
Selects a level of debugging. Debug information consists of a line or series
of lines prefixed with the characters 'D>':
=over 4
=item B<0>
No debug output is generated: this is the default
=item B<1>
Displays the number of updates required in a table.
=item B<2>
As for level 1, and also displays the primary key values of all rows requiring
work in the table.
=item B<3>
As for level 2, and also displays some internal values for verification.
=back
=item B<-[no]dry-run>
Controls whether the program runs in a mode where it performs database
updates. When enabled the details of the updates to be performed are shown,
otherwise the updates are applied. The default B<-nodry-run> allows the
program to perform the changes.
=item B<-[no]verbose>
Normally very little is reported by the script, although details of which rows
have been changed are logged. When B<-verbose> is selected more information
about the number of rows needing work, the updates performed (or which would
have been performed) and how many changes were made is reported.
=item B<-limit=N>
This option allows the number of rows in the chosen table to be limited during
a B<-dry-run> pass or an update pass. If omitted, or if a value of zero is
given, then all eligible rows are processed.
=item B<-config=FILE>
This option allows an alternative configuration file to be used. This file
defines the location of the database, its port, its name and the username and
password to be used to access it. This feature was added to allow the script
to access alternative databases or the live database over an SSH tunnel.
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
If the option is omitted the default file is used: B<.hpr_db.cfg>
=back
=head1 DESCRIPTION
The B<fix_urls> script performs edits on fields in tables in the HPR database.
As written, the purpose is to change all occurrences of 'http://' to
'https://', though it could be used for other tasks. It is not designed to be
easily changeable from one to another, but the code can be changed to do this
if needed.
A single table is processed in each run, and the number of rows may be limited
if required by using the B<-limit=N> option.
The eligible tables are defined in a hash structure B<%tables> which defines
the fields to be processed and the queries needed to search for all rows
requiring work and to get a particular row to work on. It also defines which
fields are to be reported in the log file.
A log file is appended to when the script is run, which has the name
B<fix_urls.log> in the same directory as the script.
=head1 DIAGNOSTICS
=over 4
=item B<Unable to find config file '...'>
Type: fatal
The configuration file in the B<-config=FILE> option cannot be found.
=item B<Database table not specified>
Type: fatal
The mandatory table name argument was not provided.
=item B<Not a valid table name: ...>
Type: fatal
The mandatory table name argument specified an unknown table name.
=item B<[DBI error messages]>
Type: fatal
Generated when a database interface error has been detected, such as failure
to connect to the database or failure to prepare or execute a query.
=back
=head1 CONFIGURATION AND ENVIRONMENT
The script obtains the credentials it requires to open a local copy of the HPR
database from a configuration file. The name of the file it expects is
B<.hpr_db.cfg> in the directory holding the script. This configuration file
can be overridden using the B<-config=FILE> option as described above.
The configuration file format is as follows:
<database>
host = 127.0.0.1
port = PORT
name = DATABASE
user = USERNAME
password = PASSWORD
</database>
=head1 DEPENDENCIES
Config::General
DBI
Data::Dumper
Getopt::Long
Log::Handler
Pod::Usage
SQL::Abstract
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to <Dave Morriss> (<Dave.Morriss@gmail.com>)
Patches are welcome.
=head1 AUTHOR
<Dave Morriss> (<Dave.Morriss@gmail.com>)
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2021-2020 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker