commit 3c4d96db1b630d3b0fc34c6f1ec11878b2b52074
Author: Dave Morriss
Date:   Sat Nov 19 21:27:51 2022 +0000

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f284234
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,52 @@
+# Ignore vim backup and swap files
+*~
+*.swp
+# Ignore test audio files
+*.flac
+*.mp3
+*.ogg
+*.spx
+*.wav
+*.opus
+*.FLAC
+*.MP3
+*.OGG
+*.SPX
+*.WAV
+*.OPUS
+# Playing with image files, so don't want to save them
+*.png
+*.jpg
+*.gif
+*.svg
+# Odds and ends
+*.pdf
+*.json
+*.BAK
+# LibreOffice lock files
+.~lock*
+#
+*.csv
+#
+# Directories
+# -----------
+# These directories are either managed locally or are not wanted in the
+# repository.
+#
+feed_cache/
+reportedashtml/
+#
+# Files
+# -----
+# No need to track tags (ctags) files
+#
+tags
+# link to the master file
+.pdmenurc
+# SQLite Browser project file
+*.sqbpro
+# Saved HTML
+feedWatcher.html_*
+Journal.html
+README.html
+index.html
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/db_regen b/db_regen
new file mode 100755
index 0000000..5f8408d
--- /dev/null
+++ b/db_regen
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+#
+# Make a new empty database using the schema file
+#
+BASEDIR="$HOME/HPR/feed_watcher"
+DB="$BASEDIR/feedWatcher.db"
+SCHEMA="$BASEDIR/feedWatcher_schema.sql"
+
+if [[ -e $DB ]]; then
+    rm -f "$DB"
+fi
+
+sqlite3 "$DB" < "$SCHEMA"
diff --git a/feedWatcher b/feedWatcher
new file mode 100755
index 0000000..cf70737
--- /dev/null
+++ b/feedWatcher
@@ -0,0 +1,2623 @@
+#!/usr/bin/env perl
+#===============================================================================
+#
+#         FILE: feedWatcher
+#
+#        USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
+#               [-report[=title]] [-[no]check] [-out=FILE] [-json[=FILE]]
+#               [-opml[=FILE]] [-template[=FILE]] [-[no]silent] [-debug=N]
+#               [URL ...]
+#
+#  DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
+#               podcasts. Ken's script was inspired by Dann Washko's site at
+#               http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
+#               August 2012 where it was used to generate handouts.
+#
+#               The script was not developed further after 2014, but came
+#               back into use in 2020, helping to prepare podcast information
+#               for a FOSDEM visit under the heading of "Free Culture
+#               Podcasts". See the files 'Journal.adoc' (AsciiDoctor) and
+#               'Journal.html' (created with Pandoc) in the same directory as
+#               this script for details of what has been done to develop the
+#               original design.
+#
+#      OPTIONS: ---
+# REQUIREMENTS: ---
+#         BUGS: ---
+#        NOTES: ---
+#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
+#      VERSION: 0.0.15
+#      CREATED: 2013-12-25 12:40:33
+#     REVISION: 2022-11-18 22:27:35
+#
+#-------------------------------------------------------------------------------
+# Released under the terms of the GNU Affero General Public License (AGPLv3)
+# http://www.gnu.org/licenses/agpl-3.0.html
+#-------------------------------------------------------------------------------
+#
+#===============================================================================
+
+use 5.030;
+use strict;
+use warnings;
+use utf8;
+use feature qw{ postderef say signatures state };
+no warnings qw{ experimental::postderef experimental::signatures } ;
+
+use Getopt::Long;
+use Pod::Usage;
+use Config::General;
+use List::MoreUtils qw{uniq};
+use Log::Handler;
+
+use Try::Tiny;
+
+use URI;
+use Net::DNS;
+use IO::Socket;
+use LWP::UserAgent;
+use WWW::RobotRules;
+use XML::RSS::Parser;
+use XML::Feed;
+use Feed::Find;
+
+use Template;
+use Template::Filters;
+Template::Filters->use_html_entities;    # Use HTML::Entities in the template
+
+use HTML::Entities;
+
+use IO::Prompter;
+
+use JSON;
+
+use DBI;
+use DateTime::Format::SQLite;
+use DateTime::Format::ISO8601;
+use SQL::Abstract;
+
+use Data::Dumper;
+
+#
+# Version number (manually incremented)
+#
+our $VERSION = '0.0.15';
+
+#
+# Script name
+#
+( my $PROG = $0 ) =~ s|.*/||mx;
+
+#
+# Declarations
+#
+my ( @new_urls, @deletions );
+my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );
+my ( $search_target );
+my ( $sth1, $h1, $rv );
+
+my $feeds;
+
+#
+# File and directory paths
+#
+my $basedir     = "$ENV{HOME}/HPR/feed_watcher";
+my $tempdir     = "$basedir/tmp";
+my $configfile  = "$basedir/$PROG.cfg";
+my $logfile     = "$basedir/${PROG}.log";
+my $deftemplate = "$basedir/${PROG}.tpl";
+
+#
+# Hash key map to database field names (chevron comments are to stop Perltidy
+# messing with the layout)
+#
+#<<<
+my %keymap = (
+    AUTHOR            => 'author',
+    COPYRIGHT         => 'copyright',
+    DESCRIPTION       => 'description',
+    DNS               => 'dns',
+#   ENCLOSURES        => undef,
+#   ENCLOSURE_COUNT   => undef,
+    FORMAT            => 'feedformat',
+    GENERATOR         => 'generator',
+#   HOST              => undef,
+    HOSTUP            => 'host_up',
+    HTTP_CONTENT_TYPE => 'content_type',
+    HTTP_STATUS       => 'http_status',
+    IMAGE             => 'image',
+    LANGUAGE          => 'language',
+    LINK              => 'link',
+    MODIFIED          => 'modified',
+#   PORT              => undef,
+#   ROBOTS            => undef,
+#   SAVE              => undef,
+    TITLE             => 'title',
+    TYPE              => 'urltype',
+    URI               => 'url',
+#   URI_ID            => undef,
+);
+#>>>
+
+#
+# Headers for LWP::UserAgent
+#
+my %headers = (
+    'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
+        . 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
+    'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
+        . 'image/pjpeg, image/png, */*',
+    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
+    'Accept-Language' => 'en-US',
+);
+
+#
+# Enable Unicode mode
+#
+binmode STDOUT, ":encoding(UTF-8)";
+binmode STDERR, ":encoding(UTF-8)";
+
+#
+# Options and arguments
+#
+my %options;
+Options( \%options );
+
+#
+# Any arguments are taken to be URLs
+#
+@new_urls = @ARGV;
+
+#
+# Default help
+#
+pod2usage( -msg => "Version $VERSION\n", -exitval => 1 )
+    if ( $options{'help'} );
+
+#
+# Detailed help
+#
+pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1,
+    -noperldoc => 0 )
+    if ( $options{'manpage'} );
+
+#
+# Collect options
+#
+my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );
+
+my $cfgfile
+    = ( defined( $options{config} ) ? $options{config} : $configfile );
+my $silent     = ( defined( $options{silent} ) ? $options{silent} : 0 );
+my $loadfile   = $options{'load'};
+my $deletefile = $options{'delete'};
+
+my $scan  = ( defined( $options{scan} ) ? $options{scan} : 0 );
+my $check = ( defined( $options{check} ) ? $options{check} : 0 );
+
+my $outfile  = $options{out};
+my $report   = $options{report};
+my $json     = $options{json};
+my $opml     = $options{opml};
+my $template = $options{template};
+
+#
+# Check the configuration file
+#
+die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );
+
+#
+# Check the load file
+#
+if ($loadfile) {
+    die "File $loadfile does not exist\n"  unless -e $loadfile;
+    die "File $loadfile is not readable\n" unless -r $loadfile;
+}
+
+#
+# Check the delete file
+#
+if ($deletefile) {
+    die "File $deletefile does not exist\n"  unless -e $deletefile;
+    die "File $deletefile is not readable\n" unless -r $deletefile;
+}
+
+#
+# We accept -report, meaning report everything, or -report='title' to report
+# just the feed with the given title.
+#
+if ( defined($report) ) {
+    if ($report =~ /^$/) {
+        $search_target = undef;
+    }
+    else {
+        $search_target = $report;
+    }
+}
+
+#
+# We accept -json or -json=filename. In the former case we make a default
+# name, otherwise we use the one provided.
+#
+if ( defined($json) ) {
+    if ($json =~ /^$/) {
+        $json = "$PROG.json"
+    }
+}
+
+#
+# We accept -opml or -opml=filename. In the former case we make a default
+# name, otherwise we use the one provided.
+#
+if ( defined($opml) ) {
+    if ($opml =~ /^$/) {
+        $opml = "$PROG.opml"
+    }
+}
+
+#
+# Similarly we accept -template or -template=filename. In the former case we
+# make a default name, otherwise we use the one provided.
+#
+if ( defined($template) ) {
+    if ($template =~ /^$/) {
+        $template = "$deftemplate";
+    }
+
+    die "Error: Unable to find template $template\n" unless -r $template;
+}
+
+#
+# Load configuration data
+#
+my $conf = Config::General->new(
+    -ConfigFile      => $cfgfile,
+    -InterPolateVars => 1,
+    -ExtendedAccess  => 1
+);
+my %config = $conf->getall();
+
+#
+# Connect to the database
+#
+my $dbtype = $config{database}->{type};
+my $dbfile = $config{database}->{file};
+my $dbuser = $config{database}->{user};
+my $dbpwd  = $config{database}->{password};
+my $dbh
+    = DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
+    { AutoCommit => 1, sqlite_unicode => 1, } )
+    or die $DBI::errstr;
+
+#
+# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
+# foreign key constraints to function)
+#
+$dbh->do('PRAGMA foreign_keys = ON');
+
+#
+# Check we have something to do
+#
+my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
+my $work = (
+    scalar(@new_urls) > 0
+        || defined($loadfile)
+        || defined($deletefile)
+        || ( defined($report)
+        || defined($json)
+        || defined($opml)
+        || defined($template)
+        || $scan && $rows > 0 )
+);
+
+die "Nothing to do!\n" unless $work;
+
+#-------------------------------------------------------------------------------
+# Set up logging keeping the default log layout except for the date
+#-------------------------------------------------------------------------------
+my $LOG = Log::Handler->new();
+
+$LOG->add(
+    file => {
+        timeformat => "%Y/%m/%d %H:%M:%S",
+        filename   => $logfile,
+        minlevel   => 0,
+        maxlevel   => 7,
+    }
+);
+
+#-------------------------------------------------------------------------------
+# Open the output file (or STDOUT) - we may need the date to do it
+#-------------------------------------------------------------------------------
+my $outfh;
+if ($outfile) {
+    open( $outfh, ">:encoding(UTF-8)", $outfile )
+        or warn "Unable to open $outfile for writing: $!";
+}
+else {
+    open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
+        or warn "Unable to initialise for writing: $!";
+}
+
+
+#
+# Set up a robots.txt rules parser
+#
+$rules = WWW::RobotRules->new($robot_name);
+
+#
+# Slurp the load file into @new_urls if the file is provided
+#
+if ($loadfile) {
+    #
+    # Load the input file
+    #
+    open( my $in, '<:encoding(utf8)', $loadfile )
+        or die "$PROG : failed to open load file '$loadfile' : $!\n";
+    chomp( my @loaded = <$in> );
+    close($in)
+        or warn "$PROG : failed to close load file '$loadfile' : $!\n";
+
+    $LOG->info("Loaded URLs from $loadfile");
+
+    #
+    # Add the loaded URLs to the array
+    #
+    push( @new_urls, @loaded );
+}
+
+#
+# Now, we either have URLs from the command line, or from the load file, so we
+# process these.
+#
+#while (@new_urls) {
+# TODO: Why was this a loop?
+if (@new_urls) {
+    #
+    # Remove duplicates
+    #
+    @new_urls = uniq(@new_urls);
+
+    #
+    # Remove any commented out lines
+    #
+    @new_urls = grep {!/^\s*#/} @new_urls;
+
+    $LOG->info( "Received ", scalar(@new_urls),
+        " URLs to add to the database" );
+
+    #
+    # Load these URLs as appropriate, returning any more that we find by
+    # following HTML URLs.
+    #
+    @new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );
+
+    #
+    # Now process any URLs that came back. Since we are explicitly looking for
+    # feeds we can assume that's what we have so don't need to recurse again.
+    #
+    # TODO
+}
+
+#
+# Process the delete file if there is one
+#
+if ($deletefile) {
+    #
+    # Load the delete file
+    #
+    open( my $del, '<:encoding(utf8)', $deletefile )
+        or die "$PROG : failed to open delete file '$deletefile' : $!\n";
+    chomp( @deletions = <$del> );
+    close($del)
+        or warn "$PROG : failed to close delete file '$deletefile' : $!\n";
+
+    #
+    # Remove duplicates
+    #
+    @deletions = uniq(@deletions);
+
+    $LOG->info( "Deleting ", scalar(@deletions), " URLs from the database" );
+
+    #
+    # There are URLs to delete. Process them one by one.
+    #
+    if (@deletions) {
+        $sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
+        foreach my $rec (@deletions) {
+            $rv = $sth1->execute($rec);
+            if ( $dbh->err ) {
+                warn $dbh->errstr;
+            }
+            if ( $rv != 0 ) {
+                emit ( $silent, "Deleted $rec ($rv rows)\n" );
+            }
+            else {
+                emit ( $silent, "Failed to delete $rec\n" );
+            }
+        }
+
+    }
+
+}
+
+#-------------------------------------------------------------------------------
+# Perform a database scan if requested
+#-------------------------------------------------------------------------------
+if ($scan) {
+    $LOG->info( "Scan is not fully implemented yet" );
+    scanDB($dbh, \%keymap);
+}
+
+#-------------------------------------------------------------------------------
+# Report all or selected database contents if requested
+#-------------------------------------------------------------------------------
+if ( defined($report) ) {
+    if ( defined($search_target) ) {
+        #
+        # Reporting a specific title
+        #
+        my @matches = searchTitle( $dbh, $search_target );
+        if (@matches) {
+            #
+            # Too many matches!
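+            # ('Too many' means more than one: a feed is only reported
+            # when the search has narrowed things down to a single title.)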
+            #
+            if ( scalar(@matches) > 1 ) {
+                say "Multiple matches:\n";
+                say "- ", join( "\n- ", @matches );
+                say "\nTry again with a different title";
+            }
+            else {
+                #
+                # One match, report it
+                #
+                reportFeed( findFeed( $dbh, $matches[0] ), $outfh );
+            }
+        }
+        else {
+            say "Feed not found with target '$search_target'";
+        }
+    }
+    else {
+        #
+        # Reporting everything
+        #
+        reportDB( $dbh, $outfh );
+        $LOG->info("Report generated");
+    }
+}
+
+#
+# Do any image repairs we need, but temporarily
+#
+if ($json || $opml || $template) {
+    $feeds = collectData($dbh);
+
+    #
+    # Clean up the image references (until we have a proper fix)
+    #
+    for (my $i = 0; $i < scalar(@$feeds); $i++) {
+        my $f = $feeds->[$i];
+        if (defined($f->{urls_image}) && $f->{urls_image} =~ /^ARRAY\([^)]+\)/) {
+            $f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
+            $LOG->info( "Fixed image for $f->{urls_url}" );
+        }
+    }
+
+}
+
+#-------------------------------------------------------------------------------
+# Output all feeds to a JSON file if requested
+#-------------------------------------------------------------------------------
+if ($json) {
+    my $js = JSON->new->utf8->canonical->pretty;
+    open( my $out, '>:encoding(UTF-8)', $json )
+        or die "Unable to open output file $json: $!\n";
+
+    print $out $js->encode($feeds), "\n";
+    close($out);
+    emit ( $silent, "JSON data written to $json\n" );
+    $LOG->info( "JSON data written to $json" );
+}
+
+#-------------------------------------------------------------------------------
+# Requesting the -opml option means to dump the entire feed table as OPML
+#-------------------------------------------------------------------------------
+if ($opml) {
+    require XML::OPML;
+
+    open( my $out, '>:encoding(UTF-8)', $opml )
+        or die "Unable to open output file $opml: $!\n";
+
+    #
+    # Start building OPML. Mandatory attributes are 'type', 'text' and
+    # 'xmlUrl'. The 'title' attribute is the same as 'text'. The 'type'
+    # attribute is poorly defined; this module uses 'rss' as the type and
+    # 'RSS' as the value of the 'version' attribute. This is not linked to the
+    # type of the feed.
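+    # As a rough illustration (attribute names from the OPML spec; the
+    # module's exact output layout may differ), each feed below becomes an
+    # outline element along these lines:
+    #
+    #   <outline title="..." text="..." description="..."
+    #     xmlUrl="http://example.org/feed.rss"
+    #     htmlUrl="http://example.org/"/>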
+    #
+    my $OPML = XML::OPML->new(version => '1.1');
+
+    #
+    # Create the 'head' element
+    #
+    my $DT = normaliseDT(DateTime->now());
+    $OPML->head(
+        title        => 'Free Culture Podcasts',
+        dateCreated  => $DT,
+        dateModified => $DT,
+    );
+
+    #
+    # Walk the feeds generating OPML
+    #
+    for (my $i = 0; $i < scalar(@$feeds); $i++) {
+        my $f = $feeds->[$i];
+
+        $OPML->add_outline(
+            title       => $f->{urls_title},
+            text        => $f->{urls_title},
+            description => $f->{urls_description},
+            xmlUrl      => $f->{urls_url},
+            htmlUrl     => $f->{urls_link},
+        );
+    }
+
+    #
+    # Output OPML to the nominated file
+    #
+    print $out $OPML->as_string;
+
+    close($out);
+    emit ( $silent, "OPML data written to $opml\n" );
+    $LOG->info( "OPML data written to $opml" );
+}
+
+
+#-------------------------------------------------------------------------------
+# Fill and print the template if requested
+#-------------------------------------------------------------------------------
+if ($template) {
+    my $tt = Template->new(
+        {   ABSOLUTE     => 1,
+            ENCODING     => 'utf8',
+            INCLUDE_PATH => $basedir,
+        }
+    );
+
+    #
+    # Make the structure the template needs
+    #
+    my $vars = {
+        feeds => $feeds,
+    };
+
+#    print Dumper($vars),"\n";
+
+    my $document;
+    $tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
+        || die $tt->error(), "\n";
+    print $outfh $document;
+
+    emit ( $silent, "Data processed with $template\n" );
+    $LOG->info( "Data processed with $template" );
+}
+
+#$dbh->disconnect;
+
+exit;
+
+#=== FUNCTION ================================================================
+#         NAME: loadUrls
+#      PURPOSE: To load URLs read from the input file into the database
+#   PARAMETERS: $dbh            database handle
+#               $new_urls       arrayref containing URLs
+#               $rules          WWW::RobotRules object
+#               $keymap         hashref containing a map of key names to
+#                               database field names
+#      RETURNS: Any new URLs discovered by investigating non-feed URLs.
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub loadUrls {
+    my ( $dbh, $new_urls, $rules, $keymap ) = @_;
+
+    my ( $stream, $feed );
+    my ( %uridata, $roboturl, @found_urls );
+
+    #
+    # Work through the list of URLs
+    #
+    foreach my $rec ( @{$new_urls} ) {
+        %uridata = ();
+
+        #
+        # By default save the collected data
+        #
+        $uridata{SAVE} = 1;
+
+        #
+        # Check that we have a valid URL. We don't save them if they are
+        # invalid ($uridata{SAVE} is set to 0 in the routine).
+        #
+        my $uri = validateURI($rec,\%uridata);
+        next unless $uri;
+
+        #
+        # Check whether we already have this URI in the database
+        #
+        if (dbSearch(
+                $dbh, 'SELECT * FROM urls WHERE url = ?',
+                $uridata{URI}
+            )
+            )
+        {
+            emit( $silent, "$uri is already in the database\n" );
+            $uridata{SAVE} = 0;
+            next;
+        }
+
+        #
+        # Check the hostname resolves in DNS
+        #
+        if ( checkDNS( $uri, \%uridata ) ) {
+            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
+            emit( $silent, "DNS: $uridata{DNS}\n" );
+        }
+        else {
+            emit( $silent, "$uri has no DNS entry\n" );
+            $uridata{SAVE} = 0;
+            next;
+        }
+
+        #
+        # Check the server is available. Don't save if it's not (probably will
+        # not get the feed anyway).
+        #
+        if ( serverUp( $uri, \%uridata ) ) {
+            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
+        }
+        else {
+            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
+            $uridata{SAVE} = 0;
+            next;
+        }
+
+        #
+        # Check the site's robots.txt. If there's a block then don't save.
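+        # (A missing robots.txt -- an HTTP 404 -- is treated as "no
+        # restrictions" below, in line with the usual crawler convention;
+        # only a fetched rule set that disallows us stops the save.)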
+        #
+        if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
+            emit( $silent, "Check of robots.txt rules succeeded\n" );
+        }
+        else {
+            if ( $uridata{ROBOTS} =~ /404/ ) {
+                emit( $silent, "Search for robots.txt file failed\n" );
+            }
+            else {
+                emit( $silent, "Check of robots.txt rules failed\n" );
+                $uridata{SAVE} = 0;
+                next;
+            }
+        }
+
+        #
+        # Look for the HTTP content type. Don't save if the request failed.
+        #
+        if ( checkContentType( $uri, \%uridata, \%headers, \@found_urls, $LOG ) ) {
+            emit( $silent, "HTTP request OK\n" );
+        }
+        else {
+            emit( $silent, "HTTP request failed\n" );
+            $uridata{SAVE} = 0;
+            next;
+        }
+
+        #
+        # Parse the feed
+        #
+        if ( $uridata{TYPE} eq 'Feed' ) {
+            $LOG->info('Processing feed: ',$uridata{URI});
+            $stream = getFeed( $uridata{URI} );
+            if ($stream) {
+                $feed = parseFeed( $uridata{URI}, $stream );
+                unless ( $feed ) {
+                    $uridata{SAVE} = 0;
+                    next;
+                }
+
+                #
+                # Provide a means of examining the returned feed object. There
+                # are some weird behaviours in there (partly because of the
+                # weirdness of RSS and poor adherence to what standards there
+                # are).
+                #
+                print Dumper($feed), "\n" if ( $DEBUG > 2 );
+                storeFeed($feed,\%uridata);
+
+                #
+                # Perform a check on the copyright. Routine sets
+                # $uridata{SAVE} = 0 if the copyright is not acceptable.
+                #
+                if ($check) {
+                    next unless checkCopyright(\%uridata);
+                }
+
+            }
+            else {
+                emit($silent, "Problem collecting feed");
+                $uridata{SAVE} = 0;
+                next;
+            }
+        }
+        elsif ( $uridata{TYPE} eq 'HTML' ) {
+            #
+            # TODO Save the URL and find any feeds it contains
+            #
+            if (@found_urls) {
+                emit( $silent,
+                    "Type $uridata{TYPE} contained "
+                        . scalar(@found_urls)
+                        . " feeds; to be investigated\n" );
+            }
+            else {
+                emit( $silent,
+                    "Type $uridata{TYPE} contained no feeds; skipped\n" );
+            }
+            next;
+        }
+        else {
+            emit( $silent,
+                "Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
+            $uridata{SAVE} = 0;
+            next;
+        }
+
+    }
+    continue {
+        # { This 'continue' block is executed for each iteration or 'next' }
+
+        #
+        # Decide whether to save what we have collected
+        #
+        if ( $uridata{SAVE} ) {
+            if ( addURI( $dbh, \%uridata, $keymap ) ) {
+                emit( $silent, "$uridata{URI} added to the database\n" );
+                $LOG->info("$uridata{TYPE} ",$uridata{URI},' added to the database');
+
+                #
+                # Get the id the database allocated for the row we added
+                #
+                $uridata{URI_ID}
+                    = $dbh->last_insert_id( undef, undef, 'urls', undef );
+
+                #
+                # Add any enclosures we found
+                #
+                if ( defined( $uridata{ENCLOSURE_COUNT} )
+                    && $uridata{ENCLOSURE_COUNT} > 0 )
+                {
+                    if ( addEnclosures( $dbh, \%uridata ) ) {
+                        emit( $silent, $uridata{ENCLOSURE_COUNT},
+                            " enclosures for $uridata{URI} added to the database\n"
+                        );
+                    }
+                }
+
+            }
+            else {
+                emit( $silent,
+                    "$uridata{URI} was not added to the database\n" );
+                $LOG->info("$uridata{TYPE} ",$uridata{URI},' not added to the database');
+            }
+        }
+
+        #
+        # Dump what we have if requested
+        #
+        print Dumper( \%uridata ), "\n" if ( $DEBUG > 1 );
+
+        emit( $silent, '-' x 80, "\n" );
+    }
+
+    #
+    # Return any further URLs we found
+    #
+    return @found_urls;
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: searchTitle
+#      PURPOSE: Search the database for a feed with a given title
+#   PARAMETERS: $dbh            database handle
+#               $target         search target
+#      RETURNS: A list of titles
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub searchTitle {
+    my ($dbh, $target) = @_;
+
+    my ( $sql1, $sql2, $sth, $rv, $h );
+    my ( $count, @result );
+
+    $sql1 = q{
+        SELECT count(*) AS count
+        FROM urls
+        WHERE urltype = 'Feed'
+        AND title REGEXP ?
+    };
+
+    $sql2 = q{
+        SELECT title
+        FROM urls
+        WHERE urltype = 'Feed'
+        AND title REGEXP ?
+        ORDER BY title
+    };
+
+    #
+    # Count the number of matches
+    #
+    $sth = $dbh->prepare($sql1);
+    $rv  = $sth->execute($target);
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+        return;
+    }
+
+    #
+    # Proceed if we have 1 or more matches
+    #
+    if ( $h = $sth->fetchrow_hashref ) {
+        $count = $h->{count};
+        $sth->finish;
+
+        if ( $count >= 1 ) {
+            $sth = $dbh->prepare($sql2);
+            $rv  = $sth->execute($target);
+            if ( $dbh->err ) {
+                warn $dbh->errstr;
+                return;
+            }
+            while ( $h = $sth->fetchrow_hashref ) {
+                push(@result, $h->{title});
+            }
+        }
+
+        return @result;
+    }
+    else {
+        return;
+    }
+}
+
+
+#=== FUNCTION ================================================================
+#         NAME: scanDB
+#      PURPOSE: To scan the URLs in the database and update the stored data
+#   PARAMETERS: $dbh            database handle
+#               $keymap         hashref containing a map of key names to
+#                               database field names
+#      RETURNS: Nothing
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub scanDB {
+    my ($dbh, $keymap) = @_;
+
+    my ( $sql1, $sth1, $rv1, $h1 );
+    my ( $aref, @urls, $DT, $stream, $feed );
+    my ( %uridata, $urichanges, $enc_changes );
+
+    #
+    # Query to return all feed URLs
+    #
+    $sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};
+
+    $sth1 = $dbh->prepare($sql1);
+    $rv1  = $sth1->execute();
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+
+    #
+    # Collect everything as an arrayref pointing to a bunch of arrayrefs
+    # containing the column details requested
+    #
+    $aref = $sth1->fetchall_arrayref;
+
+    #
+    # Extract just the URL strings
+    #
+    @urls = map { $_->[1] } @{$aref};
+
+    #
+    # Now process these URLs from the database one at a time
+    #
+#    $sql1 = q{SELECT * FROM urls WHERE url = ?};
+#    $sth1 = $dbh->prepare($sql1);
+
+    my $count = 0;
+
+    foreach my $url (@urls) {
+        %uridata = ();
+
+        $count++;
+
+        scanFeed($dbh,$url,\%uridata);
+
+#        #
+#        # Record the scan for this URL
+#        #
+#        $DT = normaliseDT( DateTime->now() );
+#        $uridata{SCANNED_ON} = $DT;
+#
+#        #
+#        # Default to OK
+#        #
+#        $uridata{SCAN_OK} = 1;
+#
+#        emit( $silent, "Scanning '$url'\n" );
+#        $rv1 = $sth1->execute($url);
+#        if ( $dbh->err ) {
+#            warn $dbh->errstr;
+#        }
+#        $h1 = $sth1->fetchrow_hashref;
+#        emit( $silent, $h1->{title}, "\n" );
+#
+#        #
+#        # The URL should be valid already from the point at which it was added
+#        # to the database, but if we do this we get a canonical version (it
+#        # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
+#        # it. It also sets $uridata{URI}, which is useful.)
+#        #
+#        my $uri = validateURI( $url, \%uridata );
+#
+#        #
+#        # Check the hostname resolves in DNS
+#        #
+#        if ( checkDNS( $uri, \%uridata ) ) {
+#            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
+#            emit( $silent, "DNS: $uridata{DNS}\n" );
+#        }
+#        else {
+#            emit( $silent, "$uri has no DNS entry\n" );
+#            $uridata{SCAN_OK} = 0;
+#            next;
+#        }
+#
+#        #
+#        # Check the server is available
+#        #
+#        if ( serverUp( $uri, \%uridata ) ) {
+#            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
+#        }
+#        else {
+#            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
+#            $uridata{SCAN_OK} = 0;
+#            next;
+#        }
+#
+#        #
+#        # Look for the HTTP content type.
+#        #
+#        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
+#            emit( $silent, "HTTP request OK\n" );
+#        }
+#        else {
+#            emit( $silent, "HTTP request failed\n" );
+#            $uridata{SCAN_OK} = 0;
+#            next;
+#        }
+#
+#        #
+#        # Note: not doing the robots.txt check since it was done at load time
+#        #
+#
+#        #
+#        # We know we have a feed, so go get it
+#        #
+#        $stream = getFeed( $uridata{URI} );
+#        if ($stream) {
+#            #
+#            # Parse the feed. The routine generates its own error messages
+#            #
+#            $feed = parseFeed( $uridata{URI}, $stream );
+#            unless ( $feed ) {
+#                $uridata{SCAN_OK} = 0;
+#                next;
+#            }
+#
+#            #
+#            # Save the important feed components in the %uridata hash
+#            #
+#            print Dumper($feed), "\n" if ( $DEBUG > 2 );
+#            storeFeed( $feed, \%uridata );
+#        }
+#        else {
+#            emit( $silent, "Problem collecting feed" );
+#            $uridata{SCAN_OK} = 0;
+#            next;
+#        }
+#
+    }
+    continue {
+        # { This 'continue' block is executed for each iteration or 'next' }
+
+        #
+        # If the scan went OK then perform a comparison between the new feed
+        # data and that which is stored
+        #
+        if ( $uridata{SCAN_OK} ) {
+            $urichanges = updateURI( $dbh, \%uridata, $keymap );
+            $LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
+                $urichanges )
+                if $urichanges > 0;
+
+            # TODO Update enclosures
+        }
+        else {
+            emit( $silent, "$uridata{URI} was not scanned successfully\n" );
+            $LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
+        }
+
+        #
+        # Finished scanning this feed
+        #
+        emit( $silent, '-' x 80, "\n" );
+
+        #
+        # NOTE: Temporarily stop after the first N feeds
+        #
+        last if $count == 2;
+    }
+
+    return;
+}
+
+
+#=== FUNCTION ================================================================
+#         NAME: scanFeed
+#      PURPOSE: Performs a scan on a single feed
+#   PARAMETERS: $dbh            database handle
+#               $url            feed URL to look up
+#               $uridata        hashref to the hash of data collected from the
+#                               feed
+#      RETURNS: True (1) if all the steps worked, otherwise false (0)
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub scanFeed {
+    my ( $dbh, $url, $uridata ) = @_;
+
+    my ( $sql, $sth, $rv, $h );
+    my ( $DT, $stream, $feed );
+    my ( $urichanges, $enc_changes );
+
+    $sql = q{SELECT * FROM urls WHERE url = ?};
+    $sth = $dbh->prepare($sql);
+
+    #
+    # Record the scan for this URL
+    #
+    $DT = normaliseDT( DateTime->now() );
+    $uridata->{SCANNED_ON} = $DT;
+
+    #
+    # Default to OK
+    #
+    $uridata->{SCAN_OK} = 1;
+
+    emit( $silent, "Scanning '$url'\n" );
+    $rv = $sth->execute($url);
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+    $h = $sth->fetchrow_hashref;
+    emit( $silent, $h->{title}, "\n" );
+
+    #
+    # The URL should be valid already from the point at which it was added
+    # to the database, but if we do this we get a canonical version (it
+    # sets $uridata->{SAVE} which makes no sense here, but we'll just ignore
+    # it. It also sets $uridata->{URI}, which is useful.)
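+    # ('Canonical' is URI.pm's normalised form -- for example
+    # 'HTTP://Example.COM:80/feed' becomes 'http://example.com/feed' --
+    # so the stored and freshly-derived URLs compare reliably.)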
+    #
+    my $uri = validateURI( $url, $uridata );
+
+    #
+    # Check the hostname resolves in DNS
+    #
+    if ( checkDNS( $uri, $uridata ) ) {
+        $uridata->{DNS} = join( ", ", @{ $uridata->{DNS} } );
+        emit( $silent, "DNS: $uridata->{DNS}\n" );
+    }
+    else {
+        emit( $silent, "$uri has no DNS entry\n" );
+        $uridata->{SCAN_OK} = 0;
+        return 0;
+    }
+
+    #
+    # Check the server is available
+    #
+    if ( serverUp( $uri, $uridata ) ) {
+        emit( $silent, sprintf( "Host: %s is up\n", $uridata->{HOST} ) );
+    }
+    else {
+        emit( $silent, sprintf( "Host: %s is down\n", $uridata->{HOST} ) );
+        $uridata->{SCAN_OK} = 0;
+        return 0;
+    }
+
+    #
+    # Look for the HTTP content type. (The empty arrayref stands in for the
+    # 'children' argument; we are not collecting child feeds here.)
+    #
+    if ( checkContentType( $uri, $uridata, \%headers, [], $LOG ) ) {
+        emit( $silent, "HTTP request OK\n" );
+    }
+    else {
+        emit( $silent, "HTTP request failed\n" );
+        $uridata->{SCAN_OK} = 0;
+        return 0;
+    }
+
+    #
+    # Note: not doing the robots.txt check since it was done at load time
+    #
+
+    #
+    # We know we have a feed, so go get it
+    #
+    $stream = getFeed( $uridata->{URI} );
+    if ($stream) {
+        #
+        # Parse the feed. The routine generates its own error messages
+        #
+        $feed = parseFeed( $uridata->{URI}, $stream );
+        unless ( $feed ) {
+            $uridata->{SCAN_OK} = 0;
+            return 0;
+        }
+
+        #
+        # Save the important feed components in the %uridata hash
+        #
+        print Dumper($feed), "\n" if ( $DEBUG > 2 );
+        storeFeed( $feed, $uridata );
+    }
+    else {
+        emit( $silent, "Problem collecting feed" );
+        $uridata->{SCAN_OK} = 0;
+        return 0;
+    }
+
+    return 1;
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: reportDB
+#      PURPOSE: To generate a printed report from the database
+#   PARAMETERS: $dbh            database handle
+#               $fh             output file handle
+#      RETURNS: Nothing
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: reportFeed generates a standardised report
+#     SEE ALSO: N/A
+#===============================================================================
+sub reportDB {
+    my ($dbh, $fh) = @_;
+
+    #
+    # Query to report the contents of the 'all_episodes' view with the latest
+    # associated episode
+    #
+    my $sql = q{
+        SELECT urls.id, ae.*,
+            max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
+        FROM all_episodes ae
+        JOIN urls ON (ae.urls_id = urls.id)
+        GROUP by urls.id
+        HAVING ae.urls_urltype = 'Feed'
+        ORDER BY urls_title
+    };
+
+    my $sth1 = $dbh->prepare($sql);
+    my $rv   = $sth1->execute();
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+
+    while ( my $h1 = $sth1->fetchrow_hashref() ) {
+        reportFeed( $h1, $fh );
+    }
+
+    $sth1->finish;
+
+    return;
+}
+
+#=== FUNCTION ================================================================
+#         NAME: findFeed
+#      PURPOSE: Find and return a single feed by title
+#   PARAMETERS: $dbh            database handle
+#               $target         search target
+#      RETURNS: Hashref indexing the feed
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub findFeed {
+    my ( $dbh, $target ) = @_;
+
+    my ( $sql, $sth, $rv, $h );
+
+    #
+    # Query to report the contents of the 'all_episodes' view with the latest
+    # associated episode
+    #
+    $sql = q{
+        SELECT urls.id, ae.*,
+            max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
+        FROM all_episodes ae
+        JOIN urls ON (ae.urls_id = urls.id)
+        GROUP by urls.id
+        HAVING ae.urls_urltype = 'Feed'
+        AND ae.urls_title = ?
+    };
+
+    $sth = $dbh->prepare($sql);
+    $rv  = $sth->execute($target);
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+
+    $h = $sth->fetchrow_hashref();
+
+    return $h;
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: reportFeed
+#      PURPOSE: Reports a single feed from the database
+#   PARAMETERS: $feed           a hash from the database containing the
+#                               details of a particular feed
+#               $fh             output file handle
+#      RETURNS: Nothing
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub reportFeed {
+    my ($feed, $fh) = @_;
+
+    my $lwidth = 12;
+
+    #
+    # Hash for converting database keys to labels for the report and arrays
+    # for controlling the sequence.
+    #
+    state ( %keys, @seq1, @seq2 );
+    %keys = (
+        'id'                => 'DB Primary key',
+        'latest_ep'         => 'Latest episode date',
+        'ep_author'         => 'Author',
+        'ep_byte_length'    => 'Byte length',
+        'ep_category'       => 'Category',
+        'ep_enclosure'      => 'URL',
+        'ep_ep_id'          => 'ID',
+        'ep_id'             => 'Key',
+        'ep_issued'         => 'Issued on',
+        'ep_last_update'    => 'Last updated',
+        'ep_link'           => 'Permalink',
+        'ep_mime_type'      => 'MIME type',
+        'ep_modified'       => 'Modified on',
+        'ep_source'         => 'Source',
+        'ep_title'          => 'Title',
+        'ep_urls_id'        => 'URL key',
+        'urls_author'       => 'Author',
+        'urls_content_type' => 'Content type',
+        'urls_copyright'    => 'Copyright',
+        'urls_description'  => 'Description',
+        'urls_dns'          => 'DNS',
+        'urls_feedformat'   => 'Feed format',
+        'urls_generator'    => 'Generator',
+        'urls_host_up'      => 'Host up',
+        'urls_http_status'  => 'HTTP status',
+        'urls_id'           => 'Key',
+        'urls_image'        => 'Image',
+        'urls_language'     => 'Language',
+        'urls_last_update'  => 'Last updated',
+        'urls_link'         => 'Link',
+        'urls_modified'     => 'Modified on',
+        'urls_title'        => 'Title',
+        'urls_url'          => 'Feed URL',
+        'urls_urltype'      => 'URL type',
+        'urls_parent_id'    => 'Parent ID',
+        'urls_child_count'  => 'Child count',
+
+    );
+    @seq1 = (
+        'urls_title',
+        'urls_url',
+        'urls_feedformat',
+        'urls_link',
+        'urls_author',
+        'urls_content_type',
+        'urls_copyright',
+        'urls_description',
+        'urls_dns',
+        'urls_generator',
+        'urls_host_up',
+        'urls_http_status',
+        'urls_image',
+        'urls_language',
+        'urls_last_update',
+        'urls_modified',
+        'urls_parent_id',
+        'urls_child_count',
+    );
+    @seq2 = (
+        'ep_title',
+        'ep_enclosure',
+        'ep_category',
+        'ep_link',
+        'ep_author',
+        'ep_byte_length',
+        'ep_ep_id',
+        'ep_mime_type',
+        'ep_issued',
+        'ep_last_update',
+        'ep_modified',
+        'ep_source',
+    );
+
+    if ($feed) {
+        print $fh "Channel:\n";
+        foreach my $key (@seq1) {
+            printf $fh "  %-*s: %s\n", $lwidth, $keys{$key},
+                coalesce( $feed->{$key}, '--' );
+        }
+
+        print $fh "\nLatest episode:\n";
+        foreach my $key (@seq2) {
+            printf $fh "  %-*s: %s\n", $lwidth, $keys{$key},
+                coalesce( $feed->{$key}, '--' );
+        }
+        print $fh "\n";
+    }
+
+    return;
+}
+
+#=== FUNCTION ================================================================
+#         NAME: collectData
+#      PURPOSE: Collects data from the database for generating a report
+#   PARAMETERS: $dbh            database handle
+#      RETURNS: An array of hashrefs from the query
+#  DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
+#               table and the 'all_episodes' view, showing the date of the
+#               latest episode. The result is an array of rows, each
+#               represented as a hash, all sorted by the feed title.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub collectData {
+    my ($dbh) = @_;
+
+    #
+    # Query to report the contents of the 'urls' table with the details of the
+    # latest episode
+    #
+    my $sql = q{
+        SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
+        FROM all_episodes ae
+        JOIN urls ON (ae.urls_id = urls.id)
+        GROUP by urls.id
+        HAVING ae.urls_urltype = 'Feed'
+        ORDER BY ae.urls_title
+    };
+
+    my $sth1 = $dbh->prepare($sql);
+    my $rv   = $sth1->execute();
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+
+    #
+    # Return everything as an arrayref of hashrefs
+    #
+    my $tbl_ary_ref = $sth1->fetchall_arrayref({});
+
+    $sth1->finish;
+
+    return $tbl_ary_ref;
+}
+
+#=== FUNCTION ================================================================
+#         NAME: countRows
+#      PURPOSE: To count the rows in a table
+#   PARAMETERS: $dbh            database handle
+#               $sql            SQL expression to use
+#      RETURNS: Number of rows found (note that zero is returned as 0E0)
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub countRows {
+    my ( $dbh, $sql ) = @_;
+
+    my $sth1 = $dbh->prepare($sql);
+    my $rv   = $sth1->execute();
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+    my $h1 = $sth1->fetch;
+    $sth1->finish;
+
+    return @$h1[0];
+}
+
+#=== FUNCTION ================================================================
+#         NAME: dbSearch
+#      PURPOSE: To perform a simple search in the database
+#   PARAMETERS: $dbh            database handle
+#               $sql            SQL expression to use (expected to be
+#                               a SELECT)
+#               @args           arguments for the 'execute'
+#      RETURNS: True (1) if the row exists, otherwise false (0).
+#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
+#               SQL without incurring the overhead of repeated 'prepare'
+#               calls. Only the first row is fetched (we expect there to be
+#               only one) and the success or failure is determined by its
+#               existence.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub dbSearch {
+    my ( $dbh, $sql, @args ) = @_;
+
+    my $sth1 = $dbh->prepare_cached($sql);
+    my $rv   = $sth1->execute(@args);
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+    my $h1 = $sth1->fetchrow_hashref();
+    $sth1->finish;
+
+    return defined($h1);
+}
+
+#=== FUNCTION ================================================================
+#         NAME: execSQL
+#      PURPOSE: To perform a non-SELECT query
+#   PARAMETERS: $dbh            database handle
+#               $sql            SQL expression to use
+#               @args           arguments for the 'execute'
+#      RETURNS: True (1) if the query succeeded, otherwise false (0).
+#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
+#               SQL without incurring the overhead of repeated 'prepare'
+#               calls.
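+#               DBI's 'execute' returns the number of rows affected by a
+#               non-SELECT statement (zero comes back as the true value
+#               '0E0'), so the returned value doubles as a success flag
+#               and a row count.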
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub execSQL {
+    my ( $dbh, $sql, @args ) = @_;
+
+    my $sth1 = $dbh->prepare_cached($sql);
+    my $rv   = $sth1->execute(@args);
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+    $sth1->finish;
+
+    return $rv;
+}
+
+#=== FUNCTION ================================================================
+#         NAME: validateURI
+#      PURPOSE: Checks a URL for validity
+#   PARAMETERS: $rec            the URL handed to the script
+#               $uridata        hashref containing data for this URI
+#      RETURNS: A URI object if valid otherwise undef
+#  DESCRIPTION: The URL string is validated with the URI module. A canonical
+#               string version is stored in the hash referenced by $uridata
+#               if valid, otherwise the URL is marked as invalid.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub validateURI {
+    my ( $rec, $uridata ) = @_;
+
+    #
+    # Check that we have a valid URL. We don't save them if they are
+    # invalid.
+    #
+    my $uri = URI->new( $rec, 'http' );
+    if ( $uri->scheme ) {
+        emit( $silent, "URI $uri is valid\n" );
+        $uridata->{URI} = $uri->canonical->as_string;
+        return $uri;
+    }
+    else {
+        emit( $silent, "URI $uri is not valid\n" );
+        $uridata->{SAVE} = 0;
+        return;
+    }
+}
+
+#=== FUNCTION ================================================================
+#         NAME: checkDNS
+#      PURPOSE: Looks up a host DNS entry
+#   PARAMETERS: $uri            URI object
+#               $uridata        hashref containing data for this URI
+#      RETURNS: True (1) if the DNS query was successful, otherwise false (0)
+#  DESCRIPTION: The host name is extracted from the URI (and stored). The
+#               hostname is searched for in the DNS and if successful, an
+#               array of addresses from the 'A' records is built. This is
+#               sorted and stored in the hash referenced by $uridata.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub checkDNS {
+    my ( $uri, $uridata ) = @_;
+
+    my @adds;
+    my $hostname = $uri->host;
+    $uridata->{HOST} = $hostname;
+
+    my $res   = Net::DNS::Resolver->new;
+    my $query = $res->search($hostname);
+    if ($query) {
+        foreach my $rr ( $query->answer ) {
+            next unless $rr->type eq "A";
+            push( @adds, $rr->address );
+        }
+        @adds = sort(@adds);
+        $uridata->{DNS} = \@adds;
+        return 1;
+    }
+    else {
+        warn "Query failed: ", $res->errorstring, "\n";
+        return 0;
+    }
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: serverUp
+#      PURPOSE: Checks that a given host is responding
+#   PARAMETERS: $uri            URI object
+#               $uridata        hashref containing data for this URI
+#      RETURNS: True (1) if the host responds to a TCP connection, false (0)
+#               otherwise
+#  DESCRIPTION: Given a URL, parses out the hostname and the port (defaulting
+#               to the appropriate default for the scheme, such as 80 for
+#               http). Attempts to connect to this host and port. If the
+#               connect fails then details are written to the data structure
+#               pointed to by $uridata.
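+#               Note that this is only a TCP connection test (with a 10
+#               second timeout): it shows the port is reachable, not that
+#               the web server will succeed with the HTTP request that
+#               follows.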
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub serverUp {
+    my ( $uri, $uridata ) = @_;
+
+    my ( $hostname, $port, $socket );
+
+    $hostname = $uri->host();
+    $port     = $uri->port();
+    $uridata->{PORT} = $port;
+
+    $socket = IO::Socket::INET->new(
+        Proto    => "tcp",
+        PeerAddr => $hostname,
+        PeerPort => $port,
+        Reuse    => 1,
+        Timeout  => 10
+    );
+
+    if ($socket) {
+        $socket->close;
+        $uridata->{HOSTUP} = 1;
+        return 1;
+    }
+    else {
+        $uridata->{HOSTUP} = 0;
+        return 0;
+    }
+}
+
+#=== FUNCTION ================================================================
+#         NAME: robotRulesOK
+#      PURPOSE: To check the intended URL against the site's robots.txt rules
+#   PARAMETERS: $uri            URI object
+#               $rules          WWW::RobotRules object
+#               $uridata        hashref containing data for this URI
+#      RETURNS: True (1) if the GET of the robots.txt file succeeded and the
+#               rules allow the URI object to be fetched, false (0) otherwise.
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub robotRulesOK {
+    my ( $uri, $rules, $uridata ) = @_;
+
+    #
+    # Build the URL of robots.txt
+    #
+    my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';
+    my $robots_txt;
+
+    #
+    # Use LWP::UserAgent to get the feed and handle errors
+    #
+    my $ua = LWP::UserAgent->new;
+    $ua->timeout(10);
+    $ua->agent("$PROG/$VERSION");
+
+    my $response = $ua->get($roboturl);
+    if ( $response->is_success ) {
+        $uridata->{ROBOTS} = 'Found';
+        $robots_txt = $response->decoded_content;
+        $rules->parse( $roboturl, $robots_txt );
+        return $rules->allowed("$uri");
+    }
+    else {
+        $uridata->{ROBOTS} = $response->status_line;
+        warn "Failed to get $roboturl\n";
+        warn $response->status_line . "\n";
+        return;    # undef
+    }
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: checkContentType
+#      PURPOSE: Check the content_type of the url
+#   PARAMETERS: $uri            URI object
+#               $uridata        hashref containing data for this URI
+#               $headers        hashref containing query headers
+#               $children       arrayref to return any 'child' feeds we might
+#                               find; they are appended to the array
+#               $log            Log::Handler object
+#      RETURNS: True (1) if all was well, otherwise false (0)
+#  DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
+#               for the URI object using the LWP::UserAgent head method. Then
+#               we examine the 'content-type' header looking for the string
+#               'xml' or 'html' in it. The former denotes a feed, and the
+#               latter a normal HTML page.
+#               We use Feed::Find if the page is HTML in order to find any
+#               "child" feeds, which we really want to process if we find any.
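+#               (Feed::Find implements the usual feed auto-discovery: it
+#               inspects <link rel="alternate"> elements in the page and
+#               falls back to scanning likely-looking <a> links.)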
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub checkContentType {
+    my ( $uri, $uridata, $headers, $children, $log ) = @_;
+
+    my @feeds;
+
+    $uridata->{HTTP_STATUS} = 'Unknown';
+
+    my $browser = LWP::UserAgent->new or return 0;
+
+    my $response = $browser->head( $uri->as_string, %{$headers} )
+        or return 0;
+
+    $uridata->{HTTP_STATUS} = $response->status_line;
+
+    if ( $response->is_success ) {
+        $uridata->{HTTP_CONTENT_TYPE} = $response->header('content-type');
+
+        #
+        # Decode the content-type we received
+        #
+        if ( $uridata->{HTTP_CONTENT_TYPE} =~ m|xml|i ) {
+            $uridata->{TYPE} = 'Feed';
+        }
+        elsif ( $uridata->{HTTP_CONTENT_TYPE} =~ m|html|i ) {
+            $uridata->{TYPE} = 'HTML';
+        }
+        else {
+            $uridata->{TYPE} = 'Unknown';
+        }
+
+        #
+        # Sometimes perfectly valid feeds misleadingly return text/html so we
+        # try to spot such cases here and adjust the internal type accordingly
+        #
+        if ( $uridata->{TYPE} eq 'HTML' ) {
+            @feeds = Feed::Find->find( $uri->as_string );
+            if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
+                emit( $silent, "Feed found with wrong content-type\n" );
+                $uridata->{TYPE} = 'Feed';
+            }
+            else {
+                #
+                # This is HTML and we found 'child' feeds of some kind
+                #
+                emit( $silent, "Found ", scalar(@feeds),
+                    " feeds within this HTML page\n" );
+                print Dumper( \@feeds ), "\n" if $DEBUG > 0;
+                push(@{$children}, @feeds);
+            }
+        }
+
+        $log->info( "URL content classified as: ", $uridata->{TYPE} );
+        emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
+        return 1;
+    }
+    else {
+        $uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
+        return 0;
+    }
+}
+
+#=== FUNCTION ================================================================
+#         NAME: getFeed
+#      PURPOSE: Download the contents of a feed URL
+#   PARAMETERS: $feed_url       URL of the feed to download
+#      RETURNS: String representation of the feed contents or undef if the
+#               download failed.
+#  DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
+#               need not be). If successful the contents are decoded and
+#               returned, otherwise undef is returned.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO:
+#===============================================================================
+sub getFeed {
+    my ($feed_url) = @_;
+
+    #
+    # Use LWP::UserAgent to get the feed and handle errors
+    #
+    my $ua = LWP::UserAgent->new;
+    $ua->timeout(10);
+    $ua->agent("$PROG/$VERSION");
+    my $response = $ua->get($feed_url);
+
+    my $feed_content;
+    if ( $response->is_success ) {
+        $feed_content = $response->decoded_content;
+        return $feed_content;
+    }
+    else {
+        warn "Failed to get $feed_url\n";
+        warn $response->status_line, "\n";
+        return;    # undef
+    }
+}
+
+#=== FUNCTION ================================================================
+#         NAME: parseFeed
+#      PURPOSE: Parse a podcast feed that has already been downloaded
+#   PARAMETERS: $feed_url       URL of the feed previously downloaded
+#               $feed_content   String containing the content of the feed, for
+#                               parsing
+#      RETURNS: An XML::Feed object containing the parsed feed or undef if the
+#               parse failed
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub parseFeed {
+    my ( $feed_url, $feed_content ) = @_;
+
+    my $feed = XML::Feed->parse( \$feed_content );
+    unless ($feed) {
+        #
+        # Something went wrong. Abort this feed
+        #
+        warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
+        return;    # undef
+    }
+
+    return $feed;
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: storeFeed
+#      PURPOSE: Stores feed attributes in a hash
+#   PARAMETERS: $feed           XML::Feed object returned from parsing the
+#                               feed
+#               $uridata        hashref containing data for this URI
+#      RETURNS: Nothing
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub storeFeed {
+    my ($feed, $uridata) = @_;
+
+    ( $uridata->{TITLE}  = $feed->title )  =~ s/(^\s+|\s+$)//g;
+    ( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;
+
+    $uridata->{LINK}        = $feed->link;
+    $uridata->{DESCRIPTION} = clean_string( $feed->tagline );
+    $uridata->{AUTHOR}      = $feed->author;
+    $uridata->{MODIFIED}    = normaliseDT( $feed->modified );
+    $uridata->{IMAGE}       = flattenArray( $feed->image );
+    $uridata->{COPYRIGHT}   = $feed->copyright;
+    $uridata->{GENERATOR}   = $feed->generator;
+    $uridata->{LANGUAGE}    = $feed->language;
+    #print coalesce($feed->webMaster,'No webMaster'),"\n";
+
+    $uridata->{ENCLOSURES} = extractEnclosures($feed);
+    $uridata->{ENCLOSURE_COUNT}
+        = scalar( @{ $uridata->{ENCLOSURES} } );
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: checkCopyright
+#      PURPOSE: Ask the user to check the copyright of a feed
+#   PARAMETERS:
+#      RETURNS:
+#  DESCRIPTION:
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub checkCopyright {
+    my ($uridata) = @_;
+
+    my $decision;
+    $LOG->info('Checking copyright of feed');
+
+    #
+    # Prompt the user, failing gracefully if there's
+    # a problem. If the user types 'Y' or 'y' we accept the
+    # feed, otherwise we do not (thus a blank return = 'no').
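+    # (IO::Prompter's '-yes' option is what gives this behaviour: the
+    # prompt returns true only for a 'y'/'Y' reply, and the Try::Tiny
+    # 'catch' block applies the same default of 'no' if prompting fails.)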
+    #
+    try {
+        printf STDERR
+            "Feed '%s' has the copyright string:\n%s\n",
+            $uridata->{TITLE},
+            coalesce( $uridata->{COPYRIGHT}, '' );
+        $decision = prompt(
+            -in     => *STDIN,
+            -prompt => 'OK to add this feed?',
+            -style  => 'bold red underlined',
+            -yes
+        );
+    }
+    catch {
+        warn "Problem processing copyright decision: $_";
+        $decision = 0;
+    };
+
+    #
+    # Take action on the decision (or default)
+    #
+    $uridata->{SAVE} = $decision;
+    if ($decision) {
+        emit( $silent, "Feed added\n" );
+        $LOG->info('Copyright OK');
+        return 1;
+    }
+    else {
+        emit( $silent, "Feed not added\n" );
+        $LOG->info('Copyright not OK');
+        return 0;
+    }
+}
+
+#=== FUNCTION ================================================================
+#         NAME: parseRSS
+#      PURPOSE: Attempt to parse a feed as RSS
+#   PARAMETERS: $feed_url       URL of the feed previously downloaded
+#               $feed_content   String containing the content of the feed, for
+#                               parsing
+#      RETURNS: An XML::RSS object containing the parsed feed or undef if the
+#               parse failed
+#  DESCRIPTION: ** Incomplete **
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub parseRSS {
+    my ( $feed_url, $feed_content ) = @_;
+
+    my $rss = XML::RSS->parse(\$feed_content);
+    unless ($rss) {
+    }
+}
+
+#=== FUNCTION ================================================================
+#         NAME: addURI
+#      PURPOSE: Adds the data for a URI to the database
+#   PARAMETERS: $dbh            database handle
+#               $uridata        hashref containing data for the current URI
+#               $keymap         hashref containing a map of key names to
+#                               database field names
+#      RETURNS: True (1) if the insert succeeded, false (0) otherwise
+#  DESCRIPTION: The hash keys are defined as an array to make it easy to slice
+#               the hash and the SQL is defined internally using the size of
+#               the key array as a guide to the number of '?' placeholders.
+#               These are passed to execSQL to do the work.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub addURI {
+    my ( $dbh, $uridata, $keymap ) = @_;
+
+    my @keys = (
+        'URI',               'DNS',
+        'HOSTUP',            'HTTP_STATUS',
+        'HTTP_CONTENT_TYPE', 'TYPE',
+        'FORMAT',            'TITLE',
+        'DESCRIPTION',       'AUTHOR',
+        'MODIFIED',          'LINK',
+        'IMAGE',             'COPYRIGHT',
+        'GENERATOR',         'LANGUAGE',
+    );
+
+    my $sql
+        = 'INSERT INTO urls ('
+        . join( ",", @{$keymap}{@keys} ) . ') '
+        . 'VALUES('
+        . join( ',', ('?') x scalar(@keys) ) . ')';
+
+    print "addURI query: $sql\n" if $DEBUG > 0;
+
+    return execSQL( $dbh, $sql, @{$uridata}{@keys} );
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: extractEnclosures
+#      PURPOSE: Builds an array of hashes containing enclosure data from an
+#               XML::Feed object
+#   PARAMETERS: $feed           XML::Feed object
+#      RETURNS: A reference to the array of anonymous hashes built from the
+#               feed.
+#  DESCRIPTION: The XML::Feed object is expected to contain an array of
+#               entries. These are converted to hashes, references to which
+#               are stored in an array. The two DateTime components are
+#               converted to ISO8601 dates. If there is an enclosure then its
+#               elements are saved. Note that there could be multiple
+#               enclosures, but XML::Feed does not cater for them unless
+#               explicitly requested. We do not deal with such a case here.
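+#               Each hash carries the entry metadata (title, link, author,
+#               dates and so on) plus, when an enclosure is present, its
+#               'url', 'type' and 'length' -- the keys that addEnclosures
+#               expects to find.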
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub extractEnclosures {
+    my ($feed) = @_;
+
+    my @encs;
+
+    foreach my $entry ( $feed->entries ) {
+        my %ent;
+
+        $ent{title}    = $entry->title;
+        $ent{base}     = $entry->base;
+        $ent{link}     = $entry->link;
+        $ent{category} = join( ", ", $entry->category );
+        $ent{author}   = $entry->author;
+        $ent{id}       = $entry->id;
+        $ent{issued}   = normaliseDT( $entry->issued );
+        $ent{modified} = normaliseDT( $entry->modified );
+
+        #
+        # Work around a bug in XML::Feed where the source method is only
+        # available for Atom feeds. TODO report this.
+        #
+        if ( $entry->isa('XML::Feed::Entry::Format::Atom') ) {
+            $ent{source} = $entry->source;
+        }
+
+        my ($enclosure) = $entry->enclosure;
+        if ( defined($enclosure) ) {
+            $ent{url}    = $enclosure->url;
+            $ent{type}   = $enclosure->type;
+            $ent{length} = $enclosure->length;
+        }
+
+        push( @encs, \%ent );
+    }
+
+    return \@encs;
+
+}
+
+#=== FUNCTION ================================================================
+#         NAME: addEnclosures
+#      PURPOSE: Adds episodes extracted from a feed into the database
+#   PARAMETERS: $dbh            database handle
+#               $uridata        hashref containing data for the current URI
+#                               including an arrayref of hashrefs of episode
+#                               data
+#      RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
+#  DESCRIPTION: The SQL is defined internally and the hash keys are defined as
+#               an array to make it easy to slice the hash. The enclosures (or
+#               more correctly, feed items) are present in the hash as an
+#               array of anonymous hashes. These are processed one at a time
+#               and inserted into the database. A count of the number of
+#               successful inserts is kept. This is compared with the number
+#               of enclosures to determine the boolean value to return.
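+#               The parent feed's database id ($uridata->{URI_ID}, set from
+#               last_insert_id after the feed row is added) is prepended to
+#               each episode's bind values to populate the 'urls_id'
+#               foreign key.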
+# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub addEnclosures { + my ( $dbh, $uridata ) = @_; + + my $sql = q{INSERT INTO episodes + (urls_id, link, enclosure, title, author, category, source, ep_id, + issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)}; + + my @keys = ( + 'link', 'url', 'title', 'author', 'category', 'source', + 'id', 'issued', 'modified', 'length', 'type' + ); + + my $successes = 0; + + foreach my $enc ( @{ $uridata->{ENCLOSURES} } ) { + if ( execSQL( $dbh, $sql, $uridata->{URI_ID}, @{$enc}{@keys} ) ) { + $successes++; + } + else { + emit( $silent, "Failed to add episode $enc->{url}\n" ); + } + } + + return ( $successes == $uridata->{ENCLOSURE_COUNT} ); +} + +#=== FUNCTION ================================================================ +# NAME: updateURI +# PURPOSE: Compare the data in a hash with that in the database +# PARAMETERS: $dbh database handle +# $uridata hashref containing data for the current URI +# including an arrayref of hashrefs of episode +# data +# $keymap hashref containing a map of key names to +# database field names +# RETURNS: The number of changes made +# DESCRIPTION: +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub updateURI { + my ( $dbh, $uridata, $keymap ) = @_; + + my ( $sql1, $sth1, $rv1, $h1 ); + my ( %fieldvals, %where ); + my ( $diffs, $updates ) = ( 0, 0 ); + my @keys = ( + 'URI', 'DNS', + 'HOSTUP', 'HTTP_STATUS', + 'HTTP_CONTENT_TYPE', 'TYPE', + 'FORMAT', 'TITLE', + 'DESCRIPTION', 'AUTHOR', + 'MODIFIED', 'LINK', + 'IMAGE', 'COPYRIGHT', + 'GENERATOR', 'LANGUAGE', + ); + + # + # Get the row from the urls table + # + $sql1 = q{SELECT * FROM urls WHERE url = ?}; + $sth1 = $dbh->prepare($sql1); + $rv1 = $sth1->execute( $uridata->{URI} ); + if ( $dbh->err ) { + warn $dbh->errstr; + } + $h1 = $sth1->fetchrow_hashref; + + for my $key (@keys) { + unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) { + $diffs++; + $fieldvals{$key} = $uridata->{$key}; + + # + # Temporary report + # + print "Difference: ($key)\n"; + print " Feed: ", coalesce($uridata->{$key},''), "\n"; + print " Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n"; + } + } + + if ($diffs > 0) { + # + # Prepare to use SQL::Abstract + # + my $sql = SQL::Abstract->new(); + + # + # Build the update statement + # + $where{id} = $h1->{id}; + my ( $stmt, @bindvals ) + = $sql->update( 'urls', \%fieldvals, \%where ); + + # Temporary + print "Statement: $stmt\n"; + print "Bind values: ", join(",",@bindvals),"\n"; + + # + # Perform the updates + # + $sth1 = $dbh->prepare($stmt); + $sth1->execute(@bindvals); + if ( $dbh->err ) { + warn "Processing $h1->{url}\n", $dbh->errstr; + } + else { + emit($silent, "Updated $h1->{url}\n"); + $updates++; + } + + } + + return $updates; + +} + +#=== FUNCTION ================================================================ +# NAME: updateEnclosures +# PURPOSE: Update the enclosures stored with a feed URL +# PARAMETERS: $dbh database handle +# $uridata hashref containing data for the current URI +# including an arrayref of hashrefs of episode +# data +# RETURNS: The number of changes made +# DESCRIPTION: +# THROWS: No exceptions +# COMMENTS: None +# SEE ALSO: N/A +#=============================================================================== +sub updateEnclosures { + my ( $dbh, $uridata ) = @_; + + my ( $sql1, $sth1, $rv1, 
+#=== FUNCTION ================================================================
+# NAME: updateEnclosures
+# PURPOSE: Update the enclosures stored with a feed URL
+# PARAMETERS: $dbh database handle
+# $uridata hashref containing data for the current URI
+# including an arrayref of hashrefs of episode
+# data
+# RETURNS: The number of changes made
+# DESCRIPTION: Currently a stub: the episodes belonging to the feed are
+# retrieved, but the comparison and update logic has not been
+# implemented yet.
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO: N/A
+#===============================================================================
+sub updateEnclosures {
+    my ( $dbh, $uridata ) = @_;
+
+    my ( $sql1, $sth1, $rv1, $h1 );
+    my ( %fieldvals, %where );
+    my ( $diffs, $updates ) = ( 0, 0 );
+
+    #
+    # Get the enclosures associated with this feed; the bind value is the
+    # feed's URL, so the sub-query matches on the url column
+    #
+    $sql1 = q{
+        SELECT * FROM episodes
+        WHERE urls_id = (SELECT id FROM urls WHERE url = ?)
+    };
+    $sth1 = $dbh->prepare($sql1);
+    $rv1  = $sth1->execute( $uridata->{URI} );
+    if ( $dbh->err ) {
+        warn $dbh->errstr;
+    }
+    $h1 = $sth1->fetchrow_hashref;
+
+    #
+    # TODO: compare the stored episodes with $uridata->{ENCLOSURES} and
+    # apply any differences
+    #
+    return $updates;
+}
+
+#=== FUNCTION ================================================================
+# NAME: equal
+# PURPOSE: Compare two strings even if undefined
+# PARAMETERS: $s1 The first string
+# $s2 The second string
+# RETURNS: True if both strings are undefined, false if only one of them
+# is defined, otherwise the result of comparing them.
+# DESCRIPTION: Works on the principle that two undefined strings are equal,
+# a defined and an undefined string are not, and otherwise they
+# are equal if they are equal!
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO:
+#===============================================================================
+sub equal {
+    my ( $s1, $s2 ) = @_;
+
+    return 1 if ( !defined($s1) && !defined($s2) );
+    return 0 if ( !defined($s1) || !defined($s2) );
+
+    return ( $s1 eq $s2 );
+}
+
+#=== FUNCTION ================================================================
+# NAME: clean_string
+# PURPOSE: Clean a string of non-printables, newlines, multiple spaces
+# PARAMETERS: $str The string to process
+# RETURNS: The processed string
+# DESCRIPTION: Removes leading and trailing spaces. Converts CR/LF sequences
+# to spaces. Removes all other non-printable characters.
+# Squashes multiple spaces down to one.
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO:
+#===============================================================================
+sub clean_string {
+    my ($str) = @_;
+
+    $str =~ s/(^\s+|\s+$)//g;     # trim leading and trailing whitespace
+    $str =~ tr/\x0A\x0D/ /;       # convert CR/LF to spaces
+    $str =~ s/[^[:print:]]//g;    # remove remaining non-printables
+                                  # (tr/// does not understand POSIX
+                                  # classes, so a substitution is used)
+    $str =~ tr/ / /s;             # squash runs of spaces
+
+    return $str;
+}
+
+#=== FUNCTION ================================================================
+# NAME: normaliseDT
+# PURPOSE: Normalise an ISO8601 date for comparison, etc.
+# PARAMETERS: $dt a DateTime object or an ISO8601 date/time
+# string (or undef)
+# RETURNS: The date and time formatted as an SQLite-style string
+# ('YYYY-MM-DD HH:MM:SS'), or undef if $dt was undef
+# DESCRIPTION: The argument is stringified if necessary, parsed as ISO8601
+# and reformatted in the format used by SQLite, so that dates
+# from different sources can be compared.
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO: N/A
+#===============================================================================
+sub normaliseDT {
+    my ($dt) = @_;
+
+    my $p = DateTime::Format::SQLite->new();
+
+    return (
+        defined($dt)
+        ? $p->format_datetime(
+            DateTime::Format::ISO8601->parse_datetime($dt)
+        )
+        : undef
+    );
+}
+
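+# Illustration (not executed): normaliseDT('2020-01-31T12:00:00Z') would
+# return '2020-01-31 12:00:00', the format SQLite stores, so values from
+# feeds and from the database compare cleanly.
+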
+#=== FUNCTION ================================================================
+# NAME: flattenArray
+# PURPOSE: Turns an arrayref into a simple list in a string
+# PARAMETERS: $item - the item that may be an arrayref
+# RETURNS: The plain item if it's not an arrayref, otherwise the
+# flattened version
+# DESCRIPTION: If the item is an array reference its elements are joined
+# into a single comma-separated string, otherwise it is
+# returned unchanged.
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO: N/A
+#===============================================================================
+sub flattenArray {
+    my ($item) = @_;
+
+    my $result;
+    if ( ref($item) eq 'ARRAY' ) {
+        $result = join( ", ", @{$item} );
+    }
+    else {
+        $result = $item;
+    }
+
+    return $result;
+}
+
+#=== FUNCTION ================================================================
+# NAME: coalesce
+# PURPOSE: To find the defined argument and return it
+# PARAMETERS: Arbitrary number of arguments
+# RETURNS: The first defined argument
+# DESCRIPTION:
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO: N/A
+#===============================================================================
+sub coalesce {
+    foreach (@_) {
+        return $_ if defined($_);
+    }
+    return;    # undef
+}
+
+#=== FUNCTION ================================================================
+# NAME: emit
+# PURPOSE: Print text on STDERR unless silent mode has been selected
+# PARAMETERS: - Boolean indicating whether to be silent or not
+# - list of arguments to 'print'
+# RETURNS: Nothing
+# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
+# a message to STDERR depending on a boolean. We need this to be
+# able to make the script silent when the -silent option is
+# selected
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO: N/A
+#===============================================================================
+sub emit {
+    unless (shift) {
+        print STDERR @_;
+    }
+}
+
+#=== FUNCTION ================================================================
+# NAME: Options
+# PURPOSE: Processes command-line options
+# PARAMETERS: $optref Hash reference to hold the options
+# RETURNS: Undef
+# DESCRIPTION:
+# THROWS: No exceptions
+# COMMENTS: None
+# SEE ALSO: N/A
+#===============================================================================
+sub Options {
+    my ($optref) = @_;
+
+    my @options = (
+        "help", "manpage", "debug=i", "silent!",
+        "load=s", "delete=s", "scan!", "report:s",
+        "check!", "json:s", "opml:s", "config=s",
+        "out=s", "template:s",
+    );
+
+    if ( !GetOptions( $optref, @options ) ) {
+        pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );
+    }
+
+    return;
+}
+
+__END__
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Application Documentation
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+#{{{
+
+
+=head1 NAME
+
+feedWatcher - watch a collection of podcast feeds
+
+=head1 VERSION
+
+This documentation refers to I<feedWatcher> version 0.0.15
+
+
+=head1 USAGE
+
+    feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
+    [-report[=title]] [-[no]check] [-out=FILE] [-json[=FILE]] [-opml[=FILE]]
+    [-template[=FILE]] [-[no]silent] [-config=FILE] [-debug=N] [URL ...]
+
+=head1 ARGUMENTS
+
+Arguments are optional and may consist of an arbitrarily long list of URLs to
+be processed and possibly added to the database by the script.
+
+These URLs are prepended to any which may be provided through the
+B<-load=FILE> option described below. The script makes sure the URL collection
+contains no duplicates.
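+
+For example (using a hypothetical feed URL and load file):
+
+    ./feedWatcher -load=new_feeds.txt http://example.org/podcast/feed.rss
+
+Here the URL given as an argument is placed ahead of any URLs read from
+F<new_feeds.txt>, and any duplicates across the two sources are removed
+before processing.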
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<-help>
+
+Prints a brief help message describing the usage of the program, and then exits.
+
+The full documentation can be displayed with the command:
+
+    perldoc -oman feedWatcher
+
+Alternatively, use the option B<-manpage>.
+
+=item B<-manpage>
+
+Prints the entire documentation for the script. Note that this mode uses a
+simpler type of formatter than I<perldoc>.
+
+=item B<-load=FILE>
+
+Defines a file from which new URLs are to be added to the database. These URLs
+are checked in various ways before being added. If arguments are provided when
+the script is invoked, the URLs from this file are appended to the argument
+list.
+
+=item B<-delete=FILE>
+
+Defines a file from which a list of URLs is to be read which are to be
+deleted from the database.
+
+Note that it is possible (though inadvisable) to both add and delete a URL in
+the same run of the script. The URL will first be added (from the load file or
+argument list) then deleted. This is a pointless exercise which wastes
+bandwidth, so don't do it!
+
+=item B<-[no]scan>
+
+This option (B<-scan>) causes the URLs stored in the database to be scanned
+and updated. The negated form (B<-noscan>), which is also the default
+behaviour of the script, omits the scan.
+
+NOTE: This function is not implemented yet.
+
+=item B<-out=FILE>
+
+This option defines an output file to receive any output. If the option is
+omitted the data is written to STDOUT, allowing it to be redirected if
+required.
+
+=item B<-[no]check>
+
+This option (B<-check>) causes each feed which is being added to be checked
+with the user running the script, to confirm that it is OK to add it. The
+script reports the I<title> field and requests a I<yes> or I<no> response.
+
+=item B<-report[=title]>
+
+This option causes a report of the contents of the database to be generated;
+the optional value provides a title for the report. If the option is omitted
+no report is produced.
+
+NOTE: The report is currently very simple.
+
+=item B<-json[=FILE]>
+
+This option, which may be omitted, defines the location where the feed and
+episode details are to be written. If omitted no JSON data is written.
+
+If the option is given as B<-json=FILE> the data is written to the nominated
+file.
+
+If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
+used.
+
+=item B<-opml[=FILE]>
+
+This option, which may be omitted, defines the location where the feed details
+are to be written. If omitted no OPML data is written.
+
+If the option is given as B<-opml=FILE> the data is written to the nominated
+file.
+
+If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
+used.
+
+=item B<-template[=FILE]>
+
+This option defines the template used to generate a form of the feed data. The
+template is written using the B