Free_Culture_Podcasts/feedWatcher

2625 lines
78 KiB
Plaintext
Raw Normal View History

2022-11-19 21:27:51 +00:00
#!/usr/bin/env perl
#===============================================================================
#
# FILE: feedWatcher
#
# USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
# [-report[=title]] [-[no]check] [-out=FILE] [-json[=FILE]]
# [-opml[=FILE]] [-template[=FILE]] [-[no]silent] [-debug=N]
# [URL ...]
#
# DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
# podcasts. Ken's script was inspired by Dann Washko's site at
# http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
# August 2012 where it was used to generate handouts.
#
# The script has not been developed since 2014, but is now in
# use in 2020 helping to prepare podcast information for
# a FOSDEM visit under the heading of "Free Culture Podcasts".
# See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
# (created with Pandoc) in the same directory as this script for
# details of what has been done to develop the original design.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.15
# CREATED: 2013-12-25 12:40:33
# REVISION: 2022-11-18 22:27:35
#
#-------------------------------------------------------------------------------
# Released under the terms of the GNU Affero General Public License (AGPLv3)
# http://www.gnu.org/licenses/agpl-3.0.html
#-------------------------------------------------------------------------------
#
#===============================================================================
use 5.030;
use strict;
use warnings;
use utf8;
use feature qw{ postderef say signatures state };
no warnings qw{ experimental::postderef experimental::signatures } ;
use Getopt::Long;
use Pod::Usage;
use Config::General;
use List::MoreUtils qw{uniq};
use Log::Handler;
use Try::Tiny;
use URI;
use Net::DNS;
use IO::Socket;
use LWP::UserAgent;
use WWW::RobotRules;
use XML::RSS::Parser;
use XML::Feed;
use Feed::Find;
use Template;
use Template::Filters;
Template::Filters->use_html_entities; # Use HTML::Entities in the template
use HTML::Entities;
use IO::Prompter;
use JSON;
use DBI;
use DateTime::Format::SQLite;
use DateTime::Format::ISO8601;
use SQL::Abstract;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.15';
#
# Script name
#
( my $PROG = $0 ) =~ s|.*/||mx;
#
# Declarations
#
my ( @new_urls, @deletions );
my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );
my ( $search_target );
my ( $sth1, $h1, $rv );
my $feeds;
#
# File and directory paths
#
my $basedir = "$ENV{HOME}/HPR/feed_watcher";
my $tempdir = "$basedir/tmp";
my $configfile = "$basedir/$PROG.cfg";
my $logfile = "$basedir/${PROG}.log";
my $deftemplate = "$basedir/${PROG}.tpl";
#
# Hash key map to database field names (chevron comments are to stop Perltidy
# messing with the layout)
#
#<<<
my %keymap = (
AUTHOR => 'author',
COPYRIGHT => 'copyright',
DESCRIPTION => 'description',
DNS => 'dns',
# ENCLOSURES => undef,
# ENCLOSURE_COUNT => undef,
FORMAT => 'feedformat',
GENERATOR => 'generator',
# HOST => undef,
HOSTUP => 'host_up',
HTTP_CONTENT_TYPE => 'content_type',
HTTP_STATUS => 'http_status',
IMAGE => 'image',
LANGUAGE => 'language',
LINK => 'link',
MODIFIED => 'modified',
# PORT => undef,
# ROBOTS => undef,
# SAVE => undef,
TITLE => 'title',
TYPE => 'urltype',
URI => 'url',
# URI_ID => undef,
);
#>>>
#
# Headers for LWP::UserAgent
#
my %headers = (
'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
. 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
. 'image/pjpeg, image/png, */*',
'Accept-Charset' => 'iso-8859-1,*,utf-8',
'Accept-Language' => 'en-US',
);
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#
# Options and arguments
#
my %options;
Options( \%options );
#
# Any arguments are taken to be URLs
#
@new_urls = @ARGV;
#
# Default help
#
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 )
if ( $options{'help'} );
#
# Detailed help
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1, -noperldoc => 0 )
if ( $options{'manpage'} );
#
# Collect options
#
my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );
my $cfgfile
= ( defined( $options{config} ) ? $options{config} : $configfile );
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
my $loadfile = $options{'load'};
my $deletefile = $options{'delete'};
my $scan = ( defined( $options{scan} ) ? $options{scan} : 0 );
my $check = ( defined( $options{check} ) ? $options{check} : 0 );
my $outfile = $options{out};
my $report = $options{report};
my $json = $options{json};
my $opml = $options{opml};
my $template = $options{template};
#
# Check the configuration file
#
die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );
#
# Check the load file
#
if ($loadfile) {
die "File $loadfile does not exist\n" unless -e $loadfile;
die "File $loadfile is not readable\n" unless -r $loadfile;
}
#
# Check the delete file
#
if ($deletefile) {
die "File $deletefile does not exist\n" unless -e $deletefile;
die "File $deletefile is not readable\n" unless -r $deletefile;
}
#
# We accept -report, meaning report everything or -report='title' to report
# just the feed with the given title.
#
if ( defined($report) ) {
if ($report =~ /^$/) {
$search_target = undef;
}
else {
$search_target = $report;
}
}
#
# We accept -json or -json=filename. In the former case we make a default
# name, otherwise we use the one provided.
#
if ( defined($json) ) {
if ($json =~ /^$/) {
$json = "$PROG.json"
}
}
#
# We accept -opml or -opml=filename. In the former case we make a default
# name, otherwise we use the one provided.
#
if ( defined($opml) ) {
if ($opml =~ /^$/) {
$opml = "$PROG.opml"
}
}
#
# Similarly we accept -template or -template=filename. In the former case we
# make a default name, otherwise we use the one provided.
#
if ( defined($template) ) {
if ($template =~ /^$/) {
$template = "$deftemplate";
}
die "Error: Unable to find template $template\n" unless -r $template;
}
#
# Load configuration data
#
my $conf = new Config::General(
-ConfigFile => $cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1
);
my %config = $conf->getall();
#
# Connect to the database
#
my $dbtype = $config{database}->{type};
my $dbfile = $config{database}->{file};
my $dbuser = $config{database}->{user};
my $dbpwd = $config{database}->{password};
my $dbh
= DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
{ AutoCommit => 1, sqlite_unicode => 1, } )
or die $DBI::errstr;
#
# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
# foreign key constraints to function)
#
$dbh->do('PRAGMA foreign_keys = ON');
#
# Check we have something to do
#
my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
my $work = (
scalar(@new_urls) > 0
|| defined($loadfile)
|| defined($deletefile)
|| ( defined($report)
|| defined($json)
|| defined($opml)
|| defined($template)
|| $scan && $rows > 0 )
);
die "Nothing to do!\n" unless $work;
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $LOG = Log::Handler->new();
$LOG->add(
file => {
timeformat => "%Y/%m/%d %H:%M:%S",
filename => $logfile,
minlevel => 0,
maxlevel => 7,
}
);
#-------------------------------------------------------------------------------
# Open the output file (or STDOUT) - we may need the date to do it
#-------------------------------------------------------------------------------
my $outfh;
if ($outfile) {
open( $outfh, ">:encoding(UTF-8)", $outfile )
or warn "Unable to open $outfile for writing: $!";
}
else {
open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
or warn "Unable to initialise for writing: $!";
}
#
# Set up a robot.txt rules parser
#
$rules = WWW::RobotRules->new($robot_name);
#
# Slurp the load file into @new_urls if the file is provided
#
if ($loadfile) {
#
# Load the input file
#
open( my $in, '<:encoding(utf8)', $loadfile )
or die "$PROG : failed to open load file '$loadfile' : $!\n";
chomp( my @loaded = <$in> );
close($in)
or warn "$PROG : failed to close load file '$loadfile' : $!\n";
$LOG->info("Loaded URLs from $loadfile");
#
# Add the loaded URLs to the array
#
push( @new_urls, @loaded );
}
#
# Now, we either have URLs from the command line, or from the load file, so we
# process these.
#
#while (@new_urls) {
# TODO: Why was this a loop?
if (@new_urls) {
#
# Remove duplicates
#
@new_urls = uniq(@new_urls);
#
# Remove any commented out lines
#
@new_urls = grep {!/^\s*#/} @new_urls;
$LOG->info( "Received ", scalar(@new_urls),
" URLs to add to the database" );
#
# Load these URLs as appropriate, returning any more that we find by
# following HTML urls.
#
@new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );
#
# Now process any URLs that came back. Since we are explicitly looking for
# feeds we can assume that's what we have so don't need to recurse again.
#
# TODO
}
#
# Process the delete file if there is one
#
if ($deletefile) {
#
# Load the delete file
#
open( my $del, '<:encoding(utf8)', $deletefile )
or die "$PROG : failed to open load file '$deletefile' : $!\n";
chomp( @deletions = <$del> );
close($del)
or warn "$PROG : failed to close load file '$deletefile' : $!\n";
#
# Remove duplicates
#
@deletions = uniq(@deletions);
$LOG->info( "Deleting ", scalar(@deletions), " URLs from the database" );
#
# There are URLs to delete. Process them on by one.
#
if (@deletions) {
$sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
foreach my $rec (@deletions) {
$rv = $sth1->execute($rec);
if ( $dbh->err ) {
warn $dbh->errstr;
}
if ( $rv != 0 ) {
emit ( $silent, "Deleted $rec ($rv rows)\n" );
}
else {
emit ( $silent, "Failed to delete $rec\n" );
}
}
}
}
#-------------------------------------------------------------------------------
# Perform a database scan if requested
#-------------------------------------------------------------------------------
if ($scan) {
$LOG->info( "Scan is not fully implemented yet" );
2022-11-20 22:49:57 +00:00
# Testing. Processes the first two feeds
2022-11-19 21:27:51 +00:00
scanDB($dbh, \%keymap);
}
#-------------------------------------------------------------------------------
# Report all or selected database contents if requested
#-------------------------------------------------------------------------------
if ( defined($report) ) {
if ( defined($search_target) ) {
#
# Reporting a specific title
#
my @matches = searchTitle( $dbh, $search_target );
if (@matches) {
#
# Too many matches!
#
if ( $#matches > 1 ) {
say "Multiple matches:\n";
say "- ", join( "\n- ", @matches );
say "\nTry again with a different title";
}
else {
#
# One match, report it
#
reportFeed( findFeed( $dbh, $matches[0] ), $outfh );
}
}
else {
say "Feed not found with target '$search_target'";
}
}
else {
#
# Reporting everything
#
reportDB( $dbh, $outfh );
$LOG->info("Report generated");
}
}
#
# Do any image repairs we need, but temporarily
#
if ($json || $opml || $template) {
$feeds = collectData($dbh);
#
# Clean up the image references (until we have a proper fix)
#
for (my $i = 0; $i < scalar(@$feeds); $i++) {
my $f = $feeds->[$i];
if (defined($f->{urls_image}) && $f->{urls_image} =~ /^ARRAY\([^)]+\)/) {
$f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
$LOG->info( "Fixed image for $f->{urls_url}" );
}
}
}
#-------------------------------------------------------------------------------
# Output all feeds to a JSON file if requested
#-------------------------------------------------------------------------------
if ($json) {
my $js = JSON->new->utf8->canonical->pretty;
open( my $out, '>:encoding(UTF-8)', $json )
or die "Unable to open output file $json $!\n";
print $out $js->encode($feeds), "\n";
close($out);
emit ( $silent, "JSON data written to $json\n" );
$LOG->info( "JSON data written to $json" );
}
#-------------------------------------------------------------------------------
# Requesting the -opml option means to dump the entire feed table as OPML
#-------------------------------------------------------------------------------
if ($opml) {
require XML::OPML;
open( my $out, '>:encoding(UTF-8)', $opml )
or die "Unable to open output file $opml $!\n";
#
# Start building OPML. Mandatory attributes are 'type', 'text' and
# 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
# attribute is poorly defined; this module uses 'rss' as the type and
# 'RSS' as the value of the 'version' attribute. This is not linked to the
# type of the feed.
#
my $OPML = XML::OPML->new(version => '1.1');
#
# Create the 'head' element
#
my $DT = normaliseDT(DateTime->now());
$OPML->head(
title => 'Free Culture Podcasts',
dateCreated => $DT,
dateModified => $DT,
);
#
# Walk the feeds generating OPML
#
for (my $i = 0; $i < scalar(@$feeds); $i++) {
my $f = $feeds->[$i];
$OPML->add_outline(
title => $f->{urls_title},
text => $f->{urls_title},
description => $f->{urls_description},
xmlUrl => $f->{urls_url},
htmlUrl => $f->{urls_link},
);
}
#
# Output OPML to the nominated file
#
print $out $OPML->as_string;
close($out);
emit ( $silent, "OPML data written to $opml\n" );
$LOG->info( "OPML data written to $opml" );
}
#-------------------------------------------------------------------------------
# Fill and print the template if requested
#-------------------------------------------------------------------------------
if ($template) {
my $tt = Template->new(
{ ABSOLUTE => 1,
ENCODING => 'utf8',
INCLUDE_PATH => $basedir,
}
);
#
# Make the structure the template needs
#
my $vars = {
feeds => $feeds,
};
# print Dumper($vars),"\n";
my $document;
$tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
|| die $tt->error(), "\n";
print $outfh $document;
emit ( $silent, "Data processed with $template\n" );
$LOG->info( "Data processed with $template" );
}
#$dbh->disconnect;
exit;
#=== FUNCTION ================================================================
# NAME: loadUrls
# PURPOSE: To load URLs read from the input file into the database
# PARAMETERS: $dbh database handle
# $new_urls arrayref containing URLs
# $rules WWW::RobotRules object
# $keymap hashref containing a map of key names to
# database field names
# RETURNS: Any new URLs discovered by investigating non-feed URLs.
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub loadUrls {
my ( $dbh, $new_urls, $rules, $keymap ) = @_;
my ( $stream, $feed );
my ( %uridata, $roboturl, @found_urls );
#
# Work through the list of URLs
#
foreach my $rec (@new_urls) {
%uridata = ();
#
# By default save the collected data
#
$uridata{SAVE} = 1;
#
# Check that we have a valid URL. We don't save them if they are
# invalid ($uridata{SAVE} is set to 0 in the routine).
#
my $uri = validateURI($rec,\%uridata);
next unless $uri;
#
# Check whether we already have this URI in the database
#
if (dbSearch(
$dbh, 'SELECT * FROM urls WHERE url = ?',
$uridata{URI}
)
)
{
emit( $silent, "$uri is already in the database\n" );
$uridata{SAVE} = 0;
next;
}
#
# Check the hostname resolves in DNS
#
if ( checkDNS( $uri, \%uridata ) ) {
$uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
emit( $silent, "DNS: $uridata{DNS}\n" );
}
else {
emit( $silent, "$uri has no DNS entry\n" );
$uridata{SAVE} = 0;
next;
}
#
# Check the server is available. Don't save if it's not (probably will
# not get the feed anyway).
#
if ( serverUp( $uri, \%uridata ) ) {
emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
}
else {
emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
$uridata{SAVE} = 0;
next;
}
#
# Check the site's robots.txt. If there's a block then don't save.
#
if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
emit( $silent, "Check of robots.txt rules succeeded\n" );
}
else {
if ( $uridata{ROBOTS} =~ /404/ ) {
emit( $silent, "Search for robots.txt file failed\n" );
}
else {
emit( $silent, "Check of robots.txt rules failed\n" );
$uridata{SAVE} = 0;
next;
}
}
#
# Look for the HTTP content type. Don't save if the request failed.
#
if ( checkContentType( $uri, \%uridata, \%headers, \@found_urls, $LOG ) ) {
emit( $silent, "HTTP request OK\n" );
}
else {
emit( $silent, "HTTP request failed\n" );
$uridata{SAVE} = 0;
next;
}
#
# Parse the feed
#
if ( $uridata{TYPE} eq 'Feed' ) {
$LOG->info('Processing feed: ',$uridata{URI});
$stream = getFeed( $uridata{URI} );
if ($stream) {
$feed = parseFeed( $uridata{URI}, $stream );
unless ( $feed ) {
$uridata{SAVE} = 0;
next;
}
#
# Provide a means of examining the returned feed object. There
# are some weird behaviours in there (partly because of the
# weirdness of RSS and poor adherence to what standards there
# are).
#
print Dumper($feed), "\n" if ( $DEBUG > 2 );
storeFeed($feed,\%uridata);
#
# Perform a check on the copyright. Routine sets
# $uridata{SAVE} = 0 if the copyright is not acceptable.
#
if ($check) {
next unless checkCopyright(\%uridata);
}
}
else {
emit($silent, "Problem collecting feed");
$uridata{SAVE} = 0;
next;
}
}
elsif ( $uridata{TYPE} eq 'HTML' ) {
#
# TODO Save the URL and find any feeds it contains
#
if (@found_urls) {
emit( $silent,
"Type $uridata{TYPE} contained "
. scalar(@found_urls)
. " feeds; to be investigated\n" );
}
else {
emit( $silent,
"Type $uridata{TYPE} contained no feeds; skipped\n" );
}
next;
}
else {
emit( $silent,
"Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
$uridata{SAVE} = 0;
next;
}
}
continue {
# { This 'continue' block is executed for each iteration or 'next' }
#
# Decide whether to save what we have collected
#
if ( $uridata{SAVE} ) {
if ( addURI( $dbh, \%uridata, $keymap ) ) {
emit( $silent, "$uridata{URI} added to the database\n" );
$LOG->info("$uridata{TYPE} ",$uridata{URI},' added to the database');
#
# Get the id the database allocated for the row we added
#
$uridata{URI_ID}
= $dbh->last_insert_id( undef, undef, 'urls', undef );
#
# Add any enclosures we found
#
if ( defined( $uridata{ENCLOSURE_COUNT} )
&& $uridata{ENCLOSURE_COUNT} > 0 )
{
if ( addEnclosures( $dbh, \%uridata ) ) {
emit( $silent, $uridata{ENCLOSURE_COUNT},
" enclosures for $uridata{URI} added to the database\n"
);
}
}
}
else {
emit( $silent,
"$uridata{URI} was not added to the database\n" );
$LOG->info("$uridata{TYPE} ",$uridata{URI},' not added to the database');
}
}
#
# Dump what we have if requested
#
print Dumper( \%uridata ), "\n" if ( $DEBUG > 1 );
emit( $silent, '-' x 80, "\n" );
}
#
# Return any further urls we found
#
return @found_urls;
}
#=== FUNCTION ================================================================
# NAME: searchTitle
# PURPOSE: Search the database for a feed with a given title
# PARAMETERS: $dbh database handle
# $target search target
# RETURNS: A list of titles
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub searchTitle {
my ($dbh, $target) = @_;
my ( $sql1, $sql2, $sth, $rv, $h );
my ( $count, @result );
$sql1 = q{
SELECT count(*) AS count
FROM urls
WHERE urltype = 'Feed'
AND title REGEXP ?
};
$sql2 = q{
SELECT title
FROM urls
WHERE urltype = 'Feed'
AND title REGEXP ?
ORDER BY title
};
#
# Count the number of matches
#
$sth = $dbh->prepare($sql1);
$rv = $sth->execute($search_target);
if ( $dbh->err ) {
warn $dbh->errstr;
return;
}
#
# Proceed if we have 1 or more matches
#
if ( $h = $sth->fetchrow_hashref ) {
$count = $h->{count};
$sth->finish;
if ( $count >= 1 ) {
$sth = $dbh->prepare($sql2);
$rv = $sth->execute($search_target);
if ( $dbh->err ) {
warn $dbh->errstr;
return;
}
while ( $h = $sth->fetchrow_hashref ) {
push(@result, $h->{title});
}
}
return @result;
}
else {
return;
}
}
#=== FUNCTION ================================================================
# NAME: scanDB
# PURPOSE: To scan the URLs in the database and update the stored data
# PARAMETERS: $dbh database handle
# $keymap hashref containing a map of key names to
# database field names
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub scanDB {
my ($dbh, $keymap) = @_;
my ( $sql1, $sth1, $rv1, $h1 );
my ( $aref, @urls, $DT, $stream, $feed );
my ( %uridata, $urichanges, $enc_changes );
#
# Query to return all feed URLs
#
$sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};
$sth1 = $dbh->prepare($sql1);
$rv1 = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
#
# Collect everything as an arrayref pointing to a bunch of arrayrefs
# containing the column details requested
#
$aref = $sth1->fetchall_arrayref;
#
# Extract just the URL strings
#
@urls = map { $_->[1] } @{$aref};
#
# Now process these URLs from the database one at a time
#
# $sql1 = q{SELECT * FROM urls WHERE url = ?};
# $sth1 = $dbh->prepare($sql1);
my $count = 0;
foreach my $url (@urls) {
%uridata = ();
$count++;
scanFeed($dbh,$url,\%uridata);
# #
# # Record the scan for this URL
# #
# $DT = normaliseDT( DateTime->now() );
# $uridata{SCANNED_ON} = $DT;
#
# #
# # Default to OK
# #
# $uridata{SCAN_OK} = 1;
#
# emit( $silent, "Scanning '$url'\n" );
# $rv1 = $sth1->execute($url);
# if ( $dbh->err ) {
# warn $dbh->errstr;
# }
# $h1 = $sth1->fetchrow_hashref;
# emit( $silent, $h1->{title}, "\n" );
#
# #
# # The URL should be valid already from the point at which it was added
# # to the database, but if we do this we get a canonical version (it
# # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
# # it. It also sets $uridata{URI}, which is useful.)
# #
# my $uri = validateURI( $url, \%uridata );
#
# #
# # Check the hostname resolves in DNS
# #
# if ( checkDNS( $uri, \%uridata ) ) {
# $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
# emit( $silent, "DNS: $uridata{DNS}\n" );
# }
# else {
# emit( $silent, "$uri has no DNS entry\n" );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Check the server is available
# #
# if ( serverUp( $uri, \%uridata ) ) {
# emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
# }
# else {
# emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Look for the HTTP content type.
# #
# if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
# emit( $silent, "HTTP request OK\n" );
# }
# else {
# emit( $silent, "HTTP request failed\n" );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Note: not doing the robots.txt check since it was done at load time
# #
#
# #
# # We know we have a feed, so go get it
# #
# $stream = getFeed( $uridata{URI} );
# if ($stream) {
# #
# # Parse the feed. The routine generates its own error messages
# #
# $feed = parseFeed( $uridata{URI}, $stream );
# unless ( $feed ) {
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Save the important feed components in the %uridata hash
# #
# print Dumper($feed), "\n" if ( $DEBUG > 2 );
# storeFeed( $feed, \%uridata );
# }
# else {
# emit( $silent, "Problem collecting feed" );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
}
continue {
# { This 'continue' block is executed for each iteration or 'next' }
#
# If the scan went OK then perform a comparison between the new feed
# data and that which is stored
#
if ( $uridata{SCAN_OK} ) {
$urichanges = updateURI( $dbh, \%uridata, $keymap );
$LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
$urichanges )
if $urichanges > 0;
# TODO Update enclosures
}
else {
emit( $silent, "$uridata{URI} was not scanned successfully\n" );
$LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
}
#
# Finished scanning this feed
#
emit( $silent, '-' x 80, "\n" );
#
# NOTE: Temporarily stop after the first N feeds
#
last if $count == 2;
}
return;
}
#=== FUNCTION ================================================================
# NAME: scanFeed
# PURPOSE: Performs a scan on a single feed
# PARAMETERS: $dbh database handle
# $url feed URL to look up
# $uridata hashref to the hash of data collected from the
# feed
# RETURNS: True (1) if all the steps worked, otherwise false (0)
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub scanFeed {
my ( $dbh, $url, $uridata ) = @_;
my ( $sql, $sth, $rv, $h );
my ( $DT, $stream, $feed );
my ( $urichanges, $enc_changes );
$sql = q{SELECT * FROM urls WHERE url = ?};
$sth = $dbh->prepare($sql);
#
# Record the scan for this URL
#
$DT = normaliseDT( DateTime->now() );
$uridata->{SCANNED_ON} = $DT;
#
# Default to OK
#
$uridata->{SCAN_OK} = 1;
emit( $silent, "Scanning '$url'\n" );
$rv = $sth->execute($url);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h = $sth->fetchrow_hashref;
emit( $silent, $h->{title}, "\n" );
#
# The URL should be valid already from the point at which it was added
# to the database, but if we do this we get a canonical version (it
# sets $uridata->{SAVE} which makes no sense here, but we'll just ignore
# it. It also sets $uridata->{URI}, which is useful.)
#
my $uri = validateURI( $url, $uridata );
#
# Check the hostname resolves in DNS
#
if ( checkDNS( $uri, $uridata ) ) {
$uridata->{DNS} = join( ", ", @{ $uridata->{DNS} } );
emit( $silent, "DNS: $uridata->{DNS}\n" );
}
else {
emit( $silent, "$uri has no DNS entry\n" );
$uridata->{SCAN_OK} = 0;
return 0;
}
#
# Check the server is available
#
if ( serverUp( $uri, $uridata ) ) {
emit( $silent, sprintf( "Host: %s is up\n", $uridata->{HOST} ) );
}
else {
emit( $silent, sprintf( "Host: %s is down\n", $uridata->{HOST} ) );
$uridata->{SCAN_OK} = 0;
return 0;
}
#
# Look for the HTTP content type.
#
if ( checkContentType( $uri, $uridata, \%headers, $LOG ) ) {
emit( $silent, "HTTP request OK\n" );
}
else {
emit( $silent, "HTTP request failed\n" );
$uridata->{SCAN_OK} = 0;
return 0;
}
#
# Note: not doing the robots.txt check since it was done at load time
#
#
# We know we have a feed, so go get it
#
$stream = getFeed( $uridata->{URI} );
if ($stream) {
#
# Parse the feed. The routine generates its own error messages
#
$feed = parseFeed( $uridata->{URI}, $stream );
unless ( $feed ) {
$uridata->{SCAN_OK} = 0;
return 0;
}
#
# Save the important feed components in the %uridata hash
#
print Dumper($feed), "\n" if ( $DEBUG > 2 );
storeFeed( $feed, $uridata );
}
else {
emit( $silent, "Problem collecting feed" );
$uridata->{SCAN_OK} = 0;
return 0;
}
return 1;
}
#=== FUNCTION ================================================================
# NAME: reportDB
# PURPOSE: To generate a printed report from the database
# PARAMETERS: $dbh database handle
# $fh output file handle
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: reportFeed generates a standardised report
# SEE ALSO: N/A
#===============================================================================
sub reportDB {
my ($dbh, $fh) = @_;
#
# Query to report the contents of the 'all_episodes' view with the latest
# associated episode
#
my $sql = q{
SELECT urls.id, ae.*,
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
FROM all_episodes ae
JOIN urls ON (ae.urls_id = urls.id)
GROUP by urls.id
HAVING ae.urls_urltype = 'Feed'
ORDER BY urls_title
};
my $sth1 = $dbh->prepare($sql);
my $rv = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
while ( my $h1 = $sth1->fetchrow_hashref() ) {
reportFeed( $h1, $fh );
}
$sth1->finish;
return;
}
#=== FUNCTION ================================================================
# NAME: findFeed
# PURPOSE: Find and return a single feed by title
# PARAMETERS: $dbh database handle
# $target search target
# RETURNS: Hashref indexing the feed
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub findFeed {
my ( $dbh, $target ) = @_;
my ( $sql, $sth, $rv, $h );
#
# Query to report the contents of the 'all_episodes' view with the latest
# associated episode
#
$sql = q{
SELECT urls.id, ae.*,
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
FROM all_episodes ae
JOIN urls ON (ae.urls_id = urls.id)
GROUP by urls.id
HAVING ae.urls_urltype = 'Feed'
AND ae.urls_title = ?
};
$sth = $dbh->prepare($sql);
$rv = $sth->execute($target);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h = $sth->fetchrow_hashref();
return $h;
}
#=== FUNCTION ================================================================
# NAME: reportFeed
# PURPOSE: Reports a single feed from the database
# PARAMETERS: $feed a hash from the database containing the
# details of a particular feed
# $fh output file handle
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub reportFeed {
my ($feed, $fh) = @_;
my $lwidth = 12;
#
# Hash for converting database keys to labels for the report and arrays
# for controlling the sequence.
#
state ( %keys, @seq1, @seq2 );
%keys = (
'id' => 'DB Primary key',
'latest_ep' => 'Latest episode date',
'ep_author' => 'Author',
'ep_byte_length' => 'Byte length',
'ep_category' => 'Category',
'ep_enclosure' => 'URL',
'ep_ep_id' => 'ID',
'ep_id' => 'Key',
'ep_issued' => 'Issued on',
'ep_last_update' => 'Last updated',
'ep_link' => 'Permalink',
'ep_mime_type' => 'MIME type',
'ep_modified' => 'Modified on',
'ep_source' => 'Source',
'ep_title' => 'Title',
'ep_urls_id' => 'URL key',
'urls_author' => 'Author',
'urls_content_type' => 'Content type',
'urls_copyright' => 'Copyright',
'urls_description' => 'Description',
'urls_dns' => 'DNS',
'urls_feedformat' => 'Feed format',
'urls_generator' => 'Generator',
'urls_host_up' => 'Host up',
'urls_http_status' => 'HTTP status',
'urls_id' => 'Key',
'urls_image' => 'Image',
'urls_language' => 'Language',
'urls_last_update' => 'Last updated',
'urls_link' => 'Link',
'urls_modified' => 'Modified on',
'urls_title' => 'Title',
'urls_url' => 'Feed URL',
'urls_urltype' => 'URL type',
'urls_parent_id' => 'Parent ID',
'urls_child_count' => 'Child count',
);
@seq1 = (
'urls_title',
'urls_url',
'urls_feedformat',
'urls_link',
'urls_author',
'urls_content_type',
'urls_copyright',
'urls_description',
'urls_dns',
'urls_generator',
'urls_host_up',
'urls_http_status',
'urls_image',
'urls_language',
'urls_last_update',
'urls_modified',
'urls_parent_id',
'urls_child_count',
);
@seq2 = (
'ep_title',
'ep_enclosure',
'ep_category',
'ep_link',
'ep_author',
'ep_byte_length',
'ep_ep_id',
'ep_mime_type',
'ep_issued',
'ep_last_update',
'ep_modified',
'ep_source',
);
if ($feed) {
print $fh "Channel:\n";
foreach my $key (@seq1) {
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
coalesce( $feed->{$key}, '--' );
}
print $fh "\nLatest episode:\n";
foreach my $key (@seq2) {
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
coalesce( $feed->{$key}, '--' );
}
print $fh "\n";
}
return;
}
#=== FUNCTION ================================================================
# NAME: collectData
# PURPOSE: Collects data from the database for generating a report
# PARAMETERS: $dbh database handle
# RETURNS: An array of hashrefs from the query
# DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
# table and the 'all_episodes' view, showing the date of the
# latest episode. The result is an array of rows, each
# represented as a hash, all sorted by the feed title.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub collectData {
my ($dbh) = @_;
#
# Query to report the contents of the 'urls' table with the details of the
# latest episode
#
my $sql = q{
SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
FROM all_episodes ae
JOIN urls ON (ae.urls_id = urls.id)
GROUP by urls.id
HAVING ae.urls_urltype = 'Feed'
ORDER BY ae.urls_title
};
my $sth1 = $dbh->prepare($sql);
my $rv = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
#
# Return everything as an arrayref of hashrefs
#
my $tbl_ary_ref = $sth1->fetchall_arrayref({});
$sth1->finish;
return $tbl_ary_ref;
}
#=== FUNCTION ================================================================
# NAME: countRows
# PURPOSE: To count the rows in a table
# PARAMETERS: $dbh database handle
# $sql SQL expression to use
# RETURNS: Number of rows found (note that zero is returned as 0E0)
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub countRows {
my ( $dbh, $sql ) = @_;
my $sth1 = $dbh->prepare($sql);
my $rv = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
my $h1 = $sth1->fetch;
$sth1->finish;
return @$h1[0];
}
#=== FUNCTION ================================================================
# NAME: dbSearch
# PURPOSE: To perform a simple search in the database
# PARAMETERS: $dbh database handle
# $sql SQL expression to use (expected to be
# a SELECT)
# @args arguments for the 'execute'
# RETURNS: True (1) if the row exists, otherwise false (0).
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
# SQL without incurring the overhead of repeated 'prepare'
# calls. Only the first row is fetched (we expect there to be
# only one) and the success or failure is determined by its
# existence.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub dbSearch {
my ( $dbh, $sql, @args ) = @_;
my $sth1 = $dbh->prepare_cached($sql);
my $rv = $sth1->execute(@args);
if ( $dbh->err ) {
warn $dbh->errstr;
}
my $h1 = $sth1->fetchrow_hashref();
$sth1->finish;
return defined($h1);
}
#=== FUNCTION ================================================================
# NAME: execSQL
# PURPOSE: To perform a non-SELECT query
# PARAMETERS: $dbh database handle
# $sql SQL expression to use
# @args arguments for the 'execute'
# RETURNS: True (1) if the query succeeded, otherwise false (0).
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
# SQL without incurring the overhead of repeated 'prepare'
# calls.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub execSQL {
my ( $dbh, $sql, @args ) = @_;
my $sth1 = $dbh->prepare_cached($sql);
my $rv = $sth1->execute(@args);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$sth1->finish;
return $rv;
}
#=== FUNCTION ================================================================
# NAME: validateURI
# PURPOSE: Checks a URL for validity
# PARAMETERS: $rec the URL handed to the script
# $uridata hashref containing data for this URI
# RETURNS: A URI object if valid otherwise undef
# DESCRIPTION: The URL string is validated with the URI module. A canonical
# string version is stored in the hash referenced by $uridata
# hash if valid otherwise the URL is marked as invalid.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub validateURI {
my ( $rec, $uridata ) = @_;
#
# Check that we have a valid URL. We don't save them if they are
# invalid.
#
my $uri = URI->new( $rec, 'http' );
if ( $uri->scheme ) {
emit( $silent, "URI $uri is valid\n" );
$uridata->{URI} = $uri->canonical->as_string;
return $uri;
}
else {
emit( $silent, "URI $uri is not valid\n" );
$uridata->{SAVE} = 0;
return;
}
}
#=== FUNCTION ================================================================
# NAME: checkDNS
# PURPOSE: Looks up a host DNS entry
# PARAMETERS: $uri URI object
# $uridata hashref containing data for this URI
# RETURNS: True (1) if the DNS query was successful, otherwise false (0)
# DESCRIPTION: The host name is extracted from the URI (and stored). The
# hostname is searched for in the DNS and if successful, an
# array of addresses from the 'A' records is built. This is
# sorted and stored in the hash referenced by $uridata.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkDNS {
my ( $uri, $uridata ) = @_;
my @adds;
my $hostname = $uri->host;
$uridata->{HOST} = $hostname;
my $res = Net::DNS::Resolver->new;
my $query = $res->search($hostname);
if ($query) {
foreach my $rr ( $query->answer ) {
next unless $rr->type eq "A";
push( @adds, $rr->address );
}
@adds = sort(@adds);
$uridata->{DNS} = \@adds;
return 1;
}
else {
warn "Query failed: ", $res->errorstring, "\n";
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: serverUp
# PURPOSE: Checks that a given host is responding
# PARAMETERS: $uri URI object
# $uridata hashref containing data for this URI
# RETURNS: True (1) if the host responds to a TCP connection, false (0)
# otherwise
# DESCRIPTION: Given an URL parses out the hostname and the port (defaulting
# to the appropriate default for the scheme, such as 80 for
# http). Attempts to connect to this host and port. If the
# connect fails then details are written to the data structure
# pointed to by $uridata.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub serverUp {
my ( $uri, $uridata ) = @_;
my ( $hostname, $port, $socket );
$hostname = $uri->host();
$port = $uri->port();
$uridata->{PORT} = $port;
$socket = IO::Socket::INET->new(
Proto => "tcp",
PeerAddr => $hostname,
PeerPort => $port,
Reuse => 1,
Timeout => 10
);
if ($socket) {
$socket->close;
$uridata->{HOSTUP} = 1;
return 1;
}
else {
$uridata->{HOSTUP} = 0;
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: robotRulesOK
# PURPOSE: To check the intended URL against the site's robots.txt rules
# PARAMETERS: $uri URI object
# $rules WWW::RobotRules object
# $uridata hashref containing data for this URI
# RETURNS: True (1) if the GET of the robots.txt file succeeded and the
# rules allow the URI object to be fetched, false (0) otherwise.
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub robotRulesOK {
my ( $uri, $rules, $uridata ) = @_;
#
# Build the URL of robots.txt
#
my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';
my $robots_txt;
#
# Use LWP::UserAgent to get the feed and handle errors
#
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->agent("$PROG/$VERSION");
my $response = $ua->get($roboturl);
if ( $response->is_success ) {
$uridata->{ROBOTS} = 'Found';
$robots_txt = $response->decoded_content;
$rules->parse( $roboturl, $robots_txt );
return $rules->allowed("$uri");
}
else {
$uridata->{ROBOTS} = $response->status_line;
warn "Failed to get $roboturl\n";
warn $response->status_line . "\n";
return; # undef
}
}
#=== FUNCTION ================================================================
# NAME: checkContentType
# PURPOSE: Check the content_type of the url
# PARAMETERS: $uri URI object
# $uridata hashref containing data for this URI
# $headers hashref containing query headers
# $children arrayref to return any 'child' feeds we might
# find; they are appended to the array
# $log Log::Handler object
# RETURNS: True (1) if all was well, otherwise false (0)
# DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
# for the URI object using the LWP::UserAgent head method. Then
# we examine the 'content-type' header looking for the string
# 'xml' or 'html' in it. The former denotes a feed, and the
# latter a normal HTML page.
2022-11-20 22:49:57 +00:00
# We use Feed::Find if the page is HTML in order to find any
2022-11-19 21:27:51 +00:00
# "child" feeds which we really want to process if we find any.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkContentType {
my ( $uri, $uridata, $headers, $children, $log ) = @_;
my @feeds;
$uridata->{HTTP_STATUS} = 'Unknown';
my $browser = LWP::UserAgent->new or return 0;
my $response = $browser->head( $uri->as_string, %{$headers} )
or return 0;
$uridata->{HTTP_STATUS} = $response->status_line;
if ( $response->is_success ) {
$uridata->{HTTP_CONTENT_TYPE} = $response->header('content-type');
#
# Decode the content-type we received
#
if ( $uridata->{HTTP_CONTENT_TYPE} =~ m|xml|i ) {
$uridata->{TYPE} = 'Feed';
}
elsif ( $uridata->{HTTP_CONTENT_TYPE} =~ m|html|i ) {
$uridata->{TYPE} = 'HTML';
}
else {
$uridata->{TYPE} = 'Unknown';
}
#
# Sometimes perfectly valid feeds misleadingly return text/html so we
# try to spot such cases here and adjust the internal type accordingly
#
if ( $uridata->{TYPE} eq 'HTML' ) {
@feeds = Feed::Find->find( $uri->as_string );
if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
emit( $silent, "Feed found with wrong content-type\n" );
$uridata->{TYPE} = 'Feed';
}
else {
#
# This HTML and we found 'child' feeds of some kind
#
emit( $silent, "Found ", scalar(@feeds),
" feeds within this HTML page\n" );
print Dumper( \@feeds ), "\n" if $DEBUG > 0;
push(@{$children}, @feeds);
}
}
$log->info( "URL content classified as: ", $uridata->{TYPE} );
emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
return 1;
}
else {
$uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: getFeed
# PURPOSE: Download the contents of a feed URL
# PARAMETERS: $feed_url URL of the feed to download
# RETURNS: String representation of the feed contents or undef if the
# download failed.
# DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
# need not be). If successful the contents are decoded and
# returned, otherwise undef is returned.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub getFeed {
my ($feed_url) = @_;
#
# Use LWP::UserAgent to get the feed and handle errors
#
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->agent("$PROG/$VERSION");
my $response = $ua->get($feed_url);
my $feed_content;
if ( $response->is_success ) {
$feed_content = $response->decoded_content;
return $feed_content;
}
else {
warn "Failed to get $feed_url\n";
warn $response->status_line, "\n";
return; # undef
}
}
#=== FUNCTION ================================================================
# NAME: parseFeed
# PURPOSE: Parse a podcast feed that has already been downloaded
# PARAMETERS: $feed_url URL of the feed previously downloaded
# $feed_content String containing the content of the feed, for
# parsing
# RETURNS: An XML::Feed object containing the parsed feed or undef if the
# parse failed
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub parseFeed {
my ( $feed_url, $feed_content ) = @_;
my $feed = XML::Feed->parse( \$feed_content );
unless ($feed) {
#
# Something went wrong. Abort this feed
#
warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
return; # undef
}
return $feed;
}
#=== FUNCTION ================================================================
# NAME: storeFeed
# PURPOSE: Stores feed attributes in a hash
# PARAMETERS: $feed XML::Feed object returned from parsing the
# feed
# $uridata hashref containing data for this URI
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub storeFeed {
my ($feed, $uridata) = @_;
( $uridata->{TITLE} = $feed->title ) =~ s/(^\s+|\s+$)//g;
( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;
$uridata->{LINK} = $feed->link;
$uridata->{DESCRIPTION} = clean_string( $feed->tagline );
$uridata->{AUTHOR} = $feed->author;
$uridata->{MODIFIED} = normaliseDT( $feed->modified );
$uridata->{IMAGE} = flattenArray( $feed->image );
$uridata->{COPYRIGHT} = $feed->copyright;
$uridata->{GENERATOR} = $feed->generator;
$uridata->{LANGUAGE} = $feed->language;
#print coalesce($feed->webMaster,'No webMaster'),"\n";
$uridata->{ENCLOSURES} = extractEnclosures($feed);
$uridata->{ENCLOSURE_COUNT}
= scalar( @{ $uridata->{ENCLOSURES} } );
}
#=== FUNCTION ================================================================
# NAME: checkCopyright
# PURPOSE: Ask the user to check the copyright of a feed
# PARAMETERS:
# RETURNS:
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkCopyright {
my ($uridata) = @_;
my $decision;
$LOG->info('Checking copyright of feed');
#
# Prompt the user, failing gracefully if there's
# a problem. If the user types 'Y' or 'y' we accept the
# feed, otherwise we do not (thus a blank return = 'no').
#
try {
printf STDERR
"Feed '%s' has the copyright string:\n%s\n",
$uridata->{TITLE},
coalesce( $uridata->{COPYRIGHT}, '' );
$decision = prompt(
-in => *STDIN,
-prompt => 'OK to add this feed?',
-style => 'bold red underlined',
-yes
);
}
catch {
warn "Problem processing copyright decision: $_";
$decision = 0;
};
#
# Take action on the decision (or default)
#
$uridata->{SAVE} = $decision;
if ($decision) {
emit( $silent, "Feed added\n" );
$LOG->info('Copyright OK');
return 1;
}
else {
emit( $silent, "Feed not added\n" );
$LOG->info('Copyright not OK');
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: parseRSS
# PURPOSE: Attempt to parse a feed as RSS
# PARAMETERS: $feed_url URL of the feed previously downloaded
# $feed_content String containing the content of the feed, for
# parsing
# RETURNS: An XML::RSS object containing the parsed feed or undef if the
# parse failed
# DESCRIPTION: ** Incomplete **
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub parseRSS {
my ( $feed_url, $feed_content ) = @_;
my $rss = XML::RSS->parse(\$feed_content);
unless ($rss) {
}
}
#=== FUNCTION ================================================================
# NAME: addURI
# PURPOSE: Adds the data for a URI to the database
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# $keymap hashref containing a map of key names to
# database field names
# RETURNS: True (1) if the insert succeeded, false (0) otherwise
# DESCRIPTION: The hash keys are defined as an array to make it easy to slice
# the hash and the SQL is defined internally using the size of
# the key array as a guide to the number of '?' placeholders.
# These are passed to execSQL to do the work.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub addURI {
my ( $dbh, $uridata, $keymap ) = @_;
my @keys = (
'URI', 'DNS',
'HOSTUP', 'HTTP_STATUS',
'HTTP_CONTENT_TYPE', 'TYPE',
'FORMAT', 'TITLE',
'DESCRIPTION', 'AUTHOR',
'MODIFIED', 'LINK',
'IMAGE', 'COPYRIGHT',
'GENERATOR', 'LANGUAGE',
);
my $sql
= 'INSERT INTO urls ('
. join( ",", @{$keymap}{@keys} ) . ') '
. 'VALUES('
. join( ',', ('?') x scalar(@keys) ) . ')';
print "addURI query: $sql\n" if $DEBUG > 0;
return execSQL( $dbh, $sql, @{$uridata}{@keys} );
}
#=== FUNCTION ================================================================
# NAME: extractEnclosures
# PURPOSE: Builds an array of hashes containing enclosure data from an
# XML::Feed object
# PARAMETERS: $feed XML::Feed object
# RETURNS: A reference to the array of anonymous hashes built from the
# feed.
# DESCRIPTION: The XML::Feed object is expected to contain an array of
# entries. These are converted to hashes, references to which
# are stored in an array. The two DateTime components are
# converted to ISO8601 dates. If there is an enclosure then its
# elements are saved. Note that there could be multiple
# enclosures, but XML::Feed does not cater for them unless
# explicitly requested. We do not deal with such a case here.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub extractEnclosures {
my ($feed) = @_;
my @encs;
foreach my $entry ( $feed->entries ) {
my %ent;
$ent{title} = $entry->title;
$ent{base} = $entry->base;
$ent{link} = $entry->link;
$ent{category} = join( ", ", $entry->category );
$ent{author} = $entry->author;
$ent{id} = $entry->id;
$ent{issued} = normaliseDT( $entry->issued );
$ent{modified} = normaliseDT( $entry->modified );
#
# Work around a bug in XML::Feed where the source method is only
# available for Atom feeds. TODO report this.
#
if ( $entry->isa('XML::Feed::Entry::Format::Atom') ) {
$ent{source} = $entry->source;
}
my ($enclosure) = $entry->enclosure;
if ( defined($enclosure) ) {
$ent{url} = $enclosure->url;
$ent{type} = $enclosure->type;
$ent{length} = $enclosure->length;
}
push( @encs, \%ent );
}
return \@encs;
}
#=== FUNCTION ================================================================
# NAME: addEnclosures
# PURPOSE: Adds episodes extracted from a feed into the database
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# including an arrayref of hashrefs of episode
# data
# RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
# DESCRIPTION: The SQL is defined internally and the hash keys are defined as
# an array to make it easy to slice the hash. The enclosures (or
# more correctly, feed items) are present in the hash as an
# array of anonymous hashes. These are processed one at a time
# and inserted into the database. A count of the number of
# successful inserts is kept. This is compared with the number
# of enclosures to determine the boolean value to return.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub addEnclosures {
my ( $dbh, $uridata ) = @_;
my $sql = q{INSERT INTO episodes
(urls_id, link, enclosure, title, author, category, source, ep_id,
issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};
my @keys = (
'link', 'url', 'title', 'author', 'category', 'source',
'id', 'issued', 'modified', 'length', 'type'
);
my $successes = 0;
foreach my $enc ( @{ $uridata->{ENCLOSURES} } ) {
if ( execSQL( $dbh, $sql, $uridata->{URI_ID}, @{$enc}{@keys} ) ) {
$successes++;
}
else {
emit( $silent, "Failed to add episode $enc->{url}\n" );
}
}
return ( $successes == $uridata->{ENCLOSURE_COUNT} );
}
#=== FUNCTION ================================================================
# NAME: updateURI
# PURPOSE: Compare the data in a hash with that in the database
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# including an arrayref of hashrefs of episode
# data
# $keymap hashref containing a map of key names to
# database field names
# RETURNS: The number of changes made
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub updateURI {
my ( $dbh, $uridata, $keymap ) = @_;
my ( $sql1, $sth1, $rv1, $h1 );
my ( %fieldvals, %where );
my ( $diffs, $updates ) = ( 0, 0 );
my @keys = (
'URI', 'DNS',
'HOSTUP', 'HTTP_STATUS',
'HTTP_CONTENT_TYPE', 'TYPE',
'FORMAT', 'TITLE',
'DESCRIPTION', 'AUTHOR',
'MODIFIED', 'LINK',
'IMAGE', 'COPYRIGHT',
'GENERATOR', 'LANGUAGE',
);
#
# Get the row from the urls table
#
$sql1 = q{SELECT * FROM urls WHERE url = ?};
$sth1 = $dbh->prepare($sql1);
$rv1 = $sth1->execute( $uridata->{URI} );
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h1 = $sth1->fetchrow_hashref;
for my $key (@keys) {
unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
$diffs++;
$fieldvals{$key} = $uridata->{$key};
#
# Temporary report
#
print "Difference: ($key)\n";
print " Feed: ", coalesce($uridata->{$key},''), "\n";
print " Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n";
}
}
if ($diffs > 0) {
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
#
# Build the update statement
#
$where{id} = $h1->{id};
my ( $stmt, @bindvals )
= $sql->update( 'urls', \%fieldvals, \%where );
# Temporary
print "Statement: $stmt\n";
print "Bind values: ", join(",",@bindvals),"\n";
#
# Perform the updates
#
$sth1 = $dbh->prepare($stmt);
$sth1->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $h1->{url}\n", $dbh->errstr;
}
else {
emit($silent, "Updated $h1->{url}\n");
$updates++;
}
}
return $updates;
}
#=== FUNCTION ================================================================
# NAME: updateEnclosures
# PURPOSE: Update the enclosures stored with a feed URL
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# including an arrayref of hashrefs of episode
# data
# RETURNS: The number of changes made
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub updateEnclosures {
my ( $dbh, $uridata ) = @_;
my ( $sql1, $sth1, $rv1, $h1 );
my ( %fieldvals, %where );
my ( $diffs, $updates ) = ( 0, 0 );
#
# Get the enclosures associated with this feed
#
$sql1 = q{
SELECT * FROM episodes
WHERE urls_id = (SELECT id FROM urls WHERE title = ?)
};
$sth1 = $dbh->prepare($sql1);
$rv1 = $sth1->execute( $uridata->{URI} );
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h1 = $sth1->fetchrow_hashref;
}
#=== FUNCTION ================================================================
# NAME: equal
# PURPOSE: Compare two strings even if undefined
# PARAMETERS: $s1 The first string
# $s2 The second string
# RETURNS: True if both strings are undefined, false if one isn't
# defined, otherwise the result of comparing them.
# DESCRIPTION: Works on the principle that two undefined strings are equal,
# a defined and an undefined string are not, and otherwise they
# are equal if they are equal!
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub equal {
my ( $s1, $s2 ) = @_;
return 1 if ( !defined($s1) && !defined($s2) );
return 0 if ( !defined($s1) || !defined($s2) );
return ( $s1 eq $s2 );
}
#=== FUNCTION ================================================================
# NAME: clean_string
# PURPOSE: Clean a string of non-printables, newlines, multiple spaces
# PARAMETERS: $str The string to process
# RETURNS: The processed string
# DESCRIPTION: Removes leading and trailing spaces. Removes all non-printable
# characters. Removes all CR/LF sequences. Removes multiple
# spaces.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub clean_string {
my ($str) = @_;
$str =~ s/(^\s+|\s+$)//g;
$str =~ tr/[[:graph:]]//c;
$str =~ tr/\x0A\x0D/ /;
$str =~ tr/ / /s;
return $str;
}
#=== FUNCTION ================================================================
# NAME: normaliseDT
# PURPOSE: Normalise an ISO8601 date for comparison, etc.
# PARAMETERS: $dt a DateTime object
# RETURNS: The DateTime object formatted as an ISO8601 string
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub normaliseDT {
my ($dt) = @_;
my $p = DateTime::Format::SQLite->new();
return (
defined($dt)
? $p->format_datetime(
DateTime::Format::ISO8601->parse_datetime($dt)
)
: undef
);
}
#=== FUNCTION ================================================================
# NAME: flattenArray
# PURPOSE: Turns an arrayref into a simple list in a string
# PARAMETERS: $item - the item that may be an arrayref
# RETURNS: The plain item if it's not an array otherwise the flattened
# version
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub flattenArray {
my ($item) = @_;
my $result;
if ( ref( $item ) eq 'ARRAY' ) {
$result = join(", ",@{$item});
}
else {
$result = $item;
}
return $item;
}
#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub coalesce {
foreach (@_) {
return $_ if defined($_);
}
return; # undef
}
#=== FUNCTION ================================================================
# NAME: emit
# PURPOSE: Print text on STDERR unless silent mode has been selected
# PARAMETERS: - Boolean indicating whether to be silent or not
# - list of arguments to 'print'
# RETURNS: Nothing
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
# a message to STDERR depending on a boolean. We need this to be
# able to make the script silent when the -silent option is
# selected
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub emit {
unless (shift) {
print STDERR @_;
}
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "manpage", "debug=i", "silent!",
"load=s", "delete=s", "scan!", "report:s",
"check!", "json:s", "opml:s", "config=s",
"out=s", "template:s",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
feedWatcher - watch a collection of podcast feeds
=head1 VERSION
This documentation refers to I<feedWatcher> version 0.0.15
=head1 USAGE
feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
[-[no]check] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
[-[no]silent] [-config=FILE] [-debug=N] [URL ...]
=head1 ARGUMENTS
Arguments are optional and may consist of an arbitrarily long list of URLs to
be processed and possibly added to the database by the script.
These URLs are prepended to any which may be provided through the
B<-load=FILE> option described below. The script makes sure the URL collection
contains no duplicates.
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message describing the usage of the program, and then exits.
The full documentation can be displayed with the command:
perldoc -oman feedWatcher
Alternatively, use the option B<-manpage>
=item B<-manpage>
Prints the entire documentation for the script. Note that this mode uses a
simpler type of formatter than I<perldoc>.
=item B<-load=FILE>
Defines a file from which new URLs are to be added to the database. These URLs
are checked in various ways before adding to the database. If arguments are
provided when the script is invoked these URLs are appended to the argument list.
=item B<-delete=FILE>
Defines a file from which a list of URLs is to be read which are to be
deleted from the database.
Note that it is possible (though inadvisable) to both add and delete an URL in
the same run of the script. The URL will first be added (from the load file or
argument list) then deleted. This is a pointless exercise which wastes
bandwidth, so don't do it!
=item B<-[no]scan>
This option (B<-scan>) causes the URLs stored in the database to be scanned
and updated. The negated form, which is also the default behaviour of the
script, (B<-noscan>) omits the scan.
NOTE: This function is not implemented yet.
=item B<-out=FILE>
This option defines an output file to receive any output. If the option is
omitted the data is written to STDOUT, allowing it to be redirected if
required.
=item B<-[no]check>
This option (B<-check>) causes each feed which is being to be checked against
the script user to check that it's OK to add it. The script reports the
I<copyright> field and requests a I<y> or I<n> response.
=item B<-[no]report>
This option (B<-report>) causes a report of the contents of the database to be
generated. The negated form, which is also the default behaviour of the
script, (B<-noreport>) omits the report.
NOTE: The report is currently very simple.
=item B<-json[=FILE]>
This option, which may be omitted, defines the location where the feed and
episode details are to be written. If omitted no JSON data is written.
If the option is given as B<-json=FILE> the data is written to the nominated
file.
If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
used.
=item B<-opml[=FILE]>
This option, which may be omitted, defines the location where the feed details
are to be written. If omitted no OPML data is written.
If the option is given as B<-opml=FILE> the data is written to the nominated
file.
If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
used.
=item B<-template=FILE>
This option defines the template used to generate a form of the feed data. The
template is written using the B<Template> toolkit language.
If the file name is omitted then the script uses the file B<feedWatcher.tpl>
in the same directory as the script. If this file does not exist then the
script will exit with an error message.
For convenience B<feedWatcher.tpl> can be made a soft link which points to the
file which is the current default. This allows the development of versions
without changing the usual way this script is run.
=item B<-config=FILE>
This option allows an alternative configuration file to be used. This file
defines the location of the database, its name and the username and
password to be used to access it. This feature permits a test database to be
used, or for two more sets of feeds to be processed.
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
If the option is omitted the default file is used: B<feedWatcher.cfg>
=item B<-debug=N>
This option selects the debug level, resulting in a lot of output.
0 (the default) No debug output
1 Dumps the list of feeds found in an HTML download.
Also shows the SQL query which will result in the insertion of a new
row into the database table I<urls>.
2 Dumps the collected data which is destined to be written to the database
3 Dumps the contents of feeds during analysis
=back
=head1 DESCRIPTION
A full description of the application and its features.
May include numerous subsections (i.e. =head2, =head3, etc.)
=head1 DIAGNOSTICS
A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.
=head1 CONFIGURATION AND ENVIRONMENT
A configuration file is used by the script. By default the name is
B<feedWatcher.cfg> (though, what is searched for is derived from the name of
the script). An alternative configuration file can be provided, specified
through the B<-config=FILE> option.
The file conforms to the format managed by the B<Config::General> module:
<database>
type = SQLite
file = feedWatcher.db
user =
password =
</database>
The B<type> value should be I<SQLite>, though it might be possible to use
other database types.
The B<file> value is the name of the database, which can include the
directory path.
The B<user> and B<password> values are not currently required, but if
authentication to the database was required it would be implemented this way.
=head1 DEPENDENCIES
Config::General
DBI
Data::Dumper
DateTime::Format::ISO8601
DateTime::Format::SQLite
Feed::Find
Getopt::Long
HTML::Entities
IO::Prompter
IO::Socket
JSON
LWP::UserAgent
List::MoreUtils qw{uniq}
Log::Handler
Net::DNS
Pod::Usage
SQL::Abstract
Template
Template::Filters
Try::Tiny
URI
WWW::RobotRules
XML::Feed
XML::RSS::Parser
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com) 2020
=head1 LICENCE AND COPYRIGHT
Copyright (c) Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
This program is free software. You can redistribute it and/or modify it under
the same terms as perl itself.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker