2022-11-19 21:27:51 +00:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
#===============================================================================
|
|
|
|
#
|
|
|
|
# FILE: feedWatcher
|
|
|
|
#
|
|
|
|
# USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
|
|
|
|
# [-report[=title]] [-[no]check] [-out=FILE] [-json[=FILE]]
|
|
|
|
# [-opml[=FILE]] [-template[=FILE]] [-[no]silent] [-debug=N]
|
|
|
|
# [URL ...]
|
|
|
|
#
|
|
|
|
# DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
|
|
|
|
# podcasts. Ken's script was inspired by Dann Washko's site at
|
|
|
|
# http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
|
|
|
|
# August 2012 where it was used to generate handouts.
|
|
|
|
#
|
|
|
|
# The script has not been developed since 2014, but is now in
|
|
|
|
# use in 2020 helping to prepare podcast information for
|
|
|
|
# a FOSDEM visit under the heading of "Free Culture Podcasts".
|
|
|
|
# See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
|
|
|
|
# (created with Pandoc) in the same directory as this script for
|
|
|
|
# details of what has been done to develop the original design.
|
|
|
|
#
|
|
|
|
# OPTIONS: ---
|
|
|
|
# REQUIREMENTS: ---
|
|
|
|
# BUGS: ---
|
|
|
|
# NOTES: ---
|
|
|
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
|
|
|
# VERSION: 0.0.15
|
|
|
|
# CREATED: 2013-12-25 12:40:33
|
|
|
|
# REVISION: 2022-11-18 22:27:35
|
|
|
|
#
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Released under the terms of the GNU Affero General Public License (AGPLv3)
|
|
|
|
# http://www.gnu.org/licenses/agpl-3.0.html
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
#
|
|
|
|
#===============================================================================
|
|
|
|
|
|
|
|
use 5.030;
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
use utf8;
|
|
|
|
use feature qw{ postderef say signatures state };
|
|
|
|
no warnings qw{ experimental::postderef experimental::signatures } ;
|
|
|
|
|
|
|
|
use Getopt::Long;
|
|
|
|
use Pod::Usage;
|
|
|
|
use Config::General;
|
|
|
|
use List::MoreUtils qw{uniq};
|
|
|
|
use Log::Handler;
|
|
|
|
|
|
|
|
use Try::Tiny;
|
|
|
|
|
|
|
|
use URI;
|
|
|
|
use Net::DNS;
|
|
|
|
use IO::Socket;
|
|
|
|
use LWP::UserAgent;
|
|
|
|
use WWW::RobotRules;
|
|
|
|
use XML::RSS::Parser;
|
|
|
|
use XML::Feed;
|
|
|
|
use Feed::Find;
|
|
|
|
|
|
|
|
use Template;
|
|
|
|
use Template::Filters;
|
|
|
|
Template::Filters->use_html_entities; # Use HTML::Entities in the template
|
|
|
|
|
|
|
|
use HTML::Entities;
|
|
|
|
|
|
|
|
use IO::Prompter;
|
|
|
|
|
|
|
|
use JSON;
|
|
|
|
|
|
|
|
use DBI;
|
|
|
|
use DateTime::Format::SQLite;
|
|
|
|
use DateTime::Format::ISO8601;
|
|
|
|
use SQL::Abstract;
|
|
|
|
|
|
|
|
use Data::Dumper;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Version number (manually incremented)
|
|
|
|
#
|
|
|
|
our $VERSION = '0.0.15';
|
|
|
|
|
|
|
|
#
|
|
|
|
# Script name
|
|
|
|
#
|
|
|
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Declarations
|
|
|
|
#
|
|
|
|
my ( @new_urls, @deletions );
|
|
|
|
my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );
|
|
|
|
my ( $search_target );
|
|
|
|
my ( $sth1, $h1, $rv );
|
|
|
|
|
|
|
|
my $feeds;
|
|
|
|
|
|
|
|
#
|
|
|
|
# File and directory paths
|
|
|
|
#
|
|
|
|
my $basedir = "$ENV{HOME}/HPR/feed_watcher";
|
|
|
|
my $tempdir = "$basedir/tmp";
|
|
|
|
my $configfile = "$basedir/$PROG.cfg";
|
|
|
|
my $logfile = "$basedir/${PROG}.log";
|
|
|
|
my $deftemplate = "$basedir/${PROG}.tpl";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Hash key map to database field names (chevron comments are to stop Perltidy
|
|
|
|
# messing with the layout)
|
|
|
|
#
|
|
|
|
#<<<
|
|
|
|
my %keymap = (
|
|
|
|
AUTHOR => 'author',
|
|
|
|
COPYRIGHT => 'copyright',
|
|
|
|
DESCRIPTION => 'description',
|
|
|
|
DNS => 'dns',
|
|
|
|
# ENCLOSURES => undef,
|
|
|
|
# ENCLOSURE_COUNT => undef,
|
|
|
|
FORMAT => 'feedformat',
|
|
|
|
GENERATOR => 'generator',
|
|
|
|
# HOST => undef,
|
|
|
|
HOSTUP => 'host_up',
|
|
|
|
HTTP_CONTENT_TYPE => 'content_type',
|
|
|
|
HTTP_STATUS => 'http_status',
|
|
|
|
IMAGE => 'image',
|
|
|
|
LANGUAGE => 'language',
|
|
|
|
LINK => 'link',
|
|
|
|
MODIFIED => 'modified',
|
|
|
|
# PORT => undef,
|
|
|
|
# ROBOTS => undef,
|
|
|
|
# SAVE => undef,
|
|
|
|
TITLE => 'title',
|
|
|
|
TYPE => 'urltype',
|
|
|
|
URI => 'url',
|
|
|
|
# URI_ID => undef,
|
|
|
|
);
|
|
|
|
#>>>
|
|
|
|
|
|
|
|
#
|
|
|
|
# Headers for LWP::UserAgent
|
|
|
|
#
|
|
|
|
my %headers = (
|
|
|
|
'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
|
|
|
|
. 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
|
|
|
|
'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
|
|
|
|
. 'image/pjpeg, image/png, */*',
|
|
|
|
'Accept-Charset' => 'iso-8859-1,*,utf-8',
|
|
|
|
'Accept-Language' => 'en-US',
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Enable Unicode mode
|
|
|
|
#
|
|
|
|
binmode STDOUT, ":encoding(UTF-8)";
|
|
|
|
binmode STDERR, ":encoding(UTF-8)";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Options and arguments
|
|
|
|
#
|
|
|
|
my %options;
|
|
|
|
Options( \%options );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Any arguments are taken to be URLs
|
|
|
|
#
|
|
|
|
@new_urls = @ARGV;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Default help
|
|
|
|
#
|
|
|
|
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 )
|
|
|
|
if ( $options{'help'} );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Detailed help
|
|
|
|
#
|
|
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1, -noperldoc => 0 )
|
|
|
|
if ( $options{'manpage'} );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Collect options
|
|
|
|
#
|
|
|
|
my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );
|
|
|
|
|
|
|
|
my $cfgfile
|
|
|
|
= ( defined( $options{config} ) ? $options{config} : $configfile );
|
|
|
|
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
|
|
|
|
my $loadfile = $options{'load'};
|
|
|
|
my $deletefile = $options{'delete'};
|
|
|
|
|
|
|
|
my $scan = ( defined( $options{scan} ) ? $options{scan} : 0 );
|
|
|
|
my $check = ( defined( $options{check} ) ? $options{check} : 0 );
|
|
|
|
|
|
|
|
my $outfile = $options{out};
|
|
|
|
my $report = $options{report};
|
|
|
|
my $json = $options{json};
|
|
|
|
my $opml = $options{opml};
|
|
|
|
my $template = $options{template};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the configuration file
|
|
|
|
#
|
|
|
|
die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the load file
|
|
|
|
#
|
|
|
|
if ($loadfile) {
|
|
|
|
die "File $loadfile does not exist\n" unless -e $loadfile;
|
|
|
|
die "File $loadfile is not readable\n" unless -r $loadfile;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the delete file
|
|
|
|
#
|
|
|
|
if ($deletefile) {
|
|
|
|
die "File $deletefile does not exist\n" unless -e $deletefile;
|
|
|
|
die "File $deletefile is not readable\n" unless -r $deletefile;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# We accept -report, meaning report everything or -report='title' to report
|
|
|
|
# just the feed with the given title.
|
|
|
|
#
|
|
|
|
if ( defined($report) ) {
|
|
|
|
if ($report =~ /^$/) {
|
|
|
|
$search_target = undef;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$search_target = $report;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#
|
|
|
|
# We accept -json or -json=filename. In the former case we make a default
|
|
|
|
# name, otherwise we use the one provided.
|
|
|
|
#
|
|
|
|
if ( defined($json) ) {
|
|
|
|
if ($json =~ /^$/) {
|
|
|
|
$json = "$PROG.json"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# We accept -opml or -opml=filename. In the former case we make a default
|
|
|
|
# name, otherwise we use the one provided.
|
|
|
|
#
|
|
|
|
if ( defined($opml) ) {
|
|
|
|
if ($opml =~ /^$/) {
|
|
|
|
$opml = "$PROG.opml"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Similarly we accept -template or -template=filename. In the former case we
|
|
|
|
# make a default name, otherwise we use the one provided.
|
|
|
|
#
|
|
|
|
if ( defined($template) ) {
|
|
|
|
if ($template =~ /^$/) {
|
|
|
|
$template = "$deftemplate";
|
|
|
|
}
|
|
|
|
|
|
|
|
die "Error: Unable to find template $template\n" unless -r $template;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Load configuration data
|
|
|
|
#
|
|
|
|
my $conf = new Config::General(
|
|
|
|
-ConfigFile => $cfgfile,
|
|
|
|
-InterPolateVars => 1,
|
|
|
|
-ExtendedAccess => 1
|
|
|
|
);
|
|
|
|
my %config = $conf->getall();
|
|
|
|
|
|
|
|
#
|
|
|
|
# Connect to the database
|
|
|
|
#
|
|
|
|
my $dbtype = $config{database}->{type};
|
|
|
|
my $dbfile = $config{database}->{file};
|
|
|
|
my $dbuser = $config{database}->{user};
|
|
|
|
my $dbpwd = $config{database}->{password};
|
|
|
|
my $dbh
|
|
|
|
= DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
|
|
|
|
{ AutoCommit => 1, sqlite_unicode => 1, } )
|
|
|
|
or die $DBI::errstr;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
|
|
|
|
# foreign key constraints to function)
|
|
|
|
#
|
|
|
|
$dbh->do('PRAGMA foreign_keys = ON');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check we have something to do
|
|
|
|
#
|
|
|
|
my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
|
|
|
|
my $work = (
|
|
|
|
scalar(@new_urls) > 0
|
|
|
|
|| defined($loadfile)
|
|
|
|
|| defined($deletefile)
|
|
|
|
|| ( defined($report)
|
|
|
|
|| defined($json)
|
|
|
|
|| defined($opml)
|
|
|
|
|| defined($template)
|
|
|
|
|| $scan && $rows > 0 )
|
|
|
|
);
|
|
|
|
|
|
|
|
die "Nothing to do!\n" unless $work;
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Set up logging keeping the default log layout except for the date
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
my $LOG = Log::Handler->new();
|
|
|
|
|
|
|
|
$LOG->add(
|
|
|
|
file => {
|
|
|
|
timeformat => "%Y/%m/%d %H:%M:%S",
|
|
|
|
filename => $logfile,
|
|
|
|
minlevel => 0,
|
|
|
|
maxlevel => 7,
|
|
|
|
}
|
|
|
|
);
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Open the output file (or STDOUT) - we may need the date to do it
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
my $outfh;
|
|
|
|
if ($outfile) {
|
|
|
|
open( $outfh, ">:encoding(UTF-8)", $outfile )
|
|
|
|
or warn "Unable to open $outfile for writing: $!";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
|
|
|
|
or warn "Unable to initialise for writing: $!";
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
# Set up a robot.txt rules parser
|
|
|
|
#
|
|
|
|
$rules = WWW::RobotRules->new($robot_name);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Slurp the load file into @new_urls if the file is provided
|
|
|
|
#
|
|
|
|
if ($loadfile) {
|
|
|
|
#
|
|
|
|
# Load the input file
|
|
|
|
#
|
|
|
|
open( my $in, '<:encoding(utf8)', $loadfile )
|
|
|
|
or die "$PROG : failed to open load file '$loadfile' : $!\n";
|
|
|
|
chomp( my @loaded = <$in> );
|
|
|
|
close($in)
|
|
|
|
or warn "$PROG : failed to close load file '$loadfile' : $!\n";
|
|
|
|
|
|
|
|
$LOG->info("Loaded URLs from $loadfile");
|
|
|
|
|
|
|
|
#
|
|
|
|
# Add the loaded URLs to the array
|
|
|
|
#
|
|
|
|
push( @new_urls, @loaded );
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Now, we either have URLs from the command line, or from the load file, so we
|
|
|
|
# process these.
|
|
|
|
#
|
|
|
|
#while (@new_urls) {
|
|
|
|
# TODO: Why was this a loop?
|
|
|
|
if (@new_urls) {
|
|
|
|
#
|
|
|
|
# Remove duplicates
|
|
|
|
#
|
|
|
|
@new_urls = uniq(@new_urls);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Remove any commented out lines
|
|
|
|
#
|
|
|
|
@new_urls = grep {!/^\s*#/} @new_urls;
|
|
|
|
|
|
|
|
$LOG->info( "Received ", scalar(@new_urls),
|
|
|
|
" URLs to add to the database" );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Load these URLs as appropriate, returning any more that we find by
|
|
|
|
# following HTML urls.
|
|
|
|
#
|
|
|
|
@new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Now process any URLs that came back. Since we are explicitly looking for
|
|
|
|
# feeds we can assume that's what we have so don't need to recurse again.
|
|
|
|
#
|
|
|
|
# TODO
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Process the delete file if there is one
|
|
|
|
#
|
|
|
|
if ($deletefile) {
|
|
|
|
#
|
|
|
|
# Load the delete file
|
|
|
|
#
|
|
|
|
open( my $del, '<:encoding(utf8)', $deletefile )
|
|
|
|
or die "$PROG : failed to open load file '$deletefile' : $!\n";
|
|
|
|
chomp( @deletions = <$del> );
|
|
|
|
close($del)
|
|
|
|
or warn "$PROG : failed to close load file '$deletefile' : $!\n";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Remove duplicates
|
|
|
|
#
|
|
|
|
@deletions = uniq(@deletions);
|
|
|
|
|
|
|
|
$LOG->info( "Deleting ", scalar(@deletions), " URLs from the database" );
|
|
|
|
|
|
|
|
#
|
|
|
|
# There are URLs to delete. Process them on by one.
|
|
|
|
#
|
|
|
|
if (@deletions) {
|
|
|
|
$sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
|
|
|
|
foreach my $rec (@deletions) {
|
|
|
|
$rv = $sth1->execute($rec);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
if ( $rv != 0 ) {
|
|
|
|
emit ( $silent, "Deleted $rec ($rv rows)\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit ( $silent, "Failed to delete $rec\n" );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Perform a database scan if requested
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($scan) {
|
|
|
|
$LOG->info( "Scan is not fully implemented yet" );
|
2022-11-20 22:49:57 +00:00
|
|
|
# Testing. Processes the first two feeds
|
2022-11-19 21:27:51 +00:00
|
|
|
scanDB($dbh, \%keymap);
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Report all or selected database contents if requested
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ( defined($report) ) {
|
|
|
|
if ( defined($search_target) ) {
|
|
|
|
#
|
|
|
|
# Reporting a specific title
|
|
|
|
#
|
|
|
|
my @matches = searchTitle( $dbh, $search_target );
|
|
|
|
if (@matches) {
|
|
|
|
#
|
|
|
|
# Too many matches!
|
|
|
|
#
|
|
|
|
if ( $#matches > 1 ) {
|
|
|
|
say "Multiple matches:\n";
|
|
|
|
say "- ", join( "\n- ", @matches );
|
|
|
|
say "\nTry again with a different title";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
|
|
|
# One match, report it
|
|
|
|
#
|
|
|
|
reportFeed( findFeed( $dbh, $matches[0] ), $outfh );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
say "Feed not found with target '$search_target'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
|
|
|
# Reporting everything
|
|
|
|
#
|
|
|
|
reportDB( $dbh, $outfh );
|
|
|
|
$LOG->info("Report generated");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Do any image repairs we need, but temporarily
|
|
|
|
#
|
|
|
|
if ($json || $opml || $template) {
|
|
|
|
$feeds = collectData($dbh);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Clean up the image references (until we have a proper fix)
|
|
|
|
#
|
|
|
|
for (my $i = 0; $i < scalar(@$feeds); $i++) {
|
|
|
|
my $f = $feeds->[$i];
|
|
|
|
if (defined($f->{urls_image}) && $f->{urls_image} =~ /^ARRAY\([^)]+\)/) {
|
|
|
|
$f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
|
|
|
|
$LOG->info( "Fixed image for $f->{urls_url}" );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Output all feeds to a JSON file if requested
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($json) {
|
|
|
|
my $js = JSON->new->utf8->canonical->pretty;
|
|
|
|
open( my $out, '>:encoding(UTF-8)', $json )
|
|
|
|
or die "Unable to open output file $json $!\n";
|
|
|
|
|
|
|
|
print $out $js->encode($feeds), "\n";
|
|
|
|
close($out);
|
|
|
|
emit ( $silent, "JSON data written to $json\n" );
|
|
|
|
$LOG->info( "JSON data written to $json" );
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Requesting the -opml option means to dump the entire feed table as OPML
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($opml) {
|
|
|
|
require XML::OPML;
|
|
|
|
|
|
|
|
open( my $out, '>:encoding(UTF-8)', $opml )
|
|
|
|
or die "Unable to open output file $opml $!\n";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Start building OPML. Mandatory attributes are 'type', 'text' and
|
|
|
|
# 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
|
|
|
|
# attribute is poorly defined; this module uses 'rss' as the type and
|
|
|
|
# 'RSS' as the value of the 'version' attribute. This is not linked to the
|
|
|
|
# type of the feed.
|
|
|
|
#
|
|
|
|
my $OPML = XML::OPML->new(version => '1.1');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Create the 'head' element
|
|
|
|
#
|
|
|
|
my $DT = normaliseDT(DateTime->now());
|
|
|
|
$OPML->head(
|
|
|
|
title => 'Free Culture Podcasts',
|
|
|
|
dateCreated => $DT,
|
|
|
|
dateModified => $DT,
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Walk the feeds generating OPML
|
|
|
|
#
|
|
|
|
for (my $i = 0; $i < scalar(@$feeds); $i++) {
|
|
|
|
my $f = $feeds->[$i];
|
|
|
|
|
|
|
|
$OPML->add_outline(
|
|
|
|
title => $f->{urls_title},
|
|
|
|
text => $f->{urls_title},
|
|
|
|
description => $f->{urls_description},
|
|
|
|
xmlUrl => $f->{urls_url},
|
|
|
|
htmlUrl => $f->{urls_link},
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Output OPML to the nominated file
|
|
|
|
#
|
|
|
|
print $out $OPML->as_string;
|
|
|
|
|
|
|
|
close($out);
|
|
|
|
emit ( $silent, "OPML data written to $opml\n" );
|
|
|
|
$LOG->info( "OPML data written to $opml" );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Fill and print the template if requested
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($template) {
|
|
|
|
my $tt = Template->new(
|
|
|
|
{ ABSOLUTE => 1,
|
|
|
|
ENCODING => 'utf8',
|
|
|
|
INCLUDE_PATH => $basedir,
|
|
|
|
}
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Make the structure the template needs
|
|
|
|
#
|
|
|
|
my $vars = {
|
|
|
|
feeds => $feeds,
|
|
|
|
};
|
|
|
|
|
|
|
|
# print Dumper($vars),"\n";
|
|
|
|
|
|
|
|
my $document;
|
|
|
|
$tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
|
|
|
|
|| die $tt->error(), "\n";
|
|
|
|
print $outfh $document;
|
|
|
|
|
|
|
|
emit ( $silent, "Data processed with $template\n" );
|
|
|
|
$LOG->info( "Data processed with $template" );
|
|
|
|
}
|
|
|
|
|
|
|
|
#$dbh->disconnect;
|
|
|
|
|
|
|
|
exit;
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: loadUrls
|
|
|
|
# PURPOSE: To load URLs read from the input file into the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $new_urls arrayref containing URLs
|
|
|
|
# $rules WWW::RobotRules object
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
|
|
|
# RETURNS: Any new URLs discovered by investigating non-feed URLs.
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub loadUrls {
|
|
|
|
my ( $dbh, $new_urls, $rules, $keymap ) = @_;
|
|
|
|
|
|
|
|
my ( $stream, $feed );
|
|
|
|
my ( %uridata, $roboturl, @found_urls );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Work through the list of URLs
|
|
|
|
#
|
|
|
|
foreach my $rec (@new_urls) {
|
|
|
|
%uridata = ();
|
|
|
|
|
|
|
|
#
|
|
|
|
# By default save the collected data
|
|
|
|
#
|
|
|
|
$uridata{SAVE} = 1;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check that we have a valid URL. We don't save them if they are
|
|
|
|
# invalid ($uridata{SAVE} is set to 0 in the routine).
|
|
|
|
#
|
|
|
|
my $uri = validateURI($rec,\%uridata);
|
|
|
|
next unless $uri;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check whether we already have this URI in the database
|
|
|
|
#
|
|
|
|
if (dbSearch(
|
|
|
|
$dbh, 'SELECT * FROM urls WHERE url = ?',
|
|
|
|
$uridata{URI}
|
|
|
|
)
|
|
|
|
)
|
|
|
|
{
|
|
|
|
emit( $silent, "$uri is already in the database\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the hostname resolves in DNS
|
|
|
|
#
|
|
|
|
if ( checkDNS( $uri, \%uridata ) ) {
|
|
|
|
$uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
|
|
|
|
emit( $silent, "DNS: $uridata{DNS}\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "$uri has no DNS entry\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the server is available. Don't save if it's not (probably will
|
|
|
|
# not get the feed anyway).
|
|
|
|
#
|
|
|
|
if ( serverUp( $uri, \%uridata ) ) {
|
|
|
|
emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the site's robots.txt. If there's a block then don't save.
|
|
|
|
#
|
|
|
|
if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
|
|
|
|
emit( $silent, "Check of robots.txt rules succeeded\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if ( $uridata{ROBOTS} =~ /404/ ) {
|
|
|
|
emit( $silent, "Search for robots.txt file failed\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Check of robots.txt rules failed\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Look for the HTTP content type. Don't save if the request failed.
|
|
|
|
#
|
|
|
|
if ( checkContentType( $uri, \%uridata, \%headers, \@found_urls, $LOG ) ) {
|
|
|
|
emit( $silent, "HTTP request OK\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "HTTP request failed\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Parse the feed
|
|
|
|
#
|
|
|
|
if ( $uridata{TYPE} eq 'Feed' ) {
|
|
|
|
$LOG->info('Processing feed: ',$uridata{URI});
|
|
|
|
$stream = getFeed( $uridata{URI} );
|
|
|
|
if ($stream) {
|
|
|
|
$feed = parseFeed( $uridata{URI}, $stream );
|
|
|
|
unless ( $feed ) {
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Provide a means of examining the returned feed object. There
|
|
|
|
# are some weird behaviours in there (partly because of the
|
|
|
|
# weirdness of RSS and poor adherence to what standards there
|
|
|
|
# are).
|
|
|
|
#
|
|
|
|
print Dumper($feed), "\n" if ( $DEBUG > 2 );
|
|
|
|
storeFeed($feed,\%uridata);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Perform a check on the copyright. Routine sets
|
|
|
|
# $uridata{SAVE} = 0 if the copyright is not acceptable.
|
|
|
|
#
|
|
|
|
if ($check) {
|
|
|
|
next unless checkCopyright(\%uridata);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit($silent, "Problem collecting feed");
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
elsif ( $uridata{TYPE} eq 'HTML' ) {
|
|
|
|
#
|
|
|
|
# TODO Save the URL and find any feeds it contains
|
|
|
|
#
|
|
|
|
if (@found_urls) {
|
|
|
|
emit( $silent,
|
|
|
|
"Type $uridata{TYPE} contained "
|
|
|
|
. scalar(@found_urls)
|
|
|
|
. " feeds; to be investigated\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent,
|
|
|
|
"Type $uridata{TYPE} contained no feeds; skipped\n" );
|
|
|
|
}
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent,
|
|
|
|
"Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
continue {
|
|
|
|
# { This 'continue' block is executed for each iteration or 'next' }
|
|
|
|
|
|
|
|
#
|
|
|
|
# Decide whether to save what we have collected
|
|
|
|
#
|
|
|
|
if ( $uridata{SAVE} ) {
|
|
|
|
if ( addURI( $dbh, \%uridata, $keymap ) ) {
|
|
|
|
emit( $silent, "$uridata{URI} added to the database\n" );
|
|
|
|
$LOG->info("$uridata{TYPE} ",$uridata{URI},' added to the database');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Get the id the database allocated for the row we added
|
|
|
|
#
|
|
|
|
$uridata{URI_ID}
|
|
|
|
= $dbh->last_insert_id( undef, undef, 'urls', undef );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Add any enclosures we found
|
|
|
|
#
|
|
|
|
if ( defined( $uridata{ENCLOSURE_COUNT} )
|
|
|
|
&& $uridata{ENCLOSURE_COUNT} > 0 )
|
|
|
|
{
|
|
|
|
if ( addEnclosures( $dbh, \%uridata ) ) {
|
|
|
|
emit( $silent, $uridata{ENCLOSURE_COUNT},
|
|
|
|
" enclosures for $uridata{URI} added to the database\n"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent,
|
|
|
|
"$uridata{URI} was not added to the database\n" );
|
|
|
|
$LOG->info("$uridata{TYPE} ",$uridata{URI},' not added to the database');
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Dump what we have if requested
|
|
|
|
#
|
|
|
|
print Dumper( \%uridata ), "\n" if ( $DEBUG > 1 );
|
|
|
|
|
|
|
|
emit( $silent, '-' x 80, "\n" );
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Return any further urls we found
|
|
|
|
#
|
|
|
|
return @found_urls;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: searchTitle
|
|
|
|
# PURPOSE: Search the database for a feed with a given title
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $target search target
|
|
|
|
# RETURNS: A list of titles
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub searchTitle {
|
|
|
|
my ($dbh, $target) = @_;
|
|
|
|
|
|
|
|
my ( $sql1, $sql2, $sth, $rv, $h );
|
|
|
|
my ( $count, @result );
|
|
|
|
|
|
|
|
$sql1 = q{
|
|
|
|
SELECT count(*) AS count
|
|
|
|
FROM urls
|
|
|
|
WHERE urltype = 'Feed'
|
|
|
|
AND title REGEXP ?
|
|
|
|
};
|
|
|
|
|
|
|
|
$sql2 = q{
|
|
|
|
SELECT title
|
|
|
|
FROM urls
|
|
|
|
WHERE urltype = 'Feed'
|
|
|
|
AND title REGEXP ?
|
|
|
|
ORDER BY title
|
|
|
|
};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Count the number of matches
|
|
|
|
#
|
|
|
|
$sth = $dbh->prepare($sql1);
|
|
|
|
$rv = $sth->execute($search_target);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Proceed if we have 1 or more matches
|
|
|
|
#
|
|
|
|
if ( $h = $sth->fetchrow_hashref ) {
|
|
|
|
$count = $h->{count};
|
|
|
|
$sth->finish;
|
|
|
|
|
|
|
|
if ( $count >= 1 ) {
|
|
|
|
$sth = $dbh->prepare($sql2);
|
|
|
|
$rv = $sth->execute($search_target);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
while ( $h = $sth->fetchrow_hashref ) {
|
|
|
|
push(@result, $h->{title});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return @result;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: scanDB
|
|
|
|
# PURPOSE: To scan the URLs in the database and update the stored data
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub scanDB {
|
|
|
|
my ($dbh, $keymap) = @_;
|
|
|
|
|
|
|
|
my ( $sql1, $sth1, $rv1, $h1 );
|
|
|
|
my ( $aref, @urls, $DT, $stream, $feed );
|
|
|
|
my ( %uridata, $urichanges, $enc_changes );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to return all feed URLs
|
|
|
|
#
|
|
|
|
$sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};
|
|
|
|
|
|
|
|
$sth1 = $dbh->prepare($sql1);
|
|
|
|
$rv1 = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Collect everything as an arrayref pointing to a bunch of arrayrefs
|
|
|
|
# containing the column details requested
|
|
|
|
#
|
|
|
|
$aref = $sth1->fetchall_arrayref;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Extract just the URL strings
|
|
|
|
#
|
|
|
|
@urls = map { $_->[1] } @{$aref};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Now process these URLs from the database one at a time
|
|
|
|
#
|
|
|
|
# $sql1 = q{SELECT * FROM urls WHERE url = ?};
|
|
|
|
# $sth1 = $dbh->prepare($sql1);
|
|
|
|
|
|
|
|
my $count = 0;
|
|
|
|
|
|
|
|
foreach my $url (@urls) {
|
|
|
|
%uridata = ();
|
|
|
|
|
|
|
|
$count++;
|
|
|
|
|
|
|
|
scanFeed($dbh,$url,\%uridata);
|
|
|
|
|
|
|
|
# #
|
|
|
|
# # Record the scan for this URL
|
|
|
|
# #
|
|
|
|
# $DT = normaliseDT( DateTime->now() );
|
|
|
|
# $uridata{SCANNED_ON} = $DT;
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Default to OK
|
|
|
|
# #
|
|
|
|
# $uridata{SCAN_OK} = 1;
|
|
|
|
#
|
|
|
|
# emit( $silent, "Scanning '$url'\n" );
|
|
|
|
# $rv1 = $sth1->execute($url);
|
|
|
|
# if ( $dbh->err ) {
|
|
|
|
# warn $dbh->errstr;
|
|
|
|
# }
|
|
|
|
# $h1 = $sth1->fetchrow_hashref;
|
|
|
|
# emit( $silent, $h1->{title}, "\n" );
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # The URL should be valid already from the point at which it was added
|
|
|
|
# # to the database, but if we do this we get a canonical version (it
|
|
|
|
# # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
|
|
|
|
# # it. It also sets $uridata{URI}, which is useful.)
|
|
|
|
# #
|
|
|
|
# my $uri = validateURI( $url, \%uridata );
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Check the hostname resolves in DNS
|
|
|
|
# #
|
|
|
|
# if ( checkDNS( $uri, \%uridata ) ) {
|
|
|
|
# $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
|
|
|
|
# emit( $silent, "DNS: $uridata{DNS}\n" );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, "$uri has no DNS entry\n" );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Check the server is available
|
|
|
|
# #
|
|
|
|
# if ( serverUp( $uri, \%uridata ) ) {
|
|
|
|
# emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Look for the HTTP content type.
|
|
|
|
# #
|
|
|
|
# if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
|
|
|
|
# emit( $silent, "HTTP request OK\n" );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, "HTTP request failed\n" );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Note: not doing the robots.txt check since it was done at load time
|
|
|
|
# #
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # We know we have a feed, so go get it
|
|
|
|
# #
|
|
|
|
# $stream = getFeed( $uridata{URI} );
|
|
|
|
# if ($stream) {
|
|
|
|
# #
|
|
|
|
# # Parse the feed. The routine generates its own error messages
|
|
|
|
# #
|
|
|
|
# $feed = parseFeed( $uridata{URI}, $stream );
|
|
|
|
# unless ( $feed ) {
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Save the important feed components in the %uridata hash
|
|
|
|
# #
|
|
|
|
# print Dumper($feed), "\n" if ( $DEBUG > 2 );
|
|
|
|
# storeFeed( $feed, \%uridata );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, "Problem collecting feed" );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
}
|
|
|
|
continue {
|
|
|
|
# { This 'continue' block is executed for each iteration or 'next' }
|
|
|
|
|
|
|
|
#
|
|
|
|
# If the scan went OK then perform a comparison between the new feed
|
|
|
|
# data and that which is stored
|
|
|
|
#
|
|
|
|
if ( $uridata{SCAN_OK} ) {
|
|
|
|
$urichanges = updateURI( $dbh, \%uridata, $keymap );
|
|
|
|
$LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
|
|
|
|
$urichanges )
|
|
|
|
if $urichanges > 0;
|
|
|
|
|
|
|
|
# TODO Update enclosures
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "$uridata{URI} was not scanned successfully\n" );
|
|
|
|
$LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Finished scanning this feed
|
|
|
|
#
|
|
|
|
emit( $silent, '-' x 80, "\n" );
|
|
|
|
|
|
|
|
#
|
|
|
|
# NOTE: Temporarily stop after the first N feeds
|
|
|
|
#
|
|
|
|
last if $count == 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: scanFeed
|
|
|
|
# PURPOSE: Performs a scan on a single feed
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $url feed URL to look up
|
|
|
|
# $uridata hashref to the hash of data collected from the
|
|
|
|
# feed
|
|
|
|
# RETURNS: True (1) if all the steps worked, otherwise false (0)
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub scanFeed {
|
|
|
|
my ( $dbh, $url, $uridata ) = @_;
|
|
|
|
|
|
|
|
my ( $sql, $sth, $rv, $h );
|
|
|
|
my ( $DT, $stream, $feed );
|
|
|
|
my ( $urichanges, $enc_changes );
|
|
|
|
|
|
|
|
$sql = q{SELECT * FROM urls WHERE url = ?};
|
|
|
|
$sth = $dbh->prepare($sql);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Record the scan for this URL
|
|
|
|
#
|
|
|
|
$DT = normaliseDT( DateTime->now() );
|
|
|
|
$uridata->{SCANNED_ON} = $DT;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Default to OK
|
|
|
|
#
|
|
|
|
$uridata->{SCAN_OK} = 1;
|
|
|
|
|
|
|
|
emit( $silent, "Scanning '$url'\n" );
|
|
|
|
$rv = $sth->execute($url);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
$h = $sth->fetchrow_hashref;
|
|
|
|
emit( $silent, $h->{title}, "\n" );
|
|
|
|
|
|
|
|
#
|
|
|
|
# The URL should be valid already from the point at which it was added
|
|
|
|
# to the database, but if we do this we get a canonical version (it
|
|
|
|
# sets $uridata->{SAVE} which makes no sense here, but we'll just ignore
|
|
|
|
# it. It also sets $uridata->{URI}, which is useful.)
|
|
|
|
#
|
|
|
|
my $uri = validateURI( $url, $uridata );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the hostname resolves in DNS
|
|
|
|
#
|
|
|
|
if ( checkDNS( $uri, $uridata ) ) {
|
|
|
|
$uridata->{DNS} = join( ", ", @{ $uridata->{DNS} } );
|
|
|
|
emit( $silent, "DNS: $uridata->{DNS}\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "$uri has no DNS entry\n" );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the server is available
|
|
|
|
#
|
|
|
|
if ( serverUp( $uri, $uridata ) ) {
|
|
|
|
emit( $silent, sprintf( "Host: %s is up\n", $uridata->{HOST} ) );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, sprintf( "Host: %s is down\n", $uridata->{HOST} ) );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Look for the HTTP content type.
|
|
|
|
#
|
|
|
|
if ( checkContentType( $uri, $uridata, \%headers, $LOG ) ) {
|
|
|
|
emit( $silent, "HTTP request OK\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "HTTP request failed\n" );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Note: not doing the robots.txt check since it was done at load time
|
|
|
|
#
|
|
|
|
|
|
|
|
#
|
|
|
|
# We know we have a feed, so go get it
|
|
|
|
#
|
|
|
|
$stream = getFeed( $uridata->{URI} );
|
|
|
|
if ($stream) {
|
|
|
|
#
|
|
|
|
# Parse the feed. The routine generates its own error messages
|
|
|
|
#
|
|
|
|
$feed = parseFeed( $uridata->{URI}, $stream );
|
|
|
|
unless ( $feed ) {
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Save the important feed components in the %uridata hash
|
|
|
|
#
|
|
|
|
print Dumper($feed), "\n" if ( $DEBUG > 2 );
|
|
|
|
storeFeed( $feed, $uridata );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Problem collecting feed" );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: reportDB
|
|
|
|
# PURPOSE: To generate a printed report from the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $fh output file handle
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: reportFeed generates a standardised report
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub reportDB {
|
|
|
|
my ($dbh, $fh) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to report the contents of the 'all_episodes' view with the latest
|
|
|
|
# associated episode
|
|
|
|
#
|
|
|
|
my $sql = q{
|
|
|
|
SELECT urls.id, ae.*,
|
|
|
|
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
|
|
|
|
FROM all_episodes ae
|
|
|
|
JOIN urls ON (ae.urls_id = urls.id)
|
|
|
|
GROUP by urls.id
|
|
|
|
HAVING ae.urls_urltype = 'Feed'
|
|
|
|
ORDER BY urls_title
|
|
|
|
};
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare($sql);
|
|
|
|
my $rv = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
while ( my $h1 = $sth1->fetchrow_hashref() ) {
|
|
|
|
reportFeed( $h1, $fh );
|
|
|
|
}
|
|
|
|
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: findFeed
|
|
|
|
# PURPOSE: Find and return a single feed by title
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $target search target
|
|
|
|
# RETURNS: Hashref indexing the feed
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub findFeed {
|
|
|
|
my ( $dbh, $target ) = @_;
|
|
|
|
|
|
|
|
my ( $sql, $sth, $rv, $h );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to report the contents of the 'all_episodes' view with the latest
|
|
|
|
# associated episode
|
|
|
|
#
|
|
|
|
$sql = q{
|
|
|
|
SELECT urls.id, ae.*,
|
|
|
|
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
|
|
|
|
FROM all_episodes ae
|
|
|
|
JOIN urls ON (ae.urls_id = urls.id)
|
|
|
|
GROUP by urls.id
|
|
|
|
HAVING ae.urls_urltype = 'Feed'
|
|
|
|
AND ae.urls_title = ?
|
|
|
|
};
|
|
|
|
|
|
|
|
$sth = $dbh->prepare($sql);
|
|
|
|
$rv = $sth->execute($target);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
$h = $sth->fetchrow_hashref();
|
|
|
|
|
|
|
|
return $h;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: reportFeed
|
|
|
|
# PURPOSE: Reports a single feed from the database
|
|
|
|
# PARAMETERS: $feed a hash from the database containing the
|
|
|
|
# details of a particular feed
|
|
|
|
# $fh output file handle
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub reportFeed {
|
|
|
|
my ($feed, $fh) = @_;
|
|
|
|
|
|
|
|
my $lwidth = 12;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Hash for converting database keys to labels for the report and arrays
|
|
|
|
# for controlling the sequence.
|
|
|
|
#
|
|
|
|
state ( %keys, @seq1, @seq2 );
|
|
|
|
%keys = (
|
|
|
|
'id' => 'DB Primary key',
|
|
|
|
'latest_ep' => 'Latest episode date',
|
|
|
|
'ep_author' => 'Author',
|
|
|
|
'ep_byte_length' => 'Byte length',
|
|
|
|
'ep_category' => 'Category',
|
|
|
|
'ep_enclosure' => 'URL',
|
|
|
|
'ep_ep_id' => 'ID',
|
|
|
|
'ep_id' => 'Key',
|
|
|
|
'ep_issued' => 'Issued on',
|
|
|
|
'ep_last_update' => 'Last updated',
|
|
|
|
'ep_link' => 'Permalink',
|
|
|
|
'ep_mime_type' => 'MIME type',
|
|
|
|
'ep_modified' => 'Modified on',
|
|
|
|
'ep_source' => 'Source',
|
|
|
|
'ep_title' => 'Title',
|
|
|
|
'ep_urls_id' => 'URL key',
|
|
|
|
'urls_author' => 'Author',
|
|
|
|
'urls_content_type' => 'Content type',
|
|
|
|
'urls_copyright' => 'Copyright',
|
|
|
|
'urls_description' => 'Description',
|
|
|
|
'urls_dns' => 'DNS',
|
|
|
|
'urls_feedformat' => 'Feed format',
|
|
|
|
'urls_generator' => 'Generator',
|
|
|
|
'urls_host_up' => 'Host up',
|
|
|
|
'urls_http_status' => 'HTTP status',
|
|
|
|
'urls_id' => 'Key',
|
|
|
|
'urls_image' => 'Image',
|
|
|
|
'urls_language' => 'Language',
|
|
|
|
'urls_last_update' => 'Last updated',
|
|
|
|
'urls_link' => 'Link',
|
|
|
|
'urls_modified' => 'Modified on',
|
|
|
|
'urls_title' => 'Title',
|
|
|
|
'urls_url' => 'Feed URL',
|
|
|
|
'urls_urltype' => 'URL type',
|
|
|
|
'urls_parent_id' => 'Parent ID',
|
|
|
|
'urls_child_count' => 'Child count',
|
|
|
|
|
|
|
|
);
|
|
|
|
@seq1 = (
|
|
|
|
'urls_title',
|
|
|
|
'urls_url',
|
|
|
|
'urls_feedformat',
|
|
|
|
'urls_link',
|
|
|
|
'urls_author',
|
|
|
|
'urls_content_type',
|
|
|
|
'urls_copyright',
|
|
|
|
'urls_description',
|
|
|
|
'urls_dns',
|
|
|
|
'urls_generator',
|
|
|
|
'urls_host_up',
|
|
|
|
'urls_http_status',
|
|
|
|
'urls_image',
|
|
|
|
'urls_language',
|
|
|
|
'urls_last_update',
|
|
|
|
'urls_modified',
|
|
|
|
'urls_parent_id',
|
|
|
|
'urls_child_count',
|
|
|
|
);
|
|
|
|
@seq2 = (
|
|
|
|
'ep_title',
|
|
|
|
'ep_enclosure',
|
|
|
|
'ep_category',
|
|
|
|
'ep_link',
|
|
|
|
'ep_author',
|
|
|
|
'ep_byte_length',
|
|
|
|
'ep_ep_id',
|
|
|
|
'ep_mime_type',
|
|
|
|
'ep_issued',
|
|
|
|
'ep_last_update',
|
|
|
|
'ep_modified',
|
|
|
|
'ep_source',
|
|
|
|
);
|
|
|
|
|
|
|
|
if ($feed) {
|
|
|
|
print $fh "Channel:\n";
|
|
|
|
foreach my $key (@seq1) {
|
|
|
|
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
|
|
|
|
coalesce( $feed->{$key}, '--' );
|
|
|
|
}
|
|
|
|
|
|
|
|
print $fh "\nLatest episode:\n";
|
|
|
|
foreach my $key (@seq2) {
|
|
|
|
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
|
|
|
|
coalesce( $feed->{$key}, '--' );
|
|
|
|
}
|
|
|
|
print $fh "\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: collectData
|
|
|
|
# PURPOSE: Collects data from the database for generating a report
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# RETURNS: An array of hashrefs from the query
|
|
|
|
# DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
|
|
|
|
# table and the 'all_episodes' view, showing the date of the
|
|
|
|
# latest episode. The result is an array of rows, each
|
|
|
|
# represented as a hash, all sorted by the feed title.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub collectData {
|
|
|
|
my ($dbh) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to report the contents of the 'urls' table with the details of the
|
|
|
|
# latest episode
|
|
|
|
#
|
|
|
|
my $sql = q{
|
|
|
|
SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
|
|
|
|
FROM all_episodes ae
|
|
|
|
JOIN urls ON (ae.urls_id = urls.id)
|
|
|
|
GROUP by urls.id
|
|
|
|
HAVING ae.urls_urltype = 'Feed'
|
|
|
|
ORDER BY ae.urls_title
|
|
|
|
};
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare($sql);
|
|
|
|
my $rv = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Return everything as an arrayref of hashrefs
|
|
|
|
#
|
|
|
|
my $tbl_ary_ref = $sth1->fetchall_arrayref({});
|
|
|
|
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return $tbl_ary_ref;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: countRows
|
|
|
|
# PURPOSE: To count the rows in a table
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $sql SQL expression to use
|
|
|
|
# RETURNS: Number of rows found (note that zero is returned as 0E0)
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub countRows {
|
|
|
|
my ( $dbh, $sql ) = @_;
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare($sql);
|
|
|
|
my $rv = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
my $h1 = $sth1->fetch;
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return @$h1[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: dbSearch
|
|
|
|
# PURPOSE: To perform a simple search in the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $sql SQL expression to use (expected to be
|
|
|
|
# a SELECT)
|
|
|
|
# @args arguments for the 'execute'
|
|
|
|
# RETURNS: True (1) if the row exists, otherwise false (0).
|
|
|
|
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
|
|
|
|
# SQL without incurring the overhead of repeated 'prepare'
|
|
|
|
# calls. Only the first row is fetched (we expect there to be
|
|
|
|
# only one) and the success or failure is determined by its
|
|
|
|
# existence.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub dbSearch {
|
|
|
|
my ( $dbh, $sql, @args ) = @_;
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare_cached($sql);
|
|
|
|
my $rv = $sth1->execute(@args);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
my $h1 = $sth1->fetchrow_hashref();
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return defined($h1);
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: execSQL
|
|
|
|
# PURPOSE: To perform a non-SELECT query
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $sql SQL expression to use
|
|
|
|
# @args arguments for the 'execute'
|
|
|
|
# RETURNS: True (1) if the query succeeded, otherwise false (0).
|
|
|
|
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
|
|
|
|
# SQL without incurring the overhead of repeated 'prepare'
|
|
|
|
# calls.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub execSQL {
|
|
|
|
my ( $dbh, $sql, @args ) = @_;
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare_cached($sql);
|
|
|
|
my $rv = $sth1->execute(@args);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return $rv;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: validateURI
|
|
|
|
# PURPOSE: Checks a URL for validity
|
|
|
|
# PARAMETERS: $rec the URL handed to the script
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: A URI object if valid otherwise undef
|
|
|
|
# DESCRIPTION: The URL string is validated with the URI module. A canonical
|
|
|
|
# string version is stored in the hash referenced by $uridata
|
|
|
|
# hash if valid otherwise the URL is marked as invalid.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub validateURI {
|
|
|
|
my ( $rec, $uridata ) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check that we have a valid URL. We don't save them if they are
|
|
|
|
# invalid.
|
|
|
|
#
|
|
|
|
my $uri = URI->new( $rec, 'http' );
|
|
|
|
if ( $uri->scheme ) {
|
|
|
|
emit( $silent, "URI $uri is valid\n" );
|
|
|
|
$uridata->{URI} = $uri->canonical->as_string;
|
|
|
|
return $uri;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "URI $uri is not valid\n" );
|
|
|
|
$uridata->{SAVE} = 0;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: checkDNS
|
|
|
|
# PURPOSE: Looks up a host DNS entry
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: True (1) if the DNS query was successful, otherwise false (0)
|
|
|
|
# DESCRIPTION: The host name is extracted from the URI (and stored). The
|
|
|
|
# hostname is searched for in the DNS and if successful, an
|
|
|
|
# array of addresses from the 'A' records is built. This is
|
|
|
|
# sorted and stored in the hash referenced by $uridata.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub checkDNS {
|
|
|
|
my ( $uri, $uridata ) = @_;
|
|
|
|
|
|
|
|
my @adds;
|
|
|
|
my $hostname = $uri->host;
|
|
|
|
$uridata->{HOST} = $hostname;
|
|
|
|
|
|
|
|
my $res = Net::DNS::Resolver->new;
|
|
|
|
my $query = $res->search($hostname);
|
|
|
|
if ($query) {
|
|
|
|
foreach my $rr ( $query->answer ) {
|
|
|
|
next unless $rr->type eq "A";
|
|
|
|
push( @adds, $rr->address );
|
|
|
|
}
|
|
|
|
@adds = sort(@adds);
|
|
|
|
$uridata->{DNS} = \@adds;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
warn "Query failed: ", $res->errorstring, "\n";
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: serverUp
|
|
|
|
# PURPOSE: Checks that a given host is responding
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: True (1) if the host responds to a TCP connection, false (0)
|
|
|
|
# otherwise
|
|
|
|
# DESCRIPTION: Given an URL parses out the hostname and the port (defaulting
|
|
|
|
# to the appropriate default for the scheme, such as 80 for
|
|
|
|
# http). Attempts to connect to this host and port. If the
|
|
|
|
# connect fails then details are written to the data structure
|
|
|
|
# pointed to by $uridata.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub serverUp {
|
|
|
|
my ( $uri, $uridata ) = @_;
|
|
|
|
|
|
|
|
my ( $hostname, $port, $socket );
|
|
|
|
|
|
|
|
$hostname = $uri->host();
|
|
|
|
$port = $uri->port();
|
|
|
|
$uridata->{PORT} = $port;
|
|
|
|
|
|
|
|
$socket = IO::Socket::INET->new(
|
|
|
|
Proto => "tcp",
|
|
|
|
PeerAddr => $hostname,
|
|
|
|
PeerPort => $port,
|
|
|
|
Reuse => 1,
|
|
|
|
Timeout => 10
|
|
|
|
);
|
|
|
|
|
|
|
|
if ($socket) {
|
|
|
|
$socket->close;
|
|
|
|
$uridata->{HOSTUP} = 1;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{HOSTUP} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: robotRulesOK
|
|
|
|
# PURPOSE: To check the intended URL against the site's robots.txt rules
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $rules WWW::RobotRules object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: True (1) if the GET of the robots.txt file succeeded and the
|
|
|
|
# rules allow the URI object to be fetched, false (0) otherwise.
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub robotRulesOK {
|
|
|
|
my ( $uri, $rules, $uridata ) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Build the URL of robots.txt
|
|
|
|
#
|
|
|
|
my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';
|
|
|
|
my $robots_txt;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Use LWP::UserAgent to get the feed and handle errors
|
|
|
|
#
|
|
|
|
my $ua = LWP::UserAgent->new;
|
|
|
|
$ua->timeout(10);
|
|
|
|
$ua->agent("$PROG/$VERSION");
|
|
|
|
|
|
|
|
my $response = $ua->get($roboturl);
|
|
|
|
if ( $response->is_success ) {
|
|
|
|
$uridata->{ROBOTS} = 'Found';
|
|
|
|
$robots_txt = $response->decoded_content;
|
|
|
|
$rules->parse( $roboturl, $robots_txt );
|
|
|
|
return $rules->allowed("$uri");
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{ROBOTS} = $response->status_line;
|
|
|
|
warn "Failed to get $roboturl\n";
|
|
|
|
warn $response->status_line . "\n";
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: checkContentType
|
|
|
|
# PURPOSE: Check the content_type of the url
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# $headers hashref containing query headers
|
|
|
|
# $children arrayref to return any 'child' feeds we might
|
|
|
|
# find; they are appended to the array
|
|
|
|
# $log Log::Handler object
|
|
|
|
# RETURNS: True (1) if all was well, otherwise false (0)
|
|
|
|
# DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
|
|
|
|
# for the URI object using the LWP::UserAgent head method. Then
|
|
|
|
# we examine the 'content-type' header looking for the string
|
|
|
|
# 'xml' or 'html' in it. The former denotes a feed, and the
|
|
|
|
# latter a normal HTML page.
|
2022-11-20 22:49:57 +00:00
|
|
|
# We use Feed::Find if the page is HTML in order to find any
|
2022-11-19 21:27:51 +00:00
|
|
|
# "child" feeds which we really want to process if we find any.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub checkContentType {
|
|
|
|
my ( $uri, $uridata, $headers, $children, $log ) = @_;
|
|
|
|
|
|
|
|
my @feeds;
|
|
|
|
|
|
|
|
$uridata->{HTTP_STATUS} = 'Unknown';
|
|
|
|
|
|
|
|
my $browser = LWP::UserAgent->new or return 0;
|
|
|
|
|
|
|
|
my $response = $browser->head( $uri->as_string, %{$headers} )
|
|
|
|
or return 0;
|
|
|
|
|
|
|
|
$uridata->{HTTP_STATUS} = $response->status_line;
|
|
|
|
|
|
|
|
if ( $response->is_success ) {
|
|
|
|
$uridata->{HTTP_CONTENT_TYPE} = $response->header('content-type');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Decode the content-type we received
|
|
|
|
#
|
|
|
|
if ( $uridata->{HTTP_CONTENT_TYPE} =~ m|xml|i ) {
|
|
|
|
$uridata->{TYPE} = 'Feed';
|
|
|
|
}
|
|
|
|
elsif ( $uridata->{HTTP_CONTENT_TYPE} =~ m|html|i ) {
|
|
|
|
$uridata->{TYPE} = 'HTML';
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{TYPE} = 'Unknown';
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Sometimes perfectly valid feeds misleadingly return text/html so we
|
|
|
|
# try to spot such cases here and adjust the internal type accordingly
|
|
|
|
#
|
|
|
|
if ( $uridata->{TYPE} eq 'HTML' ) {
|
|
|
|
@feeds = Feed::Find->find( $uri->as_string );
|
|
|
|
if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
|
|
|
|
emit( $silent, "Feed found with wrong content-type\n" );
|
|
|
|
$uridata->{TYPE} = 'Feed';
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
|
|
|
# This HTML and we found 'child' feeds of some kind
|
|
|
|
#
|
|
|
|
emit( $silent, "Found ", scalar(@feeds),
|
|
|
|
" feeds within this HTML page\n" );
|
|
|
|
print Dumper( \@feeds ), "\n" if $DEBUG > 0;
|
|
|
|
push(@{$children}, @feeds);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$log->info( "URL content classified as: ", $uridata->{TYPE} );
|
|
|
|
emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: getFeed
|
|
|
|
# PURPOSE: Download the contents of a feed URL
|
|
|
|
# PARAMETERS: $feed_url URL of the feed to download
|
|
|
|
# RETURNS: String representation of the feed contents or undef if the
|
|
|
|
# download failed.
|
|
|
|
# DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
|
|
|
|
# need not be). If successful the contents are decoded and
|
|
|
|
# returned, otherwise undef is returned.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub getFeed {
|
|
|
|
my ($feed_url) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Use LWP::UserAgent to get the feed and handle errors
|
|
|
|
#
|
|
|
|
my $ua = LWP::UserAgent->new;
|
|
|
|
$ua->timeout(10);
|
|
|
|
$ua->agent("$PROG/$VERSION");
|
|
|
|
my $response = $ua->get($feed_url);
|
|
|
|
|
|
|
|
my $feed_content;
|
|
|
|
if ( $response->is_success ) {
|
|
|
|
$feed_content = $response->decoded_content;
|
|
|
|
return $feed_content;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
warn "Failed to get $feed_url\n";
|
|
|
|
warn $response->status_line, "\n";
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: parseFeed
|
|
|
|
# PURPOSE: Parse a podcast feed that has already been downloaded
|
|
|
|
# PARAMETERS: $feed_url URL of the feed previously downloaded
|
|
|
|
# $feed_content String containing the content of the feed, for
|
|
|
|
# parsing
|
|
|
|
# RETURNS: An XML::Feed object containing the parsed feed or undef if the
|
|
|
|
# parse failed
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub parseFeed {
|
|
|
|
my ( $feed_url, $feed_content ) = @_;
|
|
|
|
|
|
|
|
my $feed = XML::Feed->parse( \$feed_content );
|
|
|
|
unless ($feed) {
|
|
|
|
#
|
|
|
|
# Something went wrong. Abort this feed
|
|
|
|
#
|
|
|
|
warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
|
|
|
|
return $feed;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: storeFeed
|
|
|
|
# PURPOSE: Stores feed attributes in a hash
|
|
|
|
# PARAMETERS: $feed XML::Feed object returned from parsing the
|
|
|
|
# feed
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub storeFeed {
|
|
|
|
my ($feed, $uridata) = @_;
|
|
|
|
|
|
|
|
( $uridata->{TITLE} = $feed->title ) =~ s/(^\s+|\s+$)//g;
|
|
|
|
( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;
|
|
|
|
|
|
|
|
$uridata->{LINK} = $feed->link;
|
|
|
|
$uridata->{DESCRIPTION} = clean_string( $feed->tagline );
|
|
|
|
$uridata->{AUTHOR} = $feed->author;
|
|
|
|
$uridata->{MODIFIED} = normaliseDT( $feed->modified );
|
|
|
|
$uridata->{IMAGE} = flattenArray( $feed->image );
|
|
|
|
$uridata->{COPYRIGHT} = $feed->copyright;
|
|
|
|
$uridata->{GENERATOR} = $feed->generator;
|
|
|
|
$uridata->{LANGUAGE} = $feed->language;
|
|
|
|
#print coalesce($feed->webMaster,'No webMaster'),"\n";
|
|
|
|
|
|
|
|
$uridata->{ENCLOSURES} = extractEnclosures($feed);
|
|
|
|
$uridata->{ENCLOSURE_COUNT}
|
|
|
|
= scalar( @{ $uridata->{ENCLOSURES} } );
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: checkCopyright
|
|
|
|
# PURPOSE: Ask the user to check the copyright of a feed
|
|
|
|
# PARAMETERS:
|
|
|
|
# RETURNS:
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub checkCopyright {
|
|
|
|
my ($uridata) = @_;
|
|
|
|
|
|
|
|
my $decision;
|
|
|
|
$LOG->info('Checking copyright of feed');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Prompt the user, failing gracefully if there's
|
|
|
|
# a problem. If the user types 'Y' or 'y' we accept the
|
|
|
|
# feed, otherwise we do not (thus a blank return = 'no').
|
|
|
|
#
|
|
|
|
try {
|
|
|
|
printf STDERR
|
|
|
|
"Feed '%s' has the copyright string:\n%s\n",
|
|
|
|
$uridata->{TITLE},
|
|
|
|
coalesce( $uridata->{COPYRIGHT}, '' );
|
|
|
|
$decision = prompt(
|
|
|
|
-in => *STDIN,
|
|
|
|
-prompt => 'OK to add this feed?',
|
|
|
|
-style => 'bold red underlined',
|
|
|
|
-yes
|
|
|
|
);
|
|
|
|
}
|
|
|
|
catch {
|
|
|
|
warn "Problem processing copyright decision: $_";
|
|
|
|
$decision = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Take action on the decision (or default)
|
|
|
|
#
|
|
|
|
$uridata->{SAVE} = $decision;
|
|
|
|
if ($decision) {
|
|
|
|
emit( $silent, "Feed added\n" );
|
|
|
|
$LOG->info('Copyright OK');
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Feed not added\n" );
|
|
|
|
$LOG->info('Copyright not OK');
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: parseRSS
|
|
|
|
# PURPOSE: Attempt to parse a feed as RSS
|
|
|
|
# PARAMETERS: $feed_url URL of the feed previously downloaded
|
|
|
|
# $feed_content String containing the content of the feed, for
|
|
|
|
# parsing
|
|
|
|
# RETURNS: An XML::RSS object containing the parsed feed or undef if the
|
|
|
|
# parse failed
|
|
|
|
# DESCRIPTION: ** Incomplete **
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub parseRSS {
|
|
|
|
my ( $feed_url, $feed_content ) = @_;
|
|
|
|
|
|
|
|
my $rss = XML::RSS->parse(\$feed_content);
|
|
|
|
unless ($rss) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: addURI
|
|
|
|
# PURPOSE: Adds the data for a URI to the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
|
|
|
# RETURNS: True (1) if the insert succeeded, false (0) otherwise
|
|
|
|
# DESCRIPTION: The hash keys are defined as an array to make it easy to slice
|
|
|
|
# the hash and the SQL is defined internally using the size of
|
|
|
|
# the key array as a guide to the number of '?' placeholders.
|
|
|
|
# These are passed to execSQL to do the work.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub addURI {
|
|
|
|
my ( $dbh, $uridata, $keymap ) = @_;
|
|
|
|
|
|
|
|
my @keys = (
|
|
|
|
'URI', 'DNS',
|
|
|
|
'HOSTUP', 'HTTP_STATUS',
|
|
|
|
'HTTP_CONTENT_TYPE', 'TYPE',
|
|
|
|
'FORMAT', 'TITLE',
|
|
|
|
'DESCRIPTION', 'AUTHOR',
|
|
|
|
'MODIFIED', 'LINK',
|
|
|
|
'IMAGE', 'COPYRIGHT',
|
|
|
|
'GENERATOR', 'LANGUAGE',
|
|
|
|
);
|
|
|
|
|
|
|
|
my $sql
|
|
|
|
= 'INSERT INTO urls ('
|
|
|
|
. join( ",", @{$keymap}{@keys} ) . ') '
|
|
|
|
. 'VALUES('
|
|
|
|
. join( ',', ('?') x scalar(@keys) ) . ')';
|
|
|
|
|
|
|
|
print "addURI query: $sql\n" if $DEBUG > 0;
|
|
|
|
|
|
|
|
return execSQL( $dbh, $sql, @{$uridata}{@keys} );
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: extractEnclosures
|
|
|
|
# PURPOSE: Builds an array of hashes containing enclosure data from an
|
|
|
|
# XML::Feed object
|
|
|
|
# PARAMETERS: $feed XML::Feed object
|
|
|
|
# RETURNS: A reference to the array of anonymous hashes built from the
|
|
|
|
# feed.
|
|
|
|
# DESCRIPTION: The XML::Feed object is expected to contain an array of
|
|
|
|
# entries. These are converted to hashes, references to which
|
|
|
|
# are stored in an array. The two DateTime components are
|
|
|
|
# converted to ISO8601 dates. If there is an enclosure then its
|
|
|
|
# elements are saved. Note that there could be multiple
|
|
|
|
# enclosures, but XML::Feed does not cater for them unless
|
|
|
|
# explicitly requested. We do not deal with such a case here.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub extractEnclosures {
|
|
|
|
my ($feed) = @_;
|
|
|
|
|
|
|
|
my @encs;
|
|
|
|
|
|
|
|
foreach my $entry ( $feed->entries ) {
|
|
|
|
my %ent;
|
|
|
|
|
|
|
|
$ent{title} = $entry->title;
|
|
|
|
$ent{base} = $entry->base;
|
|
|
|
$ent{link} = $entry->link;
|
|
|
|
$ent{category} = join( ", ", $entry->category );
|
|
|
|
$ent{author} = $entry->author;
|
|
|
|
$ent{id} = $entry->id;
|
|
|
|
$ent{issued} = normaliseDT( $entry->issued );
|
|
|
|
$ent{modified} = normaliseDT( $entry->modified );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Work around a bug in XML::Feed where the source method is only
|
|
|
|
# available for Atom feeds. TODO report this.
|
|
|
|
#
|
|
|
|
if ( $entry->isa('XML::Feed::Entry::Format::Atom') ) {
|
|
|
|
$ent{source} = $entry->source;
|
|
|
|
}
|
|
|
|
|
|
|
|
my ($enclosure) = $entry->enclosure;
|
|
|
|
if ( defined($enclosure) ) {
|
|
|
|
$ent{url} = $enclosure->url;
|
|
|
|
$ent{type} = $enclosure->type;
|
|
|
|
$ent{length} = $enclosure->length;
|
|
|
|
}
|
|
|
|
|
|
|
|
push( @encs, \%ent );
|
|
|
|
}
|
|
|
|
|
|
|
|
return \@encs;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: addEnclosures
|
|
|
|
# PURPOSE: Adds episodes extracted from a feed into the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# including an arrayref of hashrefs of episode
|
|
|
|
# data
|
|
|
|
# RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
|
|
|
|
# DESCRIPTION: The SQL is defined internally and the hash keys are defined as
|
|
|
|
# an array to make it easy to slice the hash. The enclosures (or
|
|
|
|
# more correctly, feed items) are present in the hash as an
|
|
|
|
# array of anonymous hashes. These are processed one at a time
|
|
|
|
# and inserted into the database. A count of the number of
|
|
|
|
# successful inserts is kept. This is compared with the number
|
|
|
|
# of enclosures to determine the boolean value to return.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub addEnclosures {
|
|
|
|
my ( $dbh, $uridata ) = @_;
|
|
|
|
|
|
|
|
my $sql = q{INSERT INTO episodes
|
|
|
|
(urls_id, link, enclosure, title, author, category, source, ep_id,
|
|
|
|
issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};
|
|
|
|
|
|
|
|
my @keys = (
|
|
|
|
'link', 'url', 'title', 'author', 'category', 'source',
|
|
|
|
'id', 'issued', 'modified', 'length', 'type'
|
|
|
|
);
|
|
|
|
|
|
|
|
my $successes = 0;
|
|
|
|
|
|
|
|
foreach my $enc ( @{ $uridata->{ENCLOSURES} } ) {
|
|
|
|
if ( execSQL( $dbh, $sql, $uridata->{URI_ID}, @{$enc}{@keys} ) ) {
|
|
|
|
$successes++;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Failed to add episode $enc->{url}\n" );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ( $successes == $uridata->{ENCLOSURE_COUNT} );
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: updateURI
|
|
|
|
# PURPOSE: Compare the data in a hash with that in the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# including an arrayref of hashrefs of episode
|
|
|
|
# data
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
|
|
|
# RETURNS: The number of changes made
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub updateURI {
|
|
|
|
my ( $dbh, $uridata, $keymap ) = @_;
|
|
|
|
|
|
|
|
my ( $sql1, $sth1, $rv1, $h1 );
|
|
|
|
my ( %fieldvals, %where );
|
|
|
|
my ( $diffs, $updates ) = ( 0, 0 );
|
|
|
|
my @keys = (
|
|
|
|
'URI', 'DNS',
|
|
|
|
'HOSTUP', 'HTTP_STATUS',
|
|
|
|
'HTTP_CONTENT_TYPE', 'TYPE',
|
|
|
|
'FORMAT', 'TITLE',
|
|
|
|
'DESCRIPTION', 'AUTHOR',
|
|
|
|
'MODIFIED', 'LINK',
|
|
|
|
'IMAGE', 'COPYRIGHT',
|
|
|
|
'GENERATOR', 'LANGUAGE',
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Get the row from the urls table
|
|
|
|
#
|
|
|
|
$sql1 = q{SELECT * FROM urls WHERE url = ?};
|
|
|
|
$sth1 = $dbh->prepare($sql1);
|
|
|
|
$rv1 = $sth1->execute( $uridata->{URI} );
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
$h1 = $sth1->fetchrow_hashref;
|
|
|
|
|
|
|
|
for my $key (@keys) {
|
|
|
|
unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
|
|
|
|
$diffs++;
|
|
|
|
$fieldvals{$key} = $uridata->{$key};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Temporary report
|
|
|
|
#
|
|
|
|
print "Difference: ($key)\n";
|
|
|
|
print " Feed: ", coalesce($uridata->{$key},''), "\n";
|
|
|
|
print " Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($diffs > 0) {
|
|
|
|
#
|
|
|
|
# Prepare to use SQL::Abstract
|
|
|
|
#
|
|
|
|
my $sql = SQL::Abstract->new();
|
|
|
|
|
|
|
|
#
|
|
|
|
# Build the update statement
|
|
|
|
#
|
|
|
|
$where{id} = $h1->{id};
|
|
|
|
my ( $stmt, @bindvals )
|
|
|
|
= $sql->update( 'urls', \%fieldvals, \%where );
|
|
|
|
|
|
|
|
# Temporary
|
|
|
|
print "Statement: $stmt\n";
|
|
|
|
print "Bind values: ", join(",",@bindvals),"\n";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Perform the updates
|
|
|
|
#
|
|
|
|
$sth1 = $dbh->prepare($stmt);
|
|
|
|
$sth1->execute(@bindvals);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn "Processing $h1->{url}\n", $dbh->errstr;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit($silent, "Updated $h1->{url}\n");
|
|
|
|
$updates++;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return $updates;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: updateEnclosures
|
|
|
|
# PURPOSE: Update the enclosures stored with a feed URL
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# including an arrayref of hashrefs of episode
|
|
|
|
# data
|
|
|
|
# RETURNS: The number of changes made
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub updateEnclosures {
|
|
|
|
my ( $dbh, $uridata ) = @_;
|
|
|
|
|
|
|
|
my ( $sql1, $sth1, $rv1, $h1 );
|
|
|
|
my ( %fieldvals, %where );
|
|
|
|
my ( $diffs, $updates ) = ( 0, 0 );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Get the enclosures associated with this feed
|
|
|
|
#
|
|
|
|
$sql1 = q{
|
|
|
|
SELECT * FROM episodes
|
|
|
|
WHERE urls_id = (SELECT id FROM urls WHERE title = ?)
|
|
|
|
};
|
|
|
|
$sth1 = $dbh->prepare($sql1);
|
|
|
|
$rv1 = $sth1->execute( $uridata->{URI} );
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
$h1 = $sth1->fetchrow_hashref;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: equal
|
|
|
|
# PURPOSE: Compare two strings even if undefined
|
|
|
|
# PARAMETERS: $s1 The first string
|
|
|
|
# $s2 The second string
|
|
|
|
# RETURNS: True if both strings are undefined, false if one isn't
|
|
|
|
# defined, otherwise the result of comparing them.
|
|
|
|
# DESCRIPTION: Works on the principle that two undefined strings are equal,
|
|
|
|
# a defined and an undefined string are not, and otherwise they
|
|
|
|
# are equal if they are equal!
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub equal {
|
|
|
|
my ( $s1, $s2 ) = @_;
|
|
|
|
|
|
|
|
return 1 if ( !defined($s1) && !defined($s2) );
|
|
|
|
return 0 if ( !defined($s1) || !defined($s2) );
|
|
|
|
|
|
|
|
return ( $s1 eq $s2 );
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: clean_string
|
|
|
|
# PURPOSE: Clean a string of non-printables, newlines, multiple spaces
|
|
|
|
# PARAMETERS: $str The string to process
|
|
|
|
# RETURNS: The processed string
|
|
|
|
# DESCRIPTION: Removes leading and trailing spaces. Removes all non-printable
|
|
|
|
# characters. Removes all CR/LF sequences. Removes multiple
|
|
|
|
# spaces.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub clean_string {
|
|
|
|
my ($str) = @_;
|
|
|
|
|
|
|
|
$str =~ s/(^\s+|\s+$)//g;
|
|
|
|
$str =~ tr/[[:graph:]]//c;
|
|
|
|
$str =~ tr/\x0A\x0D/ /;
|
|
|
|
$str =~ tr/ / /s;
|
|
|
|
|
|
|
|
return $str;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: normaliseDT
|
|
|
|
# PURPOSE: Normalise an ISO8601 date for comparison, etc.
|
|
|
|
# PARAMETERS: $dt a DateTime object
|
|
|
|
# RETURNS: The DateTime object formatted as an ISO8601 string
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub normaliseDT {
|
|
|
|
my ($dt) = @_;
|
|
|
|
|
|
|
|
my $p = DateTime::Format::SQLite->new();
|
|
|
|
|
|
|
|
return (
|
|
|
|
defined($dt)
|
|
|
|
? $p->format_datetime(
|
|
|
|
DateTime::Format::ISO8601->parse_datetime($dt)
|
|
|
|
)
|
|
|
|
: undef
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: flattenArray
|
|
|
|
# PURPOSE: Turns an arrayref into a simple list in a string
|
|
|
|
# PARAMETERS: $item - the item that may be an arrayref
|
|
|
|
# RETURNS: The plain item if it's not an array otherwise the flattened
|
|
|
|
# version
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub flattenArray {
|
|
|
|
my ($item) = @_;
|
|
|
|
|
|
|
|
my $result;
|
|
|
|
if ( ref( $item ) eq 'ARRAY' ) {
|
|
|
|
$result = join(", ",@{$item});
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$result = $item;
|
|
|
|
}
|
|
|
|
|
|
|
|
return $item;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: coalesce
|
|
|
|
# PURPOSE: To find the defined argument and return it
|
|
|
|
# PARAMETERS: Arbitrary number of arguments
|
|
|
|
# RETURNS: The first defined argument
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub coalesce {
|
|
|
|
foreach (@_) {
|
|
|
|
return $_ if defined($_);
|
|
|
|
}
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: emit
|
|
|
|
# PURPOSE: Print text on STDERR unless silent mode has been selected
|
|
|
|
# PARAMETERS: - Boolean indicating whether to be silent or not
|
|
|
|
# - list of arguments to 'print'
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
|
|
|
|
# a message to STDERR depending on a boolean. We need this to be
|
|
|
|
# able to make the script silent when the -silent option is
|
|
|
|
# selected
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub emit {
|
|
|
|
unless (shift) {
|
|
|
|
print STDERR @_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: Options
|
|
|
|
# PURPOSE: Processes command-line options
|
|
|
|
# PARAMETERS: $optref Hash reference to hold the options
|
|
|
|
# RETURNS: Undef
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub Options {
|
|
|
|
my ($optref) = @_;
|
|
|
|
|
|
|
|
my @options = (
|
|
|
|
"help", "manpage", "debug=i", "silent!",
|
|
|
|
"load=s", "delete=s", "scan!", "report:s",
|
|
|
|
"check!", "json:s", "opml:s", "config=s",
|
|
|
|
"out=s", "template:s",
|
|
|
|
);
|
|
|
|
|
|
|
|
if ( !GetOptions( $optref, @options ) ) {
|
|
|
|
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
__END__
|
|
|
|
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
# Application Documentation
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
#{{{
|
|
|
|
|
|
|
|
|
|
|
|
=head1 NAME
|
|
|
|
|
|
|
|
feedWatcher - watch a collection of podcast feeds
|
|
|
|
|
|
|
|
=head1 VERSION
|
|
|
|
|
|
|
|
This documentation refers to I<feedWatcher> version 0.0.15
|
|
|
|
|
|
|
|
|
|
|
|
=head1 USAGE
|
|
|
|
|
|
|
|
feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
|
|
|
|
[-[no]check] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
|
|
|
|
[-[no]silent] [-config=FILE] [-debug=N] [URL ...]
|
|
|
|
|
|
|
|
=head1 ARGUMENTS
|
|
|
|
|
|
|
|
Arguments are optional and may consist of an arbitrarily long list of URLs to
|
|
|
|
be processed and possibly added to the database by the script.
|
|
|
|
|
|
|
|
These URLs are prepended to any which may be provided through the
|
|
|
|
B<-load=FILE> option described below. The script makes sure the URL collection
|
|
|
|
contains no duplicates.
|
|
|
|
|
|
|
|
=head1 OPTIONS
|
|
|
|
|
|
|
|
=over 8
|
|
|
|
|
|
|
|
=item B<-help>
|
|
|
|
|
|
|
|
Prints a brief help message describing the usage of the program, and then exits.
|
|
|
|
|
|
|
|
The full documentation can be displayed with the command:
|
|
|
|
|
|
|
|
perldoc -oman feedWatcher
|
|
|
|
|
|
|
|
Alternatively, use the option B<-manpage>
|
|
|
|
|
|
|
|
=item B<-manpage>
|
|
|
|
|
|
|
|
Prints the entire documentation for the script. Note that this mode uses a
|
|
|
|
simpler type of formatter than I<perldoc>.
|
|
|
|
|
|
|
|
=item B<-load=FILE>
|
|
|
|
|
|
|
|
Defines a file from which new URLs are to be added to the database. These URLs
|
|
|
|
are checked in various ways before adding to the database. If arguments are
|
|
|
|
provided when the script is invoked these URLs are appended to the argument list.
|
|
|
|
|
|
|
|
=item B<-delete=FILE>
|
|
|
|
|
|
|
|
Defines a file from which a list of URLs is to be read which are to be
|
|
|
|
deleted from the database.
|
|
|
|
|
|
|
|
Note that it is possible (though inadvisable) to both add and delete an URL in
|
|
|
|
the same run of the script. The URL will first be added (from the load file or
|
|
|
|
argument list) then deleted. This is a pointless exercise which wastes
|
|
|
|
bandwidth, so don't do it!
|
|
|
|
|
|
|
|
=item B<-[no]scan>
|
|
|
|
|
|
|
|
This option (B<-scan>) causes the URLs stored in the database to be scanned
|
|
|
|
and updated. The negated form, which is also the default behaviour of the
|
|
|
|
script, (B<-noscan>) omits the scan.
|
|
|
|
|
|
|
|
NOTE: This function is not implemented yet.
|
|
|
|
|
|
|
|
=item B<-out=FILE>
|
|
|
|
|
|
|
|
This option defines an output file to receive any output. If the option is
|
|
|
|
omitted the data is written to STDOUT, allowing it to be redirected if
|
|
|
|
required.
|
|
|
|
|
|
|
|
=item B<-[no]check>
|
|
|
|
|
|
|
|
This option (B<-check>) causes each feed which is being to be checked against
|
|
|
|
the script user to check that it's OK to add it. The script reports the
|
|
|
|
I<copyright> field and requests a I<y> or I<n> response.
|
|
|
|
|
|
|
|
=item B<-[no]report>
|
|
|
|
|
|
|
|
This option (B<-report>) causes a report of the contents of the database to be
|
|
|
|
generated. The negated form, which is also the default behaviour of the
|
|
|
|
script, (B<-noreport>) omits the report.
|
|
|
|
|
|
|
|
NOTE: The report is currently very simple.
|
|
|
|
|
|
|
|
=item B<-json[=FILE]>
|
|
|
|
|
|
|
|
This option, which may be omitted, defines the location where the feed and
|
|
|
|
episode details are to be written. If omitted no JSON data is written.
|
|
|
|
|
|
|
|
If the option is given as B<-json=FILE> the data is written to the nominated
|
|
|
|
file.
|
|
|
|
|
|
|
|
If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
|
|
|
|
used.
|
|
|
|
|
|
|
|
=item B<-opml[=FILE]>
|
|
|
|
|
|
|
|
This option, which may be omitted, defines the location where the feed details
|
|
|
|
are to be written. If omitted no OPML data is written.
|
|
|
|
|
|
|
|
If the option is given as B<-opml=FILE> the data is written to the nominated
|
|
|
|
file.
|
|
|
|
|
|
|
|
If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
|
|
|
|
used.
|
|
|
|
|
|
|
|
=item B<-template=FILE>
|
|
|
|
|
|
|
|
This option defines the template used to generate a form of the feed data. The
|
|
|
|
template is written using the B<Template> toolkit language.
|
|
|
|
|
|
|
|
If the file name is omitted then the script uses the file B<feedWatcher.tpl>
|
|
|
|
in the same directory as the script. If this file does not exist then the
|
|
|
|
script will exit with an error message.
|
|
|
|
|
|
|
|
For convenience B<feedWatcher.tpl> can be made a soft link which points to the
|
|
|
|
file which is the current default. This allows the development of versions
|
|
|
|
without changing the usual way this script is run.
|
|
|
|
|
|
|
|
=item B<-config=FILE>
|
|
|
|
|
|
|
|
This option allows an alternative configuration file to be used. This file
|
|
|
|
defines the location of the database, its name and the username and
|
|
|
|
password to be used to access it. This feature permits a test database to be
|
|
|
|
used, or for two more sets of feeds to be processed.
|
|
|
|
|
|
|
|
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
|
|
|
|
|
|
|
|
If the option is omitted the default file is used: B<feedWatcher.cfg>
|
|
|
|
|
|
|
|
=item B<-debug=N>
|
|
|
|
|
|
|
|
This option selects the debug level, resulting in a lot of output.
|
|
|
|
|
|
|
|
0 (the default) No debug output
|
|
|
|
1 Dumps the list of feeds found in an HTML download.
|
|
|
|
Also shows the SQL query which will result in the insertion of a new
|
|
|
|
row into the database table I<urls>.
|
|
|
|
2 Dumps the collected data which is destined to be written to the database
|
|
|
|
3 Dumps the contents of feeds during analysis
|
|
|
|
|
|
|
|
=back
|
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
|
|
|
|
A full description of the application and its features.
|
|
|
|
May include numerous subsections (i.e. =head2, =head3, etc.)
|
|
|
|
|
|
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
|
|
|
|
|
|
|
A list of every error and warning message that the application can generate
|
|
|
|
(even the ones that will "never happen"), with a full explanation of each
|
|
|
|
problem, one or more likely causes, and any suggested remedies. If the
|
|
|
|
application generates exit status codes (e.g. under Unix) then list the exit
|
|
|
|
status associated with each error.
|
|
|
|
|
|
|
|
|
|
|
|
=head1 CONFIGURATION AND ENVIRONMENT
|
|
|
|
|
|
|
|
A configuration file is used by the script. By default the name is
|
|
|
|
B<feedWatcher.cfg> (though, what is searched for is derived from the name of
|
|
|
|
the script). An alternative configuration file can be provided, specified
|
|
|
|
through the B<-config=FILE> option.
|
|
|
|
|
|
|
|
The file conforms to the format managed by the B<Config::General> module:
|
|
|
|
|
|
|
|
<database>
|
|
|
|
type = SQLite
|
|
|
|
file = feedWatcher.db
|
|
|
|
user =
|
|
|
|
password =
|
|
|
|
</database>
|
|
|
|
|
|
|
|
The B<type> value should be I<SQLite>, though it might be possible to use
|
|
|
|
other database types.
|
|
|
|
|
|
|
|
The B<file> value is the name of the database, which can include the
|
|
|
|
directory path.
|
|
|
|
|
|
|
|
The B<user> and B<password> values are not currently required, but if
|
|
|
|
authentication to the database was required it would be implemented this way.
|
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
|
|
|
|
Config::General
|
|
|
|
DBI
|
|
|
|
Data::Dumper
|
|
|
|
DateTime::Format::ISO8601
|
|
|
|
DateTime::Format::SQLite
|
|
|
|
Feed::Find
|
|
|
|
Getopt::Long
|
|
|
|
HTML::Entities
|
|
|
|
IO::Prompter
|
|
|
|
IO::Socket
|
|
|
|
JSON
|
|
|
|
LWP::UserAgent
|
|
|
|
List::MoreUtils qw{uniq}
|
|
|
|
Log::Handler
|
|
|
|
Net::DNS
|
|
|
|
Pod::Usage
|
|
|
|
SQL::Abstract
|
|
|
|
Template
|
|
|
|
Template::Filters
|
|
|
|
Try::Tiny
|
|
|
|
URI
|
|
|
|
WWW::RobotRules
|
|
|
|
XML::Feed
|
|
|
|
XML::RSS::Parser
|
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
|
|
|
|
|
|
|
There are no known bugs in this module.
|
|
|
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
|
|
|
|
Patches are welcome.
|
|
|
|
|
|
|
|
=head1 AUTHOR
|
|
|
|
|
|
|
|
Dave Morriss (Dave.Morriss@gmail.com) 2020
|
|
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT
|
|
|
|
|
|
|
|
Copyright (c) Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
|
|
|
|
|
|
|
|
This program is free software. You can redistribute it and/or modify it under
|
|
|
|
the same terms as perl itself.
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
|
|
|
#}}}
|
|
|
|
|
|
|
|
# [zo to open fold, zc to close]
|
|
|
|
|
|
|
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|