2022-11-19 21:27:51 +00:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
#===============================================================================
|
|
|
|
#
|
|
|
|
# FILE: feedWatcher
|
|
|
|
#
|
|
|
|
# USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
|
2023-01-09 18:20:17 +00:00
|
|
|
# [-report[=title]] [-check[=mode]] [-out=FILE]
|
|
|
|
# [-rejects[=FILE]] [-json[=FILE]] [-opml[=FILE]]
|
|
|
|
# [-template[=FILE]] [-[no]silent] [-debug=N]
|
2022-11-19 21:27:51 +00:00
|
|
|
# [URL ...]
|
|
|
|
#
|
|
|
|
# DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
|
|
|
|
# podcasts. Ken's script was inspired by Dann Washko's site at
|
|
|
|
# http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
|
|
|
|
# August 2012 where it was used to generate handouts.
|
|
|
|
#
|
|
|
|
# The script has not been developed since 2014, but is now in
|
|
|
|
# use in 2020 helping to prepare podcast information for
|
|
|
|
# a FOSDEM visit under the heading of "Free Culture Podcasts".
|
|
|
|
# See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
|
|
|
|
# (created with Pandoc) in the same directory as this script for
|
|
|
|
# details of what has been done to develop the original design.
|
|
|
|
#
|
2023-01-11 09:45:38 +00:00
|
|
|
# Further development is taking place in 2022/2023, again for
|
|
|
|
# FOSDEM.
|
|
|
|
#
|
2022-11-19 21:27:51 +00:00
|
|
|
# OPTIONS: ---
|
|
|
|
# REQUIREMENTS: ---
|
|
|
|
# BUGS: ---
|
|
|
|
# NOTES: ---
|
|
|
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
2023-02-19 19:54:25 +00:00
|
|
|
# VERSION: 0.1.4
|
2022-11-19 21:27:51 +00:00
|
|
|
# CREATED: 2013-12-25 12:40:33
|
2023-02-19 19:54:25 +00:00
|
|
|
# REVISION: 2023-01-31 20:45:23
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Released under the terms of the GNU Affero General Public License (AGPLv3)
|
|
|
|
# http://www.gnu.org/licenses/agpl-3.0.html
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
#
|
|
|
|
#===============================================================================
|
|
|
|
|
|
|
|
use 5.030;
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
use utf8;
|
|
|
|
use feature qw{ postderef say signatures state };
|
|
|
|
no warnings qw{ experimental::postderef experimental::signatures } ;
|
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# There's an issue in XML::RSS, so we're using a local version with a hack.
|
|
|
|
# It's in ./lib/ and FindBin::libs looks there to find it.
|
2023-01-11 09:45:38 +00:00
|
|
|
#
|
2023-01-10 20:22:47 +00:00
|
|
|
use FindBin::libs;
|
|
|
|
use XML::RSS;
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
use Getopt::Long;
|
|
|
|
use Pod::Usage;
|
|
|
|
use Config::General;
|
|
|
|
use List::MoreUtils qw{uniq};
|
2023-02-19 19:54:25 +00:00
|
|
|
use Set::Array;
|
2022-11-19 21:27:51 +00:00
|
|
|
use Log::Handler;
|
|
|
|
|
|
|
|
use Try::Tiny;
|
|
|
|
|
|
|
|
use URI;
|
|
|
|
use Net::DNS;
|
|
|
|
use IO::Socket;
|
|
|
|
use LWP::UserAgent;
|
|
|
|
use WWW::RobotRules;
|
2023-01-10 20:22:47 +00:00
|
|
|
#use XML::RSS::Parser;
|
2022-11-19 21:27:51 +00:00
|
|
|
use XML::Feed;
|
2023-01-09 18:20:17 +00:00
|
|
|
use XML::FeedPP; # Fall back?
|
2022-11-19 21:27:51 +00:00
|
|
|
use Feed::Find;
|
|
|
|
|
|
|
|
use Template;
|
|
|
|
use Template::Filters;
|
|
|
|
Template::Filters->use_html_entities; # Use HTML::Entities in the template
|
|
|
|
|
|
|
|
use HTML::Entities;
|
2023-01-14 23:13:49 +00:00
|
|
|
use HTML::Parser ();
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
use IO::Prompter;
|
|
|
|
|
|
|
|
use JSON;
|
|
|
|
|
|
|
|
use DBI;
|
|
|
|
use DateTime::Format::SQLite;
|
|
|
|
use DateTime::Format::ISO8601;
|
|
|
|
use SQL::Abstract;
|
|
|
|
|
|
|
|
use Data::Dumper;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Version number (manually incremented)
|
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
our $VERSION = '0.1.4';
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Script name
|
|
|
|
#
|
|
|
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Declarations
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
my ( $action_mode, @urls, @deletions );
|
|
|
|
my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );
|
|
|
|
my ( $sth1, $h1, $rv, $search_target, $rejectcount );
|
2023-02-19 19:54:25 +00:00
|
|
|
my ( $loadfile, $deletefile ) = ( '', '' );
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
my $feeds;
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#
|
|
|
|
# To be written by a handler subroutine called by HTML::Parser
|
|
|
|
#
|
|
|
|
our $html_title;
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
# File and directory paths
|
|
|
|
#
|
|
|
|
my $basedir = "$ENV{HOME}/HPR/feed_watcher";
|
|
|
|
my $tempdir = "$basedir/tmp";
|
|
|
|
my $configfile = "$basedir/$PROG.cfg";
|
|
|
|
my $logfile = "$basedir/${PROG}.log";
|
|
|
|
my $deftemplate = "$basedir/${PROG}.tpl";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Hash key map to database field names (chevron comments are to stop Perltidy
|
|
|
|
# messing with the layout)
|
|
|
|
#
|
|
|
|
#<<<
|
|
|
|
my %keymap = (
|
|
|
|
AUTHOR => 'author',
|
|
|
|
COPYRIGHT => 'copyright',
|
2023-01-14 23:13:49 +00:00
|
|
|
CHECKTYPE => 'check_type',
|
2022-11-19 21:27:51 +00:00
|
|
|
DESCRIPTION => 'description',
|
|
|
|
DNS => 'dns',
|
|
|
|
# ENCLOSURES => undef,
|
|
|
|
# ENCLOSURE_COUNT => undef,
|
|
|
|
FORMAT => 'feedformat',
|
|
|
|
GENERATOR => 'generator',
|
|
|
|
# HOST => undef,
|
|
|
|
HOSTUP => 'host_up',
|
|
|
|
HTTP_CONTENT_TYPE => 'content_type',
|
|
|
|
HTTP_STATUS => 'http_status',
|
|
|
|
IMAGE => 'image',
|
|
|
|
LANGUAGE => 'language',
|
|
|
|
LINK => 'link',
|
|
|
|
MODIFIED => 'modified',
|
|
|
|
# PORT => undef,
|
2023-01-14 23:13:49 +00:00
|
|
|
REASON_ACCEPTED => 'reason_accepted',
|
2022-11-19 21:27:51 +00:00
|
|
|
# ROBOTS => undef,
|
|
|
|
# SAVE => undef,
|
|
|
|
TITLE => 'title',
|
|
|
|
TYPE => 'urltype',
|
|
|
|
URI => 'url',
|
|
|
|
# URI_ID => undef,
|
|
|
|
);
|
|
|
|
#>>>
|
|
|
|
|
|
|
|
#
|
|
|
|
# Headers for LWP::UserAgent
|
|
|
|
#
|
|
|
|
my %headers = (
|
|
|
|
'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
|
|
|
|
. 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
|
|
|
|
'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
|
|
|
|
. 'image/pjpeg, image/png, */*',
|
|
|
|
'Accept-Charset' => 'iso-8859-1,*,utf-8',
|
|
|
|
'Accept-Language' => 'en-US',
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Enable Unicode mode
|
|
|
|
#
|
|
|
|
binmode STDOUT, ":encoding(UTF-8)";
|
|
|
|
binmode STDERR, ":encoding(UTF-8)";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Options and arguments
|
|
|
|
#
|
|
|
|
my %options;
|
|
|
|
Options( \%options );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Any arguments are taken to be URLs
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
@urls = @ARGV;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Default help
|
|
|
|
#
|
2023-01-09 18:20:17 +00:00
|
|
|
pod2usage( -msg => "Version $VERSION\n", -exitval => 1, -verbose => 0 )
|
2022-11-19 21:27:51 +00:00
|
|
|
if ( $options{'help'} );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Detailed help
|
|
|
|
#
|
2023-01-09 18:20:17 +00:00
|
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
|
2022-11-19 21:27:51 +00:00
|
|
|
if ( $options{'manpage'} );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Collect options
|
|
|
|
#
|
|
|
|
my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );
|
|
|
|
|
|
|
|
my $cfgfile
|
|
|
|
= ( defined( $options{config} ) ? $options{config} : $configfile );
|
2023-01-14 23:13:49 +00:00
|
|
|
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
|
|
|
|
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
|
|
|
|
|
2023-02-19 19:54:25 +00:00
|
|
|
my $load = $options{'load'};
|
|
|
|
my $delete = $options{'delete'};
|
|
|
|
my $inputfile = $options{'input'};
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-24 22:44:11 +00:00
|
|
|
my $scan = ( defined( $options{scan} ) ? $options{scan} : 0 );
|
|
|
|
my $refresh = ( defined( $options{refresh} ) ? $options{refresh} : 0 );
|
2023-02-19 19:54:25 +00:00
|
|
|
my $expire = ( defined( $options{expire} ) ? $options{expire} : 0 );
|
2023-01-24 22:44:11 +00:00
|
|
|
my $html = ( defined( $options{html} ) ? $options{html} : 0 );
|
2023-02-19 19:54:25 +00:00
|
|
|
my $ignore_case
|
|
|
|
= ( defined( $options{'ignore-case'} ) ? $options{'ignore-case'} : 0 );
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
my $check = $options{check};
|
|
|
|
my $outfile = $options{out};
|
|
|
|
my $rejectfile = $options{rejects};
|
|
|
|
my $report = $options{report};
|
|
|
|
my $json = $options{json};
|
|
|
|
my $opml = $options{opml};
|
|
|
|
my $template = $options{template};
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Validate and process options
|
|
|
|
#-------------------------------------------------------------------------------
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# Sanity
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
die "Choose either -load or -delete, not both\n"
|
2023-02-19 19:54:25 +00:00
|
|
|
if (defined($load) && defined($delete));
|
2023-01-24 22:44:11 +00:00
|
|
|
die "Options -load and -delete should not be combined with -scan or -refresh\n"
|
2023-02-19 19:54:25 +00:00
|
|
|
if ( ( defined($load) || defined($delete) ) && ( $scan || $refresh ) );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the -input=FILE option is used with -load or -delete and confirm the
|
|
|
|
# existence and readability of the input file if specified.
|
|
|
|
#
|
|
|
|
if ( defined($inputfile) ) {
|
|
|
|
die "Option -input=FILE must be used with -load or -delete\n"
|
|
|
|
unless ($load || $delete);
|
|
|
|
die "File in '-input=$inputfile' does not exist\n"
|
|
|
|
unless -e $inputfile;
|
|
|
|
die "File in '-input=$inputfile' is not readable\n"
|
|
|
|
unless -r $inputfile;
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# Check the configuration file
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
# Determine the action mode, reading a file and/or using URLs on the command
|
|
|
|
# line.
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
if ($load) {
|
2023-01-14 23:13:49 +00:00
|
|
|
$action_mode = 'load';
|
2023-02-19 19:54:25 +00:00
|
|
|
$loadfile = $inputfile;
|
2023-01-14 23:13:49 +00:00
|
|
|
_debug(
|
|
|
|
$DEBUG > 0,
|
|
|
|
"Action mode: $action_mode",
|
2023-02-19 19:54:25 +00:00
|
|
|
( defined($loadfile)
|
|
|
|
? "File to load: $loadfile"
|
|
|
|
: "Load from arguments"
|
2023-01-14 23:13:49 +00:00
|
|
|
)
|
|
|
|
);
|
|
|
|
}
|
2023-02-19 19:54:25 +00:00
|
|
|
elsif ($delete) {
|
2023-01-14 23:13:49 +00:00
|
|
|
$action_mode = 'delete';
|
2023-02-19 19:54:25 +00:00
|
|
|
$deletefile = $inputfile;
|
2023-01-14 23:13:49 +00:00
|
|
|
_debug(
|
|
|
|
$DEBUG > 0,
|
|
|
|
"Action mode: $action_mode",
|
2023-02-19 19:54:25 +00:00
|
|
|
( defined($deletefile)
|
|
|
|
? "File to delete from: $deletefile"
|
|
|
|
: "Delete from arguments"
|
2023-01-14 23:13:49 +00:00
|
|
|
)
|
|
|
|
);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$action_mode = 'none';
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
|
2023-02-19 19:54:25 +00:00
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# The copyright checking mode defaults to 'auto' if the option has no value,
|
|
|
|
# or may be 'manual' or 'none'. If the option is not used at all it defaults
|
|
|
|
# to 'none'. It's only relevant to the 'load' action though.
|
2023-01-09 18:20:17 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
if ( $action_mode eq 'load' ) {
|
2023-02-19 19:54:25 +00:00
|
|
|
# if ( $action_mode eq 'load' && defined($check) ) {
|
|
|
|
if ( defined($check) ) {
|
2023-01-14 23:13:49 +00:00
|
|
|
$check =~ s/(^\s+|\s+$)//g;
|
|
|
|
if ($check =~ /^$/) {
|
|
|
|
$check = "auto";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$check = lc($check);
|
|
|
|
die "Invalid option '-check=$check'\n" .
|
|
|
|
"Values are <blank>, auto and manual\n"
|
|
|
|
unless ($check =~ /^(auto|manual|none)$/)
|
|
|
|
}
|
2023-01-09 18:20:17 +00:00
|
|
|
}
|
|
|
|
else {
|
2023-01-14 23:13:49 +00:00
|
|
|
$check = 'none';
|
2023-01-09 18:20:17 +00:00
|
|
|
}
|
2023-01-14 23:13:49 +00:00
|
|
|
emit($silent,"Copyright check mode = $check\n");
|
2023-01-09 18:20:17 +00:00
|
|
|
}
|
|
|
|
|
2023-02-19 19:54:25 +00:00
|
|
|
if ($dry_run) {
|
|
|
|
emit( $silent, "Dry run mode = On\n" );
|
|
|
|
emit( $silent, "----\n" );
|
|
|
|
}
|
2023-01-14 23:13:49 +00:00
|
|
|
|
|
|
|
# TODO: Does it make sense to have -load and -report, etc at the same time?
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
# We accept -report, meaning report everything or -report='title' to report
|
2023-01-14 23:13:49 +00:00
|
|
|
# just the feed with the given title (actually, the title which contains the
|
|
|
|
# given string).
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
if ( defined($report) ) {
|
|
|
|
if ($report =~ /^$/) {
|
|
|
|
$search_target = undef;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$search_target = $report;
|
|
|
|
}
|
|
|
|
}
|
2023-01-14 23:13:49 +00:00
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
# We accept -json or -json=filename. In the former case we make a default
|
|
|
|
# name, otherwise we use the one provided.
|
|
|
|
#
|
|
|
|
if ( defined($json) ) {
|
|
|
|
if ($json =~ /^$/) {
|
2023-01-09 18:20:17 +00:00
|
|
|
$json = "$PROG.json";
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# We accept -opml or -opml=filename. In the former case we make a default
|
|
|
|
# name, otherwise we use the one provided.
|
|
|
|
#
|
|
|
|
if ( defined($opml) ) {
|
|
|
|
if ($opml =~ /^$/) {
|
|
|
|
$opml = "$PROG.opml"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Similarly we accept -template or -template=filename. In the former case we
|
|
|
|
# make a default name, otherwise we use the one provided.
|
|
|
|
#
|
|
|
|
if ( defined($template) ) {
|
|
|
|
if ($template =~ /^$/) {
|
|
|
|
$template = "$deftemplate";
|
|
|
|
}
|
|
|
|
|
|
|
|
die "Error: Unable to find template $template\n" unless -r $template;
|
|
|
|
}
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
2022-11-19 21:27:51 +00:00
|
|
|
# Load configuration data
|
2023-01-14 23:13:49 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
2022-11-19 21:27:51 +00:00
|
|
|
my $conf = new Config::General(
|
|
|
|
-ConfigFile => $cfgfile,
|
|
|
|
-InterPolateVars => 1,
|
|
|
|
-ExtendedAccess => 1
|
|
|
|
);
|
|
|
|
my %config = $conf->getall();
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
2022-11-19 21:27:51 +00:00
|
|
|
# Connect to the database
|
2023-01-14 23:13:49 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
2022-11-19 21:27:51 +00:00
|
|
|
my $dbtype = $config{database}->{type};
|
|
|
|
my $dbfile = $config{database}->{file};
|
|
|
|
my $dbuser = $config{database}->{user};
|
|
|
|
my $dbpwd = $config{database}->{password};
|
|
|
|
my $dbh
|
|
|
|
= DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
|
|
|
|
{ AutoCommit => 1, sqlite_unicode => 1, } )
|
|
|
|
or die $DBI::errstr;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
|
|
|
|
# foreign key constraints to function)
|
|
|
|
#
|
|
|
|
$dbh->do('PRAGMA foreign_keys = ON');
|
|
|
|
|
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
# Check we have something to do. NOTE: this check is a bit early because we
|
|
|
|
# haven't read the input file yet, if there is one.
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
|
|
|
|
my $work = (
|
2023-02-19 19:54:25 +00:00
|
|
|
( ( scalar(@urls) > 0 || defined($inputfile) )
|
|
|
|
&& $action_mode =~ /load|delete/
|
|
|
|
)
|
|
|
|
|| ( defined($report)
|
|
|
|
|| defined($json)
|
|
|
|
|| defined($opml)
|
|
|
|
|| defined($template) )
|
|
|
|
|| ( ( $scan || $refresh ) && $rows > 0 )
|
2022-11-19 21:27:51 +00:00
|
|
|
);
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
unless ($work) {
|
|
|
|
print STDERR "Nothing to do!\n";
|
|
|
|
exit;
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
2023-01-27 20:33:31 +00:00
|
|
|
# Set up logging keeping the default log layout except for the date. Enable
|
|
|
|
# 'utf-8' mode (documented in Log::Handler::Output::File).
|
2022-11-19 21:27:51 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
my $LOG = Log::Handler->new();
|
|
|
|
|
|
|
|
$LOG->add(
|
|
|
|
file => {
|
|
|
|
timeformat => "%Y/%m/%d %H:%M:%S",
|
|
|
|
filename => $logfile,
|
|
|
|
minlevel => 0,
|
|
|
|
maxlevel => 7,
|
2023-01-27 20:33:31 +00:00
|
|
|
'utf-8' => 1,
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
);
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Open the output file (or STDOUT) - we may need the date to do it
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
my $outfh;
|
|
|
|
if ($outfile) {
|
|
|
|
open( $outfh, ">:encoding(UTF-8)", $outfile )
|
|
|
|
or warn "Unable to open $outfile for writing: $!";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
|
|
|
|
or warn "Unable to initialise for writing: $!";
|
|
|
|
}
|
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Open the rejects file if requested, otherwise we don't write reject data
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
my $rejectfh;
|
|
|
|
if ($rejectfile) {
|
|
|
|
if ($rejectfile =~ /^$/) {
|
|
|
|
$rejectfile = "${PROG}_rejected_URLs.txt";
|
|
|
|
}
|
|
|
|
|
|
|
|
open( $rejectfh, ">:encoding(UTF-8)", $rejectfile )
|
|
|
|
or warn "Unable to open $rejectfile for writing: $!";
|
|
|
|
|
|
|
|
$rejectcount = 0;
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Set up a robot.txt rules parser
|
|
|
|
#
|
|
|
|
$rules = WWW::RobotRules->new($robot_name);
|
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
2023-01-14 23:13:49 +00:00
|
|
|
# Check the mode we are in and prepare to load or delete according to the
|
|
|
|
# answer.
|
2023-01-09 18:20:17 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
2023-01-14 23:13:49 +00:00
|
|
|
if ($action_mode eq 'load') {
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# Slurp the load file into @urls if the file is provided
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
if ($loadfile) {
|
|
|
|
#
|
|
|
|
# Load the input file
|
|
|
|
#
|
|
|
|
open( my $in, '<:encoding(utf8)', $loadfile )
|
|
|
|
or die "$PROG : failed to open load file '$loadfile' : $!\n";
|
|
|
|
chomp( my @loaded = <$in> );
|
|
|
|
close($in)
|
|
|
|
or warn "$PROG : failed to close load file '$loadfile' : $!\n";
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
$LOG->info("Loaded URLs from $loadfile");
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#
|
|
|
|
# Add the loaded URLs to the array
|
|
|
|
#
|
|
|
|
push( @urls, @loaded );
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# Now, we either have URLs from the command line, or from the load file (or
|
|
|
|
# both), so we process these.
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# It's a loop because 'loadUrls' might find some more URLs by scanning HTML
|
|
|
|
# URLs if given them. If it does we replace @urls with the found URLs and
|
|
|
|
# go again. When there's nothing returned the loop stops.
|
|
|
|
# ----
|
|
|
|
# NOTE: This seems dirty, but all the 'while' is testing is whether the array
|
|
|
|
# contains anything or not. It's not iterating over it or anything, which would
|
|
|
|
# be messy!
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
while (@urls) {
|
|
|
|
#
|
|
|
|
# Remove duplicates, finish if it deletes them all!
|
|
|
|
#
|
|
|
|
@urls = uniq(@urls);
|
|
|
|
last unless @urls;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#
|
|
|
|
# Remove any commented out lines, finish if it deletes them all!
|
|
|
|
#
|
|
|
|
@urls = grep {!/^\s*#/} @urls;
|
|
|
|
last unless @urls;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
$LOG->info( "Loading ", scalar(@urls), " URLs to the database" );
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#
|
|
|
|
# Load these URLs as appropriate, returning any more that we find by
|
|
|
|
# following HTML urls. We overwrite the original list and start all over
|
|
|
|
# again.
|
|
|
|
#
|
|
|
|
@urls = loadUrls( $dbh, \@urls, $rules, \%keymap, $dry_run );
|
|
|
|
}
|
|
|
|
}
|
2023-02-19 19:54:25 +00:00
|
|
|
elsif ( $action_mode eq 'delete' ) {
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# Process the delete file if there is one
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
if ($deletefile) {
|
|
|
|
#
|
|
|
|
# Load the delete file
|
|
|
|
#
|
|
|
|
open( my $del, '<:encoding(utf8)', $deletefile )
|
|
|
|
or die "$PROG : failed to open load file '$deletefile' : $!\n";
|
|
|
|
chomp( @deletions = <$del> );
|
|
|
|
close($del)
|
2023-02-19 19:54:25 +00:00
|
|
|
or warn
|
|
|
|
"$PROG : failed to close delete file '$deletefile' : $!\n";
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#
|
|
|
|
# Add the loaded URLs to the array
|
|
|
|
#
|
|
|
|
push( @urls, @deletions );
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#
|
|
|
|
# Remove duplicates
|
|
|
|
#
|
|
|
|
@urls = uniq(@urls);
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
if (@urls) {
|
|
|
|
#
|
|
|
|
# TODO: check that these URLs are actually in the database! Seems
|
|
|
|
# silly to report "Failed to delete" when it's not there anyway!
|
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# There are URLs to delete. Process them one by one.
|
|
|
|
#
|
|
|
|
if ($dry_run) {
|
|
|
|
emit( $silent,
|
2023-02-19 19:54:25 +00:00
|
|
|
"Would have deleted "
|
|
|
|
. scalar(@urls)
|
|
|
|
. " URLs (after checking)\n" );
|
2023-01-14 23:13:49 +00:00
|
|
|
}
|
|
|
|
else {
|
2023-02-19 19:54:25 +00:00
|
|
|
my @missing;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the URLs exist
|
|
|
|
#
|
|
|
|
$sth1 = $dbh->prepare(q{SELECT id from urls WHERE url = ?});
|
2023-01-14 23:13:49 +00:00
|
|
|
foreach my $rec (@urls) {
|
|
|
|
$rv = $sth1->execute($rec);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
2023-02-19 19:54:25 +00:00
|
|
|
$h1 = $sth1->fetchrow_hashref;
|
|
|
|
unless ($h1) {
|
|
|
|
emit( $silent, "Could not find URL $rec in the database\n" );
|
|
|
|
$LOG->warning(
|
|
|
|
"Failed to delete '$rec'; not in the database");
|
|
|
|
push( @missing, $rec );
|
2023-01-14 23:13:49 +00:00
|
|
|
}
|
2023-02-19 19:54:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Remove the missing URLs from @urls
|
|
|
|
#
|
|
|
|
if (@missing) {
|
|
|
|
my $sa1 = Set::Array->new(@urls);
|
|
|
|
my $sa2 = Set::Array->new(@missing);
|
|
|
|
@urls = $sa1->difference($sa2);
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# If nothing is left we're done
|
|
|
|
#
|
|
|
|
unless (@urls) {
|
|
|
|
warn "No URLs left after cleaning\n";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
|
|
|
# Delete what's left after cleaning
|
|
|
|
#
|
|
|
|
$sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
|
|
|
|
foreach my $rec (@urls) {
|
|
|
|
$rv = $sth1->execute($rec);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
if ( $rv != 0 ) {
|
|
|
|
emit( $silent, "Deleted $rec ($rv rows)\n" );
|
|
|
|
$LOG->info("Deleted URL '$rec' from the database");
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Failed to delete $rec\n" );
|
|
|
|
$LOG->warning(
|
|
|
|
"Failed to delete '$rec' from the database");
|
|
|
|
}
|
2023-01-14 23:13:49 +00:00
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Perform a database scan if requested
|
2023-01-09 18:20:17 +00:00
|
|
|
# TODO: Needs to be developed; does nothing at the moment.
|
2022-11-19 21:27:51 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($scan) {
|
2023-01-24 22:44:11 +00:00
|
|
|
warn "Refresh is not implemented yet\n";
|
|
|
|
#$LOG->warning( "Scan is not fully implemented yet" );
|
2022-11-20 22:49:57 +00:00
|
|
|
# Testing. Processes the first two feeds
|
2023-01-24 22:44:11 +00:00
|
|
|
# TODO: Currently broken
|
|
|
|
#scanDB($dbh, \%keymap, $dry_run);
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Perform a feed refresh
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($refresh) {
|
|
|
|
warn "Refresh is not implemented yet";
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Report all or selected database contents if requested
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ( defined($report) ) {
|
|
|
|
if ( defined($search_target) ) {
|
|
|
|
#
|
|
|
|
# Reporting a specific title
|
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
my @matches = searchTitle( $dbh, $search_target, $ignore_case );
|
2022-11-19 21:27:51 +00:00
|
|
|
if (@matches) {
|
|
|
|
#
|
|
|
|
# Too many matches!
|
|
|
|
#
|
|
|
|
if ( $#matches > 1 ) {
|
|
|
|
say "Multiple matches:\n";
|
|
|
|
say "- ", join( "\n- ", @matches );
|
|
|
|
say "\nTry again with a different title";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
|
|
|
# One match, report it
|
|
|
|
#
|
|
|
|
reportFeed( findFeed( $dbh, $matches[0] ), $outfh );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
say "Feed not found with target '$search_target'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
|
|
|
# Reporting everything
|
|
|
|
#
|
|
|
|
reportDB( $dbh, $outfh );
|
|
|
|
$LOG->info("Report generated");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Do any image repairs we need, but temporarily
|
|
|
|
#
|
|
|
|
if ($json || $opml || $template) {
|
|
|
|
$feeds = collectData($dbh);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Clean up the image references (until we have a proper fix)
|
|
|
|
#
|
|
|
|
for (my $i = 0; $i < scalar(@$feeds); $i++) {
|
|
|
|
my $f = $feeds->[$i];
|
|
|
|
if (defined($f->{urls_image}) && $f->{urls_image} =~ /^ARRAY\([^)]+\)/) {
|
|
|
|
$f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
|
|
|
|
$LOG->info( "Fixed image for $f->{urls_url}" );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Output all feeds to a JSON file if requested
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($json) {
|
|
|
|
my $js = JSON->new->utf8->canonical->pretty;
|
|
|
|
open( my $out, '>:encoding(UTF-8)', $json )
|
|
|
|
or die "Unable to open output file $json $!\n";
|
|
|
|
|
|
|
|
print $out $js->encode($feeds), "\n";
|
|
|
|
close($out);
|
|
|
|
emit ( $silent, "JSON data written to $json\n" );
|
|
|
|
$LOG->info( "JSON data written to $json" );
|
|
|
|
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Requesting the -opml option means to dump the entire feed table as OPML
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($opml) {
|
|
|
|
require XML::OPML;
|
|
|
|
|
|
|
|
open( my $out, '>:encoding(UTF-8)', $opml )
|
|
|
|
or die "Unable to open output file $opml $!\n";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Start building OPML. Mandatory attributes are 'type', 'text' and
|
|
|
|
# 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
|
|
|
|
# attribute is poorly defined; this module uses 'rss' as the type and
|
|
|
|
# 'RSS' as the value of the 'version' attribute. This is not linked to the
|
|
|
|
# type of the feed.
|
|
|
|
#
|
|
|
|
my $OPML = XML::OPML->new(version => '1.1');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Create the 'head' element
|
|
|
|
#
|
|
|
|
my $DT = normaliseDT(DateTime->now());
|
|
|
|
$OPML->head(
|
|
|
|
title => 'Free Culture Podcasts',
|
|
|
|
dateCreated => $DT,
|
|
|
|
dateModified => $DT,
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Walk the feeds generating OPML
|
|
|
|
#
|
|
|
|
for (my $i = 0; $i < scalar(@$feeds); $i++) {
|
|
|
|
my $f = $feeds->[$i];
|
|
|
|
|
|
|
|
$OPML->add_outline(
|
|
|
|
title => $f->{urls_title},
|
|
|
|
text => $f->{urls_title},
|
|
|
|
description => $f->{urls_description},
|
|
|
|
xmlUrl => $f->{urls_url},
|
|
|
|
htmlUrl => $f->{urls_link},
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Output OPML to the nominated file
|
|
|
|
#
|
|
|
|
print $out $OPML->as_string;
|
|
|
|
|
|
|
|
close($out);
|
|
|
|
emit ( $silent, "OPML data written to $opml\n" );
|
|
|
|
$LOG->info( "OPML data written to $opml" );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
2023-01-14 23:13:49 +00:00
|
|
|
# Fill and print the TT² template if requested
|
2022-11-19 21:27:51 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
if ($template) {
|
|
|
|
my $tt = Template->new(
|
|
|
|
{ ABSOLUTE => 1,
|
|
|
|
ENCODING => 'utf8',
|
|
|
|
INCLUDE_PATH => $basedir,
|
|
|
|
}
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Make the structure the template needs
|
|
|
|
#
|
|
|
|
my $vars = {
|
|
|
|
feeds => $feeds,
|
|
|
|
};
|
|
|
|
|
|
|
|
# print Dumper($vars),"\n";
|
|
|
|
|
|
|
|
my $document;
|
|
|
|
$tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
|
|
|
|
|| die $tt->error(), "\n";
|
|
|
|
print $outfh $document;
|
|
|
|
|
|
|
|
emit ( $silent, "Data processed with $template\n" );
|
|
|
|
$LOG->info( "Data processed with $template" );
|
|
|
|
}
|
|
|
|
|
|
|
|
#$dbh->disconnect;
|
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
if ($rejectfile) {
|
|
|
|
emit( $silent,
|
|
|
|
"Number of rejected URLs written to $rejectfile is $rejectcount\n" );
|
|
|
|
}
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
exit;
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: loadUrls
|
2023-01-14 23:13:49 +00:00
|
|
|
# PURPOSE: To load URLs read from the input file (and the arguments) into
|
|
|
|
# the database
|
2022-11-19 21:27:51 +00:00
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $new_urls arrayref containing URLs
|
|
|
|
# $rules WWW::RobotRules object
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
2023-01-14 23:13:49 +00:00
|
|
|
# $dry_run Boolean, set if in dry-run mode
|
2022-11-19 21:27:51 +00:00
|
|
|
# RETURNS: Any new URLs discovered by investigating non-feed URLs.
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub loadUrls {
|
2023-01-14 23:13:49 +00:00
|
|
|
my ( $dbh, $new_urls, $rules, $keymap, $dry_run ) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
my ( $stream, $feed );
|
|
|
|
my ( %uridata, $roboturl, @found_urls );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Work through the list of URLs
|
|
|
|
#
|
2023-01-09 18:20:17 +00:00
|
|
|
foreach my $rec (@$new_urls) {
|
2022-11-19 21:27:51 +00:00
|
|
|
%uridata = ();
|
|
|
|
|
|
|
|
#
|
|
|
|
# By default save the collected data
|
|
|
|
#
|
|
|
|
$uridata{SAVE} = 1;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check that we have a valid URL. We don't save them if they are
|
|
|
|
# invalid ($uridata{SAVE} is set to 0 in the routine).
|
|
|
|
#
|
|
|
|
my $uri = validateURI($rec,\%uridata);
|
|
|
|
next unless $uri;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check whether we already have this URI in the database
|
|
|
|
#
|
|
|
|
if (dbSearch(
|
|
|
|
$dbh, 'SELECT * FROM urls WHERE url = ?',
|
|
|
|
$uridata{URI}
|
|
|
|
)
|
|
|
|
)
|
|
|
|
{
|
|
|
|
emit( $silent, "$uri is already in the database\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the hostname resolves in DNS
|
|
|
|
#
|
|
|
|
if ( checkDNS( $uri, \%uridata ) ) {
|
|
|
|
$uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
|
|
|
|
emit( $silent, "DNS: $uridata{DNS}\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "$uri has no DNS entry\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the server is available. Don't save if it's not (probably will
|
|
|
|
# not get the feed anyway).
|
|
|
|
#
|
|
|
|
if ( serverUp( $uri, \%uridata ) ) {
|
|
|
|
emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the site's robots.txt. If there's a block then don't save.
|
|
|
|
#
|
|
|
|
if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
|
|
|
|
emit( $silent, "Check of robots.txt rules succeeded\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if ( $uridata{ROBOTS} =~ /404/ ) {
|
|
|
|
emit( $silent, "Search for robots.txt file failed\n" );
|
|
|
|
}
|
|
|
|
else {
|
2023-01-14 23:13:49 +00:00
|
|
|
emit( $silent, "Check of robots.txt rules blocks access\n" );
|
2022-11-19 21:27:51 +00:00
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Look for the HTTP content type. Don't save if the request failed.
|
|
|
|
#
|
|
|
|
if ( checkContentType( $uri, \%uridata, \%headers, \@found_urls, $LOG ) ) {
|
2023-01-09 18:20:17 +00:00
|
|
|
emit( $silent, "HTTP request to check type OK\n" );
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "HTTP request failed\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Parse the feed
|
|
|
|
#
|
|
|
|
if ( $uridata{TYPE} eq 'Feed' ) {
|
|
|
|
$LOG->info('Processing feed: ',$uridata{URI});
|
|
|
|
$stream = getFeed( $uridata{URI} );
|
|
|
|
if ($stream) {
|
|
|
|
$feed = parseFeed( $uridata{URI}, $stream );
|
|
|
|
unless ( $feed ) {
|
|
|
|
$uridata{SAVE} = 0;
|
2023-01-09 18:20:17 +00:00
|
|
|
emit( $silent, "Feed did not parse $uridata{URI}\n" );
|
|
|
|
$LOG->warning('Feed did not parse: ',$uridata{URI});
|
2022-11-19 21:27:51 +00:00
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Provide a means of examining the returned feed object. There
|
|
|
|
# are some weird behaviours in there (partly because of the
|
|
|
|
# weirdness of RSS and poor adherence to what standards there
|
|
|
|
# are).
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
_debug( $DEBUG > 2, Dumper($feed));
|
2022-11-19 21:27:51 +00:00
|
|
|
storeFeed($feed,\%uridata);
|
|
|
|
|
|
|
|
#
|
2023-01-09 18:20:17 +00:00
|
|
|
# Perform a check on the copyright. The routine sets
|
2022-11-19 21:27:51 +00:00
|
|
|
# $uridata{SAVE} = 0 if the copyright is not acceptable.
|
|
|
|
#
|
2023-01-24 22:44:11 +00:00
|
|
|
$uridata{CHECKTYPE} = $check;
|
2023-01-09 18:20:17 +00:00
|
|
|
if ( $check ne 'none' ) {
|
|
|
|
unless (checkCopyright( $check, \%uridata )) {
|
|
|
|
#
|
|
|
|
# Rejected, write URL to a file if requested
|
|
|
|
#
|
|
|
|
if ($rejectfile) {
|
|
|
|
printf $rejectfh "%s\n", $uridata{URI};
|
|
|
|
$rejectcount++;
|
|
|
|
}
|
|
|
|
next;
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit($silent, "Problem collecting feed");
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
elsif ( $uridata{TYPE} eq 'HTML' ) {
|
|
|
|
#
|
|
|
|
# TODO Save the URL and find any feeds it contains
|
|
|
|
#
|
|
|
|
if (@found_urls) {
|
|
|
|
emit( $silent,
|
|
|
|
"Type $uridata{TYPE} contained "
|
|
|
|
. scalar(@found_urls)
|
2023-01-10 20:22:47 +00:00
|
|
|
. " feeds; queued to be checked for inclusion\n" );
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent,
|
|
|
|
"Type $uridata{TYPE} contained no feeds; skipped\n" );
|
|
|
|
}
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent,
|
|
|
|
"Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
|
|
|
|
$uridata{SAVE} = 0;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
continue {
|
|
|
|
# { This 'continue' block is executed for each iteration or 'next' }
|
|
|
|
|
|
|
|
#
|
|
|
|
# Decide whether to save what we have collected
|
|
|
|
#
|
|
|
|
if ( $uridata{SAVE} ) {
|
2023-01-14 23:13:49 +00:00
|
|
|
if ( addURI( $dbh, \%uridata, $keymap, $dry_run ) ) {
|
2022-11-19 21:27:51 +00:00
|
|
|
emit( $silent, "$uridata{URI} added to the database\n" );
|
|
|
|
$LOG->info("$uridata{TYPE} ",$uridata{URI},' added to the database');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Get the id the database allocated for the row we added
|
|
|
|
#
|
|
|
|
$uridata{URI_ID}
|
|
|
|
= $dbh->last_insert_id( undef, undef, 'urls', undef );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Add any enclosures we found
|
|
|
|
#
|
|
|
|
if ( defined( $uridata{ENCLOSURE_COUNT} )
|
|
|
|
&& $uridata{ENCLOSURE_COUNT} > 0 )
|
|
|
|
{
|
2023-01-14 23:13:49 +00:00
|
|
|
if ( addEnclosures( $dbh, \%uridata, $dry_run ) ) {
|
2022-11-19 21:27:51 +00:00
|
|
|
emit( $silent, $uridata{ENCLOSURE_COUNT},
|
|
|
|
" enclosures for $uridata{URI} added to the database\n"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent,
|
|
|
|
"$uridata{URI} was not added to the database\n" );
|
|
|
|
$LOG->info("$uridata{TYPE} ",$uridata{URI},' not added to the database');
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Dump what we have if requested
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
_debug( $DEBUG > 1, Dumper( \%uridata ) );
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
emit( $silent, '-' x 80, "\n" );
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Return any further urls we found
|
|
|
|
#
|
|
|
|
return @found_urls;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: searchTitle
|
|
|
|
# PURPOSE: Search the database for a feed with a given title
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $target search target
|
2023-02-19 19:54:25 +00:00
|
|
|
# $ignore_case Boolean controlling whether it's a caseles
|
|
|
|
# search
|
2022-11-19 21:27:51 +00:00
|
|
|
# RETURNS: A list of titles
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub searchTitle {
|
2023-02-19 19:54:25 +00:00
|
|
|
my ($dbh, $target, $ignore_case) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
my ( $sql1, $sql2, $sth, $rv, $h );
|
|
|
|
my ( $count, @result );
|
|
|
|
|
|
|
|
$sql1 = q{
|
|
|
|
SELECT count(*) AS count
|
|
|
|
FROM urls
|
|
|
|
WHERE urltype = 'Feed'
|
|
|
|
AND title REGEXP ?
|
|
|
|
};
|
|
|
|
|
|
|
|
$sql2 = q{
|
|
|
|
SELECT title
|
|
|
|
FROM urls
|
|
|
|
WHERE urltype = 'Feed'
|
|
|
|
AND title REGEXP ?
|
|
|
|
ORDER BY title
|
|
|
|
};
|
|
|
|
|
2023-02-19 19:54:25 +00:00
|
|
|
#
|
|
|
|
# Handle caseless searches
|
|
|
|
#
|
|
|
|
$target = ($ignore_case ? '(?i)' : '') . $target;
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
# Count the number of matches
|
|
|
|
#
|
|
|
|
$sth = $dbh->prepare($sql1);
|
2023-02-19 19:54:25 +00:00
|
|
|
$rv = $sth->execute($target);
|
2022-11-19 21:27:51 +00:00
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Proceed if we have 1 or more matches
|
|
|
|
#
|
|
|
|
if ( $h = $sth->fetchrow_hashref ) {
|
|
|
|
$count = $h->{count};
|
|
|
|
$sth->finish;
|
|
|
|
|
|
|
|
if ( $count >= 1 ) {
|
|
|
|
$sth = $dbh->prepare($sql2);
|
2023-02-19 19:54:25 +00:00
|
|
|
$rv = $sth->execute($target);
|
2022-11-19 21:27:51 +00:00
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
while ( $h = $sth->fetchrow_hashref ) {
|
|
|
|
push(@result, $h->{title});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return @result;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-24 22:44:11 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: refreshFeeds
|
|
|
|
# PURPOSE: To refresh the episodes on all feeds
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub refreshFeeds {
|
|
|
|
my ($dbh) = @_;
|
|
|
|
|
|
|
|
my ( $sql1, $sth1, $rv1, $h1 );
|
|
|
|
my ( $aref, @urls, $DT, $stream, $feed );
|
|
|
|
my ( %uridata, $encref, $enc_changes );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to return all feed URLs
|
|
|
|
#
|
|
|
|
$sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};
|
|
|
|
|
|
|
|
$sth1 = $dbh->prepare($sql1);
|
|
|
|
$rv1 = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Collect everything as an arrayref pointing to a bunch of arrayrefs
|
|
|
|
# containing the column details requested
|
|
|
|
#
|
|
|
|
$aref = $sth1->fetchall_arrayref;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Extract just the URL strings
|
|
|
|
#
|
|
|
|
@urls = map { $_->[1] } @{$aref};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Loop through the feed URLs
|
|
|
|
#
|
|
|
|
foreach my $url (@urls) {
|
|
|
|
#
|
|
|
|
# Get the feed as XML
|
|
|
|
#
|
|
|
|
$stream = getFeed($url);
|
|
|
|
next unless $stream;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Parse the feed as an XML::Feed object
|
|
|
|
#
|
|
|
|
$feed = parseFeed($url,$stream);
|
|
|
|
next unless $feed;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Turn the enclosures in the feed into an array of anonymous hashes
|
|
|
|
#
|
|
|
|
$encref = extractEnclosures($feed);
|
|
|
|
}
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: scanDB
|
|
|
|
# PURPOSE: To scan the URLs in the database and update the stored data
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
2023-01-14 23:13:49 +00:00
|
|
|
# $dry_run Boolean, set if in dry-run mode
|
2022-11-19 21:27:51 +00:00
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub scanDB {
|
2023-01-14 23:13:49 +00:00
|
|
|
my ($dbh, $keymap, $dry_run) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
# TODO: dry-run mode not implemented here yet
|
2022-11-19 21:27:51 +00:00
|
|
|
my ( $sql1, $sth1, $rv1, $h1 );
|
|
|
|
my ( $aref, @urls, $DT, $stream, $feed );
|
|
|
|
my ( %uridata, $urichanges, $enc_changes );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to return all feed URLs
|
|
|
|
#
|
|
|
|
$sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};
|
|
|
|
|
|
|
|
$sth1 = $dbh->prepare($sql1);
|
|
|
|
$rv1 = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Collect everything as an arrayref pointing to a bunch of arrayrefs
|
|
|
|
# containing the column details requested
|
|
|
|
#
|
|
|
|
$aref = $sth1->fetchall_arrayref;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Extract just the URL strings
|
|
|
|
#
|
|
|
|
@urls = map { $_->[1] } @{$aref};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Now process these URLs from the database one at a time
|
|
|
|
#
|
|
|
|
# $sql1 = q{SELECT * FROM urls WHERE url = ?};
|
|
|
|
# $sth1 = $dbh->prepare($sql1);
|
|
|
|
|
|
|
|
my $count = 0;
|
|
|
|
|
|
|
|
foreach my $url (@urls) {
|
|
|
|
%uridata = ();
|
|
|
|
|
|
|
|
$count++;
|
|
|
|
|
|
|
|
scanFeed($dbh,$url,\%uridata);
|
|
|
|
|
|
|
|
# #
|
|
|
|
# # Record the scan for this URL
|
|
|
|
# #
|
|
|
|
# $DT = normaliseDT( DateTime->now() );
|
|
|
|
# $uridata{SCANNED_ON} = $DT;
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Default to OK
|
|
|
|
# #
|
|
|
|
# $uridata{SCAN_OK} = 1;
|
|
|
|
#
|
|
|
|
# emit( $silent, "Scanning '$url'\n" );
|
|
|
|
# $rv1 = $sth1->execute($url);
|
|
|
|
# if ( $dbh->err ) {
|
|
|
|
# warn $dbh->errstr;
|
|
|
|
# }
|
|
|
|
# $h1 = $sth1->fetchrow_hashref;
|
|
|
|
# emit( $silent, $h1->{title}, "\n" );
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # The URL should be valid already from the point at which it was added
|
|
|
|
# # to the database, but if we do this we get a canonical version (it
|
|
|
|
# # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
|
|
|
|
# # it. It also sets $uridata{URI}, which is useful.)
|
|
|
|
# #
|
|
|
|
# my $uri = validateURI( $url, \%uridata );
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Check the hostname resolves in DNS
|
|
|
|
# #
|
|
|
|
# if ( checkDNS( $uri, \%uridata ) ) {
|
|
|
|
# $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
|
|
|
|
# emit( $silent, "DNS: $uridata{DNS}\n" );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, "$uri has no DNS entry\n" );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Check the server is available
|
|
|
|
# #
|
|
|
|
# if ( serverUp( $uri, \%uridata ) ) {
|
|
|
|
# emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Look for the HTTP content type.
|
|
|
|
# #
|
|
|
|
# if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
|
|
|
|
# emit( $silent, "HTTP request OK\n" );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, "HTTP request failed\n" );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Note: not doing the robots.txt check since it was done at load time
|
|
|
|
# #
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # We know we have a feed, so go get it
|
|
|
|
# #
|
|
|
|
# $stream = getFeed( $uridata{URI} );
|
|
|
|
# if ($stream) {
|
|
|
|
# #
|
|
|
|
# # Parse the feed. The routine generates its own error messages
|
|
|
|
# #
|
|
|
|
# $feed = parseFeed( $uridata{URI}, $stream );
|
|
|
|
# unless ( $feed ) {
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
# #
|
|
|
|
# # Save the important feed components in the %uridata hash
|
|
|
|
# #
|
|
|
|
# print Dumper($feed), "\n" if ( $DEBUG > 2 );
|
|
|
|
# storeFeed( $feed, \%uridata );
|
|
|
|
# }
|
|
|
|
# else {
|
|
|
|
# emit( $silent, "Problem collecting feed" );
|
|
|
|
# $uridata{SCAN_OK} = 0;
|
|
|
|
# next;
|
|
|
|
# }
|
|
|
|
#
|
|
|
|
}
|
|
|
|
continue {
|
|
|
|
# { This 'continue' block is executed for each iteration or 'next' }
|
|
|
|
|
|
|
|
#
|
|
|
|
# If the scan went OK then perform a comparison between the new feed
|
|
|
|
# data and that which is stored
|
|
|
|
#
|
|
|
|
if ( $uridata{SCAN_OK} ) {
|
|
|
|
$urichanges = updateURI( $dbh, \%uridata, $keymap );
|
|
|
|
$LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
|
|
|
|
$urichanges )
|
|
|
|
if $urichanges > 0;
|
|
|
|
|
|
|
|
# TODO Update enclosures
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "$uridata{URI} was not scanned successfully\n" );
|
|
|
|
$LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Finished scanning this feed
|
|
|
|
#
|
|
|
|
emit( $silent, '-' x 80, "\n" );
|
|
|
|
|
|
|
|
#
|
|
|
|
# NOTE: Temporarily stop after the first N feeds
|
|
|
|
#
|
|
|
|
last if $count == 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: scanFeed
|
|
|
|
# PURPOSE: Performs a scan on a single feed
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $url feed URL to look up
|
|
|
|
# $uridata hashref to the hash of data collected from the
|
|
|
|
# feed
|
2023-01-14 23:13:49 +00:00
|
|
|
# RETURNS: True (1) if all the steps worked, otherwise false (0)
|
2022-11-19 21:27:51 +00:00
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub scanFeed {
|
|
|
|
my ( $dbh, $url, $uridata ) = @_;
|
|
|
|
|
|
|
|
my ( $sql, $sth, $rv, $h );
|
|
|
|
my ( $DT, $stream, $feed );
|
|
|
|
my ( $urichanges, $enc_changes );
|
|
|
|
|
|
|
|
$sql = q{SELECT * FROM urls WHERE url = ?};
|
|
|
|
$sth = $dbh->prepare($sql);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Record the scan for this URL
|
|
|
|
#
|
|
|
|
$DT = normaliseDT( DateTime->now() );
|
|
|
|
$uridata->{SCANNED_ON} = $DT;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Default to OK
|
|
|
|
#
|
|
|
|
$uridata->{SCAN_OK} = 1;
|
|
|
|
|
|
|
|
emit( $silent, "Scanning '$url'\n" );
|
|
|
|
$rv = $sth->execute($url);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
$h = $sth->fetchrow_hashref;
|
|
|
|
emit( $silent, $h->{title}, "\n" );
|
|
|
|
|
|
|
|
#
|
|
|
|
# The URL should be valid already from the point at which it was added
|
|
|
|
# to the database, but if we do this we get a canonical version (it
|
|
|
|
# sets $uridata->{SAVE} which makes no sense here, but we'll just ignore
|
|
|
|
# it. It also sets $uridata->{URI}, which is useful.)
|
|
|
|
#
|
|
|
|
my $uri = validateURI( $url, $uridata );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the hostname resolves in DNS
|
|
|
|
#
|
|
|
|
if ( checkDNS( $uri, $uridata ) ) {
|
|
|
|
$uridata->{DNS} = join( ", ", @{ $uridata->{DNS} } );
|
|
|
|
emit( $silent, "DNS: $uridata->{DNS}\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "$uri has no DNS entry\n" );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check the server is available
|
|
|
|
#
|
|
|
|
if ( serverUp( $uri, $uridata ) ) {
|
|
|
|
emit( $silent, sprintf( "Host: %s is up\n", $uridata->{HOST} ) );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, sprintf( "Host: %s is down\n", $uridata->{HOST} ) );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Look for the HTTP content type.
|
|
|
|
#
|
|
|
|
if ( checkContentType( $uri, $uridata, \%headers, $LOG ) ) {
|
|
|
|
emit( $silent, "HTTP request OK\n" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "HTTP request failed\n" );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Note: not doing the robots.txt check since it was done at load time
|
|
|
|
#
|
|
|
|
|
|
|
|
#
|
|
|
|
# We know we have a feed, so go get it
|
|
|
|
#
|
|
|
|
$stream = getFeed( $uridata->{URI} );
|
|
|
|
if ($stream) {
|
|
|
|
#
|
|
|
|
# Parse the feed. The routine generates its own error messages
|
|
|
|
#
|
|
|
|
$feed = parseFeed( $uridata->{URI}, $stream );
|
|
|
|
unless ( $feed ) {
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
2023-01-09 18:20:17 +00:00
|
|
|
emit( $silent, "Feed did not parse $uridata->{URI}\n" );
|
|
|
|
$LOG->warning('Feed did not parse: ',$uridata->{URI});
|
2022-11-19 21:27:51 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Save the important feed components in the %uridata hash
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
_debug( $DEBUG > 2, Dumper( $feed ) );
|
2022-11-19 21:27:51 +00:00
|
|
|
storeFeed( $feed, $uridata );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Problem collecting feed" );
|
|
|
|
$uridata->{SCAN_OK} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: reportDB
|
|
|
|
# PURPOSE: To generate a printed report from the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $fh output file handle
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: reportFeed generates a standardised report
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub reportDB {
|
|
|
|
my ($dbh, $fh) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to report the contents of the 'all_episodes' view with the latest
|
|
|
|
# associated episode
|
|
|
|
#
|
|
|
|
my $sql = q{
|
|
|
|
SELECT urls.id, ae.*,
|
|
|
|
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
|
|
|
|
FROM all_episodes ae
|
|
|
|
JOIN urls ON (ae.urls_id = urls.id)
|
|
|
|
GROUP by urls.id
|
|
|
|
HAVING ae.urls_urltype = 'Feed'
|
|
|
|
ORDER BY urls_title
|
|
|
|
};
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare($sql);
|
|
|
|
my $rv = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
while ( my $h1 = $sth1->fetchrow_hashref() ) {
|
|
|
|
reportFeed( $h1, $fh );
|
|
|
|
}
|
|
|
|
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: findFeed
|
|
|
|
# PURPOSE: Find and return a single feed by title
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $target search target
|
|
|
|
# RETURNS: Hashref indexing the feed
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub findFeed {
|
|
|
|
my ( $dbh, $target ) = @_;
|
|
|
|
|
|
|
|
my ( $sql, $sth, $rv, $h );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Query to report the contents of the 'all_episodes' view with the latest
|
|
|
|
# associated episode
|
|
|
|
#
|
|
|
|
$sql = q{
|
|
|
|
SELECT urls.id, ae.*,
|
|
|
|
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
|
|
|
|
FROM all_episodes ae
|
|
|
|
JOIN urls ON (ae.urls_id = urls.id)
|
|
|
|
GROUP by urls.id
|
|
|
|
HAVING ae.urls_urltype = 'Feed'
|
|
|
|
AND ae.urls_title = ?
|
|
|
|
};
|
|
|
|
|
|
|
|
$sth = $dbh->prepare($sql);
|
|
|
|
$rv = $sth->execute($target);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
$h = $sth->fetchrow_hashref();
|
|
|
|
|
|
|
|
return $h;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: reportFeed
|
|
|
|
# PURPOSE: Reports a single feed from the database
|
|
|
|
# PARAMETERS: $feed a hash from the database containing the
|
|
|
|
# details of a particular feed
|
|
|
|
# $fh output file handle
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub reportFeed {
|
|
|
|
my ($feed, $fh) = @_;
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
my $lwidth = 15;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Hash for converting database keys to labels for the report and arrays
|
|
|
|
# for controlling the sequence.
|
|
|
|
#
|
|
|
|
state ( %keys, @seq1, @seq2 );
|
|
|
|
%keys = (
|
2023-01-14 23:13:49 +00:00
|
|
|
'id' => 'DB Primary key',
|
|
|
|
'latest_ep' => 'Latest episode date',
|
|
|
|
'ep_author' => 'Author',
|
|
|
|
'ep_byte_length' => 'Byte length',
|
|
|
|
'ep_category' => 'Category',
|
|
|
|
'ep_enclosure' => 'URL',
|
|
|
|
'ep_ep_id' => 'ID',
|
|
|
|
'ep_id' => 'Key',
|
|
|
|
'ep_issued' => 'Issued on',
|
|
|
|
'ep_last_update' => 'Last updated',
|
|
|
|
'ep_link' => 'Permalink',
|
|
|
|
'ep_mime_type' => 'MIME type',
|
|
|
|
'ep_modified' => 'Modified on',
|
|
|
|
'ep_source' => 'Source',
|
|
|
|
'ep_title' => 'Title',
|
|
|
|
'ep_urls_id' => 'URL key',
|
|
|
|
'urls_author' => 'Author',
|
|
|
|
'urls_check_type' => 'Check type',
|
|
|
|
'urls_content_type' => 'Content type',
|
|
|
|
'urls_copyright' => 'Copyright',
|
|
|
|
'urls_description' => 'Description',
|
|
|
|
'urls_dns' => 'DNS',
|
|
|
|
'urls_feedformat' => 'Feed format',
|
|
|
|
'urls_generator' => 'Generator',
|
|
|
|
'urls_host_up' => 'Host up',
|
|
|
|
'urls_http_status' => 'HTTP status',
|
|
|
|
'urls_id' => 'Key',
|
|
|
|
'urls_image' => 'Image',
|
|
|
|
'urls_language' => 'Language',
|
|
|
|
'urls_last_update' => 'Last updated',
|
|
|
|
'urls_link' => 'Link',
|
|
|
|
'urls_modified' => 'Modified on',
|
|
|
|
'urls_reason_accepted' => 'Reason accepted',
|
2023-02-19 19:54:25 +00:00
|
|
|
'urls_status' => 'Status',
|
|
|
|
'urls_summary' => 'Summary',
|
2023-01-14 23:13:49 +00:00
|
|
|
'urls_title' => 'Title',
|
|
|
|
'urls_url' => 'Feed URL',
|
|
|
|
'urls_urltype' => 'URL type',
|
|
|
|
'urls_parent_id' => 'Parent ID',
|
|
|
|
'urls_child_count' => 'Child count',
|
2022-11-19 21:27:51 +00:00
|
|
|
);
|
2023-01-24 22:44:11 +00:00
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
@seq1 = (
|
|
|
|
'urls_title',
|
|
|
|
'urls_url',
|
|
|
|
'urls_feedformat',
|
|
|
|
'urls_link',
|
|
|
|
'urls_author',
|
|
|
|
'urls_content_type',
|
|
|
|
'urls_copyright',
|
2023-01-14 23:13:49 +00:00
|
|
|
'urls_check_type',
|
|
|
|
'urls_reason_accepted',
|
2022-11-19 21:27:51 +00:00
|
|
|
'urls_description',
|
2023-02-19 19:54:25 +00:00
|
|
|
'urls_summary',
|
2022-11-19 21:27:51 +00:00
|
|
|
'urls_dns',
|
|
|
|
'urls_generator',
|
|
|
|
'urls_host_up',
|
|
|
|
'urls_http_status',
|
|
|
|
'urls_image',
|
|
|
|
'urls_language',
|
|
|
|
'urls_last_update',
|
|
|
|
'urls_modified',
|
|
|
|
'urls_parent_id',
|
|
|
|
'urls_child_count',
|
2023-02-19 19:54:25 +00:00
|
|
|
'urls_status',
|
2022-11-19 21:27:51 +00:00
|
|
|
);
|
2023-01-24 22:44:11 +00:00
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
@seq2 = (
|
|
|
|
'ep_title',
|
|
|
|
'ep_enclosure',
|
|
|
|
'ep_category',
|
|
|
|
'ep_link',
|
|
|
|
'ep_author',
|
|
|
|
'ep_byte_length',
|
|
|
|
'ep_ep_id',
|
|
|
|
'ep_mime_type',
|
|
|
|
'ep_issued',
|
|
|
|
'ep_last_update',
|
|
|
|
'ep_modified',
|
|
|
|
'ep_source',
|
|
|
|
);
|
|
|
|
|
|
|
|
if ($feed) {
|
|
|
|
print $fh "Channel:\n";
|
|
|
|
foreach my $key (@seq1) {
|
2023-01-24 22:44:11 +00:00
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
# Format the feed description, summary and copyright with a left
|
|
|
|
# margin using textFormat. Everything else gets a simpler layout.
|
2023-01-24 22:44:11 +00:00
|
|
|
#
|
2023-02-19 19:54:25 +00:00
|
|
|
if ($key =~ /^urls_(description|summary|copyright)$/) {
|
2023-01-24 22:44:11 +00:00
|
|
|
printf $fh "%s\n",
|
|
|
|
textFormat(
|
|
|
|
coalesce( $feed->{$key}, '--' ),
|
|
|
|
sprintf( " %-*s:", $lwidth, $keys{$key} ),
|
|
|
|
'L', $lwidth + 4, 80
|
|
|
|
);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
|
|
|
|
coalesce( $feed->{$key}, '--' );
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
print $fh "\nLatest episode:\n";
|
|
|
|
foreach my $key (@seq2) {
|
|
|
|
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
|
|
|
|
coalesce( $feed->{$key}, '--' );
|
|
|
|
}
|
|
|
|
print $fh "\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: collectData
|
|
|
|
# PURPOSE: Collects data from the database for generating a report
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# RETURNS: An array of hashrefs from the query
|
|
|
|
# DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
|
|
|
|
# table and the 'all_episodes' view, showing the date of the
|
|
|
|
# latest episode. The result is an array of rows, each
|
|
|
|
# represented as a hash, all sorted by the feed title.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub collectData {
|
|
|
|
my ($dbh) = @_;
|
|
|
|
|
|
|
|
#
|
2023-01-09 18:20:17 +00:00
|
|
|
# Query to report only the feeds from the contents of the 'urls' table
|
|
|
|
# with the details of the latest episode
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
my $sql = q{
|
|
|
|
SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
|
|
|
|
FROM all_episodes ae
|
|
|
|
JOIN urls ON (ae.urls_id = urls.id)
|
|
|
|
GROUP by urls.id
|
|
|
|
HAVING ae.urls_urltype = 'Feed'
|
|
|
|
ORDER BY ae.urls_title
|
|
|
|
};
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare($sql);
|
|
|
|
my $rv = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Return everything as an arrayref of hashrefs
|
|
|
|
#
|
|
|
|
my $tbl_ary_ref = $sth1->fetchall_arrayref({});
|
|
|
|
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return $tbl_ary_ref;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: countRows
|
|
|
|
# PURPOSE: To count the rows in a table
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $sql SQL expression to use
|
|
|
|
# RETURNS: Number of rows found (note that zero is returned as 0E0)
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub countRows {
|
|
|
|
my ( $dbh, $sql ) = @_;
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare($sql);
|
|
|
|
my $rv = $sth1->execute();
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
my $h1 = $sth1->fetch;
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return @$h1[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: dbSearch
|
|
|
|
# PURPOSE: To perform a simple search in the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $sql SQL expression to use (expected to be
|
|
|
|
# a SELECT)
|
|
|
|
# @args arguments for the 'execute'
|
|
|
|
# RETURNS: True (1) if the row exists, otherwise false (0).
|
|
|
|
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
|
|
|
|
# SQL without incurring the overhead of repeated 'prepare'
|
|
|
|
# calls. Only the first row is fetched (we expect there to be
|
|
|
|
# only one) and the success or failure is determined by its
|
|
|
|
# existence.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub dbSearch {
|
|
|
|
my ( $dbh, $sql, @args ) = @_;
|
|
|
|
|
|
|
|
my $sth1 = $dbh->prepare_cached($sql);
|
|
|
|
my $rv = $sth1->execute(@args);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
my $h1 = $sth1->fetchrow_hashref();
|
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return defined($h1);
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: execSQL
|
|
|
|
# PURPOSE: To perform a non-SELECT query
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $sql SQL expression to use
|
2023-01-14 23:13:49 +00:00
|
|
|
# $dry_run Boolean, set if in dry-run mode
|
2022-11-19 21:27:51 +00:00
|
|
|
# @args arguments for the 'execute'
|
|
|
|
# RETURNS: True (1) if the query succeeded, otherwise false (0).
|
|
|
|
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
|
|
|
|
# SQL without incurring the overhead of repeated 'prepare'
|
|
|
|
# calls.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub execSQL {
|
2023-01-14 23:13:49 +00:00
|
|
|
my ( $dbh, $sql, $dry_run, @args ) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
my ( $sth1, $rv );
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
if ($dry_run) {
|
|
|
|
emit($silent,"Dry-run: Would have run SQL '$sql'\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
$sth1 = $dbh->prepare_cached($sql);
|
|
|
|
try {
|
|
|
|
$rv = $sth1->execute(@args);
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
2023-01-09 18:20:17 +00:00
|
|
|
catch {
|
|
|
|
warn "Problem with query '$sql'\n";
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn '** ' . $dbh->errstr;
|
|
|
|
}
|
|
|
|
};
|
2022-11-19 21:27:51 +00:00
|
|
|
$sth1->finish;
|
|
|
|
|
|
|
|
return $rv;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: validateURI
|
|
|
|
# PURPOSE: Checks a URL for validity
|
|
|
|
# PARAMETERS: $rec the URL handed to the script
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: A URI object if valid otherwise undef
|
|
|
|
# DESCRIPTION: The URL string is validated with the URI module. A canonical
|
|
|
|
# string version is stored in the hash referenced by $uridata
|
|
|
|
# hash if valid otherwise the URL is marked as invalid.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub validateURI {
|
|
|
|
my ( $rec, $uridata ) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Check that we have a valid URL. We don't save them if they are
|
|
|
|
# invalid.
|
|
|
|
#
|
|
|
|
my $uri = URI->new( $rec, 'http' );
|
|
|
|
if ( $uri->scheme ) {
|
|
|
|
emit( $silent, "URI $uri is valid\n" );
|
|
|
|
$uridata->{URI} = $uri->canonical->as_string;
|
|
|
|
return $uri;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "URI $uri is not valid\n" );
|
|
|
|
$uridata->{SAVE} = 0;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: checkDNS
|
|
|
|
# PURPOSE: Looks up a host DNS entry
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: True (1) if the DNS query was successful, otherwise false (0)
|
|
|
|
# DESCRIPTION: The host name is extracted from the URI (and stored). The
|
|
|
|
# hostname is searched for in the DNS and if successful, an
|
|
|
|
# array of addresses from the 'A' records is built. This is
|
|
|
|
# sorted and stored in the hash referenced by $uridata.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub checkDNS {
|
|
|
|
my ( $uri, $uridata ) = @_;
|
|
|
|
|
|
|
|
my @adds;
|
|
|
|
my $hostname = $uri->host;
|
|
|
|
$uridata->{HOST} = $hostname;
|
|
|
|
|
|
|
|
my $res = Net::DNS::Resolver->new;
|
|
|
|
my $query = $res->search($hostname);
|
|
|
|
if ($query) {
|
|
|
|
foreach my $rr ( $query->answer ) {
|
|
|
|
next unless $rr->type eq "A";
|
|
|
|
push( @adds, $rr->address );
|
|
|
|
}
|
|
|
|
@adds = sort(@adds);
|
|
|
|
$uridata->{DNS} = \@adds;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
warn "Query failed: ", $res->errorstring, "\n";
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: serverUp
|
|
|
|
# PURPOSE: Checks that a given host is responding
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: True (1) if the host responds to a TCP connection, false (0)
|
|
|
|
# otherwise
|
|
|
|
# DESCRIPTION: Given an URL parses out the hostname and the port (defaulting
|
|
|
|
# to the appropriate default for the scheme, such as 80 for
|
|
|
|
# http). Attempts to connect to this host and port. If the
|
|
|
|
# connect fails then details are written to the data structure
|
|
|
|
# pointed to by $uridata.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub serverUp {
|
|
|
|
my ( $uri, $uridata ) = @_;
|
|
|
|
|
|
|
|
my ( $hostname, $port, $socket );
|
|
|
|
|
|
|
|
$hostname = $uri->host();
|
|
|
|
$port = $uri->port();
|
|
|
|
$uridata->{PORT} = $port;
|
|
|
|
|
|
|
|
$socket = IO::Socket::INET->new(
|
|
|
|
Proto => "tcp",
|
|
|
|
PeerAddr => $hostname,
|
|
|
|
PeerPort => $port,
|
|
|
|
Reuse => 1,
|
|
|
|
Timeout => 10
|
|
|
|
);
|
|
|
|
|
|
|
|
if ($socket) {
|
|
|
|
$socket->close;
|
|
|
|
$uridata->{HOSTUP} = 1;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{HOSTUP} = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: robotRulesOK
|
|
|
|
# PURPOSE: To check the intended URL against the site's robots.txt rules
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $rules WWW::RobotRules object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: True (1) if the GET of the robots.txt file succeeded and the
|
|
|
|
# rules allow the URI object to be fetched, false (0) otherwise.
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub robotRulesOK {
|
|
|
|
my ( $uri, $rules, $uridata ) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Build the URL of robots.txt
|
|
|
|
#
|
|
|
|
my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';
|
|
|
|
my $robots_txt;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Use LWP::UserAgent to get the feed and handle errors
|
|
|
|
#
|
|
|
|
my $ua = LWP::UserAgent->new;
|
|
|
|
$ua->timeout(10);
|
|
|
|
$ua->agent("$PROG/$VERSION");
|
|
|
|
|
|
|
|
my $response = $ua->get($roboturl);
|
|
|
|
if ( $response->is_success ) {
|
|
|
|
$uridata->{ROBOTS} = 'Found';
|
|
|
|
$robots_txt = $response->decoded_content;
|
|
|
|
$rules->parse( $roboturl, $robots_txt );
|
|
|
|
return $rules->allowed("$uri");
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{ROBOTS} = $response->status_line;
|
|
|
|
warn "Failed to get $roboturl\n";
|
|
|
|
warn $response->status_line . "\n";
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: checkContentType
|
|
|
|
# PURPOSE: Check the content_type of the url
|
|
|
|
# PARAMETERS: $uri URI object
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# $headers hashref containing query headers
|
|
|
|
# $children arrayref to return any 'child' feeds we might
|
|
|
|
# find; they are appended to the array
|
|
|
|
# $log Log::Handler object
|
|
|
|
# RETURNS: True (1) if all was well, otherwise false (0)
|
|
|
|
# DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
|
|
|
|
# for the URI object using the LWP::UserAgent head method. Then
|
|
|
|
# we examine the 'content-type' header looking for the string
|
|
|
|
# 'xml' or 'html' in it. The former denotes a feed, and the
|
|
|
|
# latter a normal HTML page.
|
2022-11-20 22:49:57 +00:00
|
|
|
# We use Feed::Find if the page is HTML in order to find any
|
2022-11-19 21:27:51 +00:00
|
|
|
# "child" feeds which we really want to process if we find any.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub checkContentType {
|
|
|
|
my ( $uri, $uridata, $headers, $children, $log ) = @_;
|
|
|
|
|
|
|
|
my @feeds;
|
2023-01-14 23:13:49 +00:00
|
|
|
my $content;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
$uridata->{HTTP_STATUS} = 'Unknown';
|
|
|
|
|
|
|
|
my $browser = LWP::UserAgent->new or return 0;
|
2023-01-14 23:13:49 +00:00
|
|
|
$browser->timeout(10);
|
|
|
|
$browser->agent("$PROG/$VERSION");
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#my $response = $browser->head( $uri->as_string, %{$headers} )
|
|
|
|
my $response = $browser->get( $uri->as_string, %{$headers} )
|
2022-11-19 21:27:51 +00:00
|
|
|
or return 0;
|
|
|
|
|
|
|
|
$uridata->{HTTP_STATUS} = $response->status_line;
|
|
|
|
|
|
|
|
if ( $response->is_success ) {
|
|
|
|
$uridata->{HTTP_CONTENT_TYPE} = $response->header('content-type');
|
|
|
|
|
|
|
|
#
|
|
|
|
# Decode the content-type we received
|
|
|
|
#
|
|
|
|
if ( $uridata->{HTTP_CONTENT_TYPE} =~ m|xml|i ) {
|
|
|
|
$uridata->{TYPE} = 'Feed';
|
|
|
|
}
|
|
|
|
elsif ( $uridata->{HTTP_CONTENT_TYPE} =~ m|html|i ) {
|
|
|
|
$uridata->{TYPE} = 'HTML';
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{TYPE} = 'Unknown';
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Sometimes perfectly valid feeds misleadingly return text/html so we
|
|
|
|
# try to spot such cases here and adjust the internal type accordingly
|
|
|
|
#
|
|
|
|
if ( $uridata->{TYPE} eq 'HTML' ) {
|
|
|
|
@feeds = Feed::Find->find( $uri->as_string );
|
|
|
|
if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
|
|
|
|
emit( $silent, "Feed found with wrong content-type\n" );
|
|
|
|
$uridata->{TYPE} = 'Feed';
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# This is HTML and we found 'child' feeds of some kind
|
2022-11-19 21:27:51 +00:00
|
|
|
#
|
|
|
|
emit( $silent, "Found ", scalar(@feeds),
|
|
|
|
" feeds within this HTML page\n" );
|
2023-01-14 23:13:49 +00:00
|
|
|
_debug( $DEBUG > 0, Dumper( \@feeds ) );
|
2022-11-19 21:27:51 +00:00
|
|
|
push(@{$children}, @feeds);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#
|
|
|
|
# TODO: Get the title for an HTML page by parsing the source
|
|
|
|
#
|
|
|
|
if ( $uridata->{TYPE} eq 'HTML' ) {
|
|
|
|
unless ($uridata->{TITLE}) {
|
|
|
|
$content = $response->decoded_content;
|
|
|
|
$uridata->{TITLE} = getHTMLTitle($uri->as_string, $content);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
$log->info( "URL content classified as: ", $uridata->{TYPE} );
|
|
|
|
emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: getURL
|
|
|
|
# PURPOSE: Download the contents of an URL
|
|
|
|
# PARAMETERS: $url URL of the page to download
|
|
|
|
# RETURNS: String representation of the contents or undef if the
|
|
|
|
# download failed.
|
|
|
|
# DESCRIPTION: Issues a GET on the URL. If successful the contents are
|
|
|
|
# decoded and returned, otherwise undef is returned.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub getURL {
|
|
|
|
my ($url) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Use LWP::UserAgent to get the feed and handle errors
|
|
|
|
#
|
|
|
|
my $ua = LWP::UserAgent->new;
|
|
|
|
$ua->timeout(10);
|
|
|
|
$ua->agent("$PROG/$VERSION");
|
|
|
|
my $response = $ua->get($url);
|
|
|
|
|
|
|
|
my $content;
|
|
|
|
if ( $response->is_success ) {
|
|
|
|
$content = $response->decoded_content;
|
|
|
|
return $content;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
warn "Failed to get $url\n";
|
|
|
|
warn $response->status_line, "\n";
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: getFeed
|
|
|
|
# PURPOSE: Download the contents of a feed URL
|
|
|
|
# PARAMETERS: $feed_url URL of the feed to download
|
|
|
|
# RETURNS: String representation of the feed contents or undef if the
|
|
|
|
# download failed.
|
|
|
|
# DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
|
|
|
|
# need not be). If successful the contents are decoded and
|
|
|
|
# returned, otherwise undef is returned.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub getFeed {
|
|
|
|
my ($feed_url) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Use LWP::UserAgent to get the feed and handle errors
|
|
|
|
#
|
|
|
|
my $ua = LWP::UserAgent->new;
|
|
|
|
$ua->timeout(10);
|
|
|
|
$ua->agent("$PROG/$VERSION");
|
|
|
|
my $response = $ua->get($feed_url);
|
|
|
|
|
|
|
|
my $feed_content;
|
|
|
|
if ( $response->is_success ) {
|
|
|
|
$feed_content = $response->decoded_content;
|
|
|
|
return $feed_content;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
warn "Failed to get $feed_url\n";
|
|
|
|
warn $response->status_line, "\n";
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: getHTMLTitle
|
|
|
|
# PURPOSE: Parse an HTML page to get data. At the moment this is just the
|
|
|
|
# title in the header section.
|
|
|
|
# PARAMETERS: $url URL of the page; only used for messages and as
|
|
|
|
# a fallback title.
|
|
|
|
# $content Decoded content of the HTML page
|
|
|
|
# RETURNS: The title, if there is one, otherwise the URL
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub getHTMLTitle {
|
|
|
|
my ($url, $content) = @_;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Is this effectively a local subroutine?
|
|
|
|
#
|
|
|
|
my $start_handler = sub {
|
|
|
|
#
|
|
|
|
# Ignore any tags which are not 'title'
|
|
|
|
#
|
|
|
|
return if shift ne "title";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Define more handlers if we have a title. One to collect the title string
|
|
|
|
# and the other to abort the parse on encountering the end of the title.
|
|
|
|
#
|
|
|
|
my $self = shift;
|
|
|
|
$self->handler(text => sub { $main::html_title = shift }, "dtext");
|
|
|
|
$self->handler(
|
|
|
|
end => sub {
|
|
|
|
shift->eof if shift eq "title";
|
|
|
|
},
|
|
|
|
"tagname,self"
|
|
|
|
);
|
|
|
|
};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Create the HTML::Parser object
|
|
|
|
#
|
|
|
|
my $p = HTML::Parser->new(api_version => 3);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Define a 'start' handler. When called with 'title' as the 'tagname' it
|
|
|
|
# creates a 'text' event handler to print the text and an 'end' handler to
|
|
|
|
# quit when the closing '</title' tag is found.
|
|
|
|
#
|
|
|
|
#$p->handler(start => \&start_handler, "tagname,self");
|
|
|
|
$p->handler(start => $start_handler, "tagname,self");
|
|
|
|
|
|
|
|
#
|
|
|
|
# Invoke the parser on the string containing the returned HTML.
|
|
|
|
#
|
|
|
|
$p->parse($content);
|
|
|
|
|
|
|
|
return $main::html_title // $url;
|
|
|
|
}
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: parseFeed
|
|
|
|
# PURPOSE: Parse a podcast feed that has already been downloaded
|
|
|
|
# PARAMETERS: $feed_url URL of the feed previously downloaded
|
|
|
|
# $feed_content String containing the content of the feed, for
|
|
|
|
# parsing
|
|
|
|
# RETURNS: An XML::Feed object containing the parsed feed or undef if the
|
|
|
|
# parse failed
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub parseFeed {
|
|
|
|
my ( $feed_url, $feed_content ) = @_;
|
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
my $feed;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Catch errors, returning from the subroutine with 'undef' if any are
|
|
|
|
# triggered.
|
|
|
|
#
|
|
|
|
try {
|
|
|
|
$feed = XML::Feed->parse( \$feed_content );
|
|
|
|
unless ($feed) {
|
|
|
|
#
|
|
|
|
# Something went wrong. Abort this feed
|
|
|
|
#
|
|
|
|
warn "Warning: Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
|
|
|
|
return; # undef
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
2023-01-09 18:20:17 +00:00
|
|
|
catch {
|
|
|
|
warn "Warning: Failed to parse $feed_url: $_\n";
|
|
|
|
return;
|
|
|
|
};
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
return $feed;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: storeFeed
|
|
|
|
# PURPOSE: Stores feed attributes in a hash
|
|
|
|
# PARAMETERS: $feed XML::Feed object returned from parsing the
|
|
|
|
# feed
|
|
|
|
# $uridata hashref containing data for this URI
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub storeFeed {
|
|
|
|
my ($feed, $uridata) = @_;
|
|
|
|
|
|
|
|
( $uridata->{TITLE} = $feed->title ) =~ s/(^\s+|\s+$)//g;
|
|
|
|
( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;
|
|
|
|
|
|
|
|
$uridata->{LINK} = $feed->link;
|
|
|
|
$uridata->{DESCRIPTION} = clean_string( $feed->tagline );
|
|
|
|
$uridata->{AUTHOR} = $feed->author;
|
|
|
|
$uridata->{MODIFIED} = normaliseDT( $feed->modified );
|
|
|
|
$uridata->{IMAGE} = flattenArray( $feed->image );
|
|
|
|
$uridata->{COPYRIGHT} = $feed->copyright;
|
|
|
|
$uridata->{GENERATOR} = $feed->generator;
|
|
|
|
$uridata->{LANGUAGE} = $feed->language;
|
|
|
|
#print coalesce($feed->webMaster,'No webMaster'),"\n";
|
|
|
|
|
|
|
|
$uridata->{ENCLOSURES} = extractEnclosures($feed);
|
|
|
|
$uridata->{ENCLOSURE_COUNT}
|
|
|
|
= scalar( @{ $uridata->{ENCLOSURES} } );
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: checkCopyright
|
2023-01-09 18:20:17 +00:00
|
|
|
# PURPOSE: Ask the user to check, or applies simple rules, to accept or
|
|
|
|
# reject a feed based on the copyright
|
|
|
|
# PARAMETERS: $checkmode the mode string: auto, manual or none
|
|
|
|
# $uridata reference to the hash containing details of
|
|
|
|
# this feed
|
|
|
|
# RETURNS: 1 (true) if the feed is to be added, 0 (false) if not
|
2022-11-19 21:27:51 +00:00
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub checkCopyright {
|
2023-01-09 18:20:17 +00:00
|
|
|
my ($checkmode, $uridata) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
my ( $copyright, $re, $decision, $reason );
|
2023-01-09 18:20:17 +00:00
|
|
|
$LOG->info("Checking copyright of feed (mode: $checkmode)");
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-09 18:20:17 +00:00
|
|
|
if ( $checkmode eq 'manual' ) {
|
|
|
|
#
|
|
|
|
# Prompt the user, failing gracefully if there's
|
|
|
|
# a problem. If the user types 'Y' or 'y' we accept the
|
|
|
|
# feed, otherwise we do not (thus a blank return = 'no').
|
|
|
|
#
|
|
|
|
try {
|
|
|
|
printf STDERR
|
|
|
|
"Feed '%s' has the copyright string:\n%s\n",
|
|
|
|
$uridata->{TITLE},
|
|
|
|
coalesce( $uridata->{COPYRIGHT}, '' );
|
|
|
|
$decision = prompt(
|
|
|
|
-in => *STDIN,
|
|
|
|
-out => *STDERR,
|
|
|
|
-prompt => 'OK to add this feed?',
|
|
|
|
-style => 'bold red underlined',
|
|
|
|
-yes
|
|
|
|
);
|
|
|
|
}
|
|
|
|
catch {
|
|
|
|
warn "Problem processing copyright decision: $_";
|
|
|
|
$decision = 0;
|
|
|
|
};
|
2023-01-14 23:13:49 +00:00
|
|
|
|
2023-01-24 22:44:11 +00:00
|
|
|
#
|
|
|
|
# If accepted we want a reason
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
if ($decision) {
|
|
|
|
try {
|
|
|
|
$reason = prompt(
|
|
|
|
-in => *STDIN,
|
|
|
|
-out => *STDERR,
|
|
|
|
-prompt => 'Please give a reason for this decision:',
|
|
|
|
-style => 'bold red underlined',
|
|
|
|
-default => 'No reason given'
|
|
|
|
);
|
|
|
|
}
|
|
|
|
catch {
|
|
|
|
warn "Problem processing reason for copyright decision: $_";
|
|
|
|
$reason = 'Error while processing the reason';
|
|
|
|
};
|
|
|
|
|
|
|
|
$uridata->{REASON_ACCEPTED} = $reason;
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
}
|
2023-01-09 18:20:17 +00:00
|
|
|
else {
|
|
|
|
#
|
2023-01-14 23:13:49 +00:00
|
|
|
# Automatic mode.
|
|
|
|
# Careful. Un-escaped spaces are ignored in the regex
|
2023-01-09 18:20:17 +00:00
|
|
|
#
|
|
|
|
$re = qr{(
|
2023-01-14 23:13:49 +00:00
|
|
|
\bCC\b|
|
|
|
|
\bCreative\ Commons\b|
|
2023-01-09 18:20:17 +00:00
|
|
|
creativecommons.org|
|
|
|
|
Attribution.NonCommercial.No.?Derivatives?
|
|
|
|
)}xmi;
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
$decision = 0;
|
2023-01-09 18:20:17 +00:00
|
|
|
|
|
|
|
$copyright = coalesce( $uridata->{COPYRIGHT}, '' );
|
|
|
|
emit( $silent, "Copyright: '$copyright'\n" );
|
|
|
|
$LOG->info("Copyright: '$copyright'");
|
|
|
|
if ( $copyright eq '' || $copyright =~ /$re/ ) {
|
|
|
|
$decision = 1;
|
|
|
|
}
|
|
|
|
}
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Take action on the decision (or default)
|
|
|
|
#
|
|
|
|
$uridata->{SAVE} = $decision;
|
|
|
|
if ($decision) {
|
|
|
|
emit( $silent, "Feed added\n" );
|
|
|
|
$LOG->info('Copyright OK');
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Feed not added\n" );
|
|
|
|
$LOG->info('Copyright not OK');
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: parseRSS
|
|
|
|
# PURPOSE: Attempt to parse a feed as RSS
|
|
|
|
# PARAMETERS: $feed_url URL of the feed previously downloaded
|
|
|
|
# $feed_content String containing the content of the feed, for
|
|
|
|
# parsing
|
|
|
|
# RETURNS: An XML::RSS object containing the parsed feed or undef if the
|
|
|
|
# parse failed
|
|
|
|
# DESCRIPTION: ** Incomplete **
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub parseRSS {
|
|
|
|
my ( $feed_url, $feed_content ) = @_;
|
|
|
|
|
|
|
|
my $rss = XML::RSS->parse(\$feed_content);
|
|
|
|
unless ($rss) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: addURI
|
|
|
|
# PURPOSE: Adds the data for a URI to the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
2023-01-14 23:13:49 +00:00
|
|
|
# $dry_run Boolean, set if in dry-run mode
|
2022-11-19 21:27:51 +00:00
|
|
|
# RETURNS: True (1) if the insert succeeded, false (0) otherwise
|
|
|
|
# DESCRIPTION: The hash keys are defined as an array to make it easy to slice
|
2023-01-14 23:13:49 +00:00
|
|
|
# the hash, and the SQL is defined internally using the size of
|
2022-11-19 21:27:51 +00:00
|
|
|
# the key array as a guide to the number of '?' placeholders.
|
|
|
|
# These are passed to execSQL to do the work.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub addURI {
|
2023-01-14 23:13:49 +00:00
|
|
|
my ( $dbh, $uridata, $keymap, $dry_run ) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
my @keys = (
|
|
|
|
'URI', 'DNS',
|
|
|
|
'HOSTUP', 'HTTP_STATUS',
|
|
|
|
'HTTP_CONTENT_TYPE', 'TYPE',
|
|
|
|
'FORMAT', 'TITLE',
|
|
|
|
'DESCRIPTION', 'AUTHOR',
|
|
|
|
'MODIFIED', 'LINK',
|
|
|
|
'IMAGE', 'COPYRIGHT',
|
2023-01-14 23:13:49 +00:00
|
|
|
'CHECKTYPE', 'REASON_ACCEPTED',
|
2022-11-19 21:27:51 +00:00
|
|
|
'GENERATOR', 'LANGUAGE',
|
|
|
|
);
|
|
|
|
|
|
|
|
my $sql
|
|
|
|
= 'INSERT INTO urls ('
|
|
|
|
. join( ",", @{$keymap}{@keys} ) . ') '
|
|
|
|
. 'VALUES('
|
|
|
|
. join( ',', ('?') x scalar(@keys) ) . ')';
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
_debug( $DEBUG > 0, "addURI query: $sql");
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
return execSQL( $dbh, $sql, $dry_run, @{$uridata}{@keys} );
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: extractEnclosures
|
|
|
|
# PURPOSE: Builds an array of hashes containing enclosure data from an
|
|
|
|
# XML::Feed object
|
|
|
|
# PARAMETERS: $feed XML::Feed object
|
|
|
|
# RETURNS: A reference to the array of anonymous hashes built from the
|
|
|
|
# feed.
|
|
|
|
# DESCRIPTION: The XML::Feed object is expected to contain an array of
|
|
|
|
# entries. These are converted to hashes, references to which
|
|
|
|
# are stored in an array. The two DateTime components are
|
|
|
|
# converted to ISO8601 dates. If there is an enclosure then its
|
|
|
|
# elements are saved. Note that there could be multiple
|
2023-01-09 18:20:17 +00:00
|
|
|
# enclosures per item, but XML::Feed does not cater for them
|
|
|
|
# unless explicitly requested. We do not deal with such a case
|
|
|
|
# here.
|
2022-11-19 21:27:51 +00:00
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub extractEnclosures {
|
|
|
|
my ($feed) = @_;
|
|
|
|
|
|
|
|
my @encs;
|
|
|
|
|
|
|
|
foreach my $entry ( $feed->entries ) {
|
|
|
|
my %ent;
|
|
|
|
|
|
|
|
$ent{title} = $entry->title;
|
|
|
|
$ent{base} = $entry->base;
|
|
|
|
$ent{link} = $entry->link;
|
|
|
|
$ent{category} = join( ", ", $entry->category );
|
|
|
|
$ent{author} = $entry->author;
|
|
|
|
$ent{id} = $entry->id;
|
|
|
|
$ent{issued} = normaliseDT( $entry->issued );
|
|
|
|
$ent{modified} = normaliseDT( $entry->modified );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Work around a bug in XML::Feed where the source method is only
|
|
|
|
# available for Atom feeds. TODO report this.
|
|
|
|
#
|
|
|
|
if ( $entry->isa('XML::Feed::Entry::Format::Atom') ) {
|
|
|
|
$ent{source} = $entry->source;
|
|
|
|
}
|
|
|
|
|
|
|
|
my ($enclosure) = $entry->enclosure;
|
|
|
|
if ( defined($enclosure) ) {
|
|
|
|
$ent{url} = $enclosure->url;
|
|
|
|
$ent{type} = $enclosure->type;
|
|
|
|
$ent{length} = $enclosure->length;
|
|
|
|
}
|
|
|
|
|
|
|
|
push( @encs, \%ent );
|
|
|
|
}
|
|
|
|
|
|
|
|
return \@encs;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: addEnclosures
|
|
|
|
# PURPOSE: Adds episodes extracted from a feed into the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# including an arrayref of hashrefs of episode
|
|
|
|
# data
|
2023-01-14 23:13:49 +00:00
|
|
|
# $dry_run Boolean, set if in dry-run mode
|
2022-11-19 21:27:51 +00:00
|
|
|
# RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
|
|
|
|
# DESCRIPTION: The SQL is defined internally and the hash keys are defined as
|
|
|
|
# an array to make it easy to slice the hash. The enclosures (or
|
|
|
|
# more correctly, feed items) are present in the hash as an
|
|
|
|
# array of anonymous hashes. These are processed one at a time
|
|
|
|
# and inserted into the database. A count of the number of
|
|
|
|
# successful inserts is kept. This is compared with the number
|
|
|
|
# of enclosures to determine the boolean value to return.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub addEnclosures {
|
2023-01-14 23:13:49 +00:00
|
|
|
my ( $dbh, $uridata, $dry_run ) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
my $sql = q{INSERT INTO episodes
|
|
|
|
(urls_id, link, enclosure, title, author, category, source, ep_id,
|
|
|
|
issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};
|
|
|
|
|
|
|
|
my @keys = (
|
|
|
|
'link', 'url', 'title', 'author', 'category', 'source',
|
|
|
|
'id', 'issued', 'modified', 'length', 'type'
|
|
|
|
);
|
|
|
|
|
|
|
|
my $successes = 0;
|
|
|
|
|
|
|
|
foreach my $enc ( @{ $uridata->{ENCLOSURES} } ) {
|
2023-01-14 23:13:49 +00:00
|
|
|
if ( execSQL( $dbh, $sql, $dry_run, $uridata->{URI_ID}, @{$enc}{@keys} ) ) {
|
2022-11-19 21:27:51 +00:00
|
|
|
$successes++;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit( $silent, "Failed to add episode $enc->{url}\n" );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ( $successes == $uridata->{ENCLOSURE_COUNT} );
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: updateURI
|
|
|
|
# PURPOSE: Compare the data in a hash with that in the database
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# including an arrayref of hashrefs of episode
|
|
|
|
# data
|
|
|
|
# $keymap hashref containing a map of key names to
|
|
|
|
# database field names
|
2023-01-14 23:13:49 +00:00
|
|
|
# $dry_run Boolean, set if in dry-run mode
|
2022-11-19 21:27:51 +00:00
|
|
|
# RETURNS: The number of changes made
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub updateURI {
|
2023-01-14 23:13:49 +00:00
|
|
|
my ( $dbh, $uridata, $keymap, $dry_run ) = @_;
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
# TODO: dry-run mode not implemented here yet
|
2022-11-19 21:27:51 +00:00
|
|
|
my ( $sql1, $sth1, $rv1, $h1 );
|
|
|
|
my ( %fieldvals, %where );
|
|
|
|
my ( $diffs, $updates ) = ( 0, 0 );
|
|
|
|
my @keys = (
|
|
|
|
'URI', 'DNS',
|
|
|
|
'HOSTUP', 'HTTP_STATUS',
|
|
|
|
'HTTP_CONTENT_TYPE', 'TYPE',
|
|
|
|
'FORMAT', 'TITLE',
|
|
|
|
'DESCRIPTION', 'AUTHOR',
|
|
|
|
'MODIFIED', 'LINK',
|
|
|
|
'IMAGE', 'COPYRIGHT',
|
|
|
|
'GENERATOR', 'LANGUAGE',
|
|
|
|
);
|
|
|
|
|
|
|
|
#
|
|
|
|
# Get the row from the urls table
|
|
|
|
#
|
|
|
|
$sql1 = q{SELECT * FROM urls WHERE url = ?};
|
|
|
|
$sth1 = $dbh->prepare($sql1);
|
|
|
|
$rv1 = $sth1->execute( $uridata->{URI} );
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
$h1 = $sth1->fetchrow_hashref;
|
|
|
|
|
|
|
|
for my $key (@keys) {
|
|
|
|
unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
|
|
|
|
$diffs++;
|
|
|
|
$fieldvals{$key} = $uridata->{$key};
|
|
|
|
|
|
|
|
#
|
|
|
|
# Temporary report
|
|
|
|
#
|
|
|
|
print "Difference: ($key)\n";
|
|
|
|
print " Feed: ", coalesce($uridata->{$key},''), "\n";
|
|
|
|
print " Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($diffs > 0) {
|
|
|
|
#
|
|
|
|
# Prepare to use SQL::Abstract
|
|
|
|
#
|
|
|
|
my $sql = SQL::Abstract->new();
|
|
|
|
|
|
|
|
#
|
|
|
|
# Build the update statement
|
|
|
|
#
|
|
|
|
$where{id} = $h1->{id};
|
|
|
|
my ( $stmt, @bindvals )
|
|
|
|
= $sql->update( 'urls', \%fieldvals, \%where );
|
|
|
|
|
|
|
|
# Temporary
|
|
|
|
print "Statement: $stmt\n";
|
|
|
|
print "Bind values: ", join(",",@bindvals),"\n";
|
|
|
|
|
|
|
|
#
|
|
|
|
# Perform the updates
|
|
|
|
#
|
|
|
|
$sth1 = $dbh->prepare($stmt);
|
|
|
|
$sth1->execute(@bindvals);
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn "Processing $h1->{url}\n", $dbh->errstr;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
emit($silent, "Updated $h1->{url}\n");
|
|
|
|
$updates++;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return $updates;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: updateEnclosures
|
|
|
|
# PURPOSE: Update the enclosures stored with a feed URL
|
|
|
|
# PARAMETERS: $dbh database handle
|
|
|
|
# $uridata hashref containing data for the current URI
|
|
|
|
# including an arrayref of hashrefs of episode
|
|
|
|
# data
|
|
|
|
# RETURNS: The number of changes made
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub updateEnclosures {
|
|
|
|
my ( $dbh, $uridata ) = @_;
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
# TODO: This doesn't acttually update annything!
|
2022-11-19 21:27:51 +00:00
|
|
|
my ( $sql1, $sth1, $rv1, $h1 );
|
|
|
|
my ( %fieldvals, %where );
|
|
|
|
my ( $diffs, $updates ) = ( 0, 0 );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Get the enclosures associated with this feed
|
|
|
|
#
|
|
|
|
$sql1 = q{
|
|
|
|
SELECT * FROM episodes
|
|
|
|
WHERE urls_id = (SELECT id FROM urls WHERE title = ?)
|
|
|
|
};
|
|
|
|
$sth1 = $dbh->prepare($sql1);
|
|
|
|
$rv1 = $sth1->execute( $uridata->{URI} );
|
|
|
|
if ( $dbh->err ) {
|
|
|
|
warn $dbh->errstr;
|
|
|
|
}
|
|
|
|
$h1 = $sth1->fetchrow_hashref;
|
|
|
|
}
|
|
|
|
|
2023-01-24 22:44:11 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: textFormat
|
|
|
|
# PURPOSE: Formats a block of text in an indented, wrapped style with
|
|
|
|
# a label in the left column
|
|
|
|
# PARAMETERS: $text The text to be formatted, as a scalar string
|
|
|
|
# $tag The label to be added to the left of the top
|
|
|
|
# line
|
|
|
|
# $align Tag alignment, 'L' for left, otherwise right
|
|
|
|
# $lmargin Width of the left margin (assumed to be big
|
|
|
|
# enough for the tag)
|
|
|
|
# $textwidth The width of all of the text plus left margin
|
|
|
|
# (i.e. the right margin)
|
|
|
|
# RETURNS: The formatted result as a string
|
|
|
|
# DESCRIPTION: Chops the incoming text into words (thereby removing any
|
|
|
|
# formatting). Removes any leading spaces. Loops through the
|
|
|
|
# wordlist building them into lines of the right length to fit
|
|
|
|
# between the left and right margins. Saves the lines in an
|
|
|
|
# array. Adds the tag to the first line with the alignment
|
|
|
|
# requested then returns the array joined into a string.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: Inspired by Text::Format but *much* simpler. In fact T::F is
|
|
|
|
# a nasty thing to have to use; I couldn't get it to do what
|
|
|
|
# this routine does.
|
|
|
|
# TODO Make the routine more resilient to silly input values.
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub textFormat {
|
|
|
|
my ( $text, $tag, $align, $lmargin, $textwidth ) = @_;
|
|
|
|
|
|
|
|
my ( $width, $word );
|
|
|
|
my ( @words, @buff, @wrap );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Build the tag early. If there's no text we'll just return the tag.
|
|
|
|
#
|
|
|
|
$tag = sprintf( "%*s",
|
|
|
|
( $align =~ /L/i ? ( $lmargin - 1 ) * -1 : $lmargin - 1 ), $tag );
|
|
|
|
|
|
|
|
return $tag unless $text;
|
|
|
|
|
|
|
|
$text =~ s/(^\s+|\s+$)//g;
|
|
|
|
return $tag unless $text;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Chop up the incoming text removing leading spaces
|
|
|
|
#
|
|
|
|
@words = split( /\s+/, $text );
|
|
|
|
shift(@words) if ( @words && $words[0] eq '' );
|
|
|
|
|
|
|
|
#
|
|
|
|
# Compute the width of the active text
|
|
|
|
#
|
|
|
|
$width = $textwidth - $lmargin;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Format the words into lines with a blank left margin
|
|
|
|
#
|
|
|
|
while ( defined( $word = shift(@words) ) ) {
|
|
|
|
if ( length( join( ' ', @buff, $word ) ) < $width ) {
|
|
|
|
push( @buff, $word );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
push( @wrap, ' ' x $lmargin . join( ' ', @buff ) );
|
|
|
|
@buff = ($word);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Append any remainder
|
|
|
|
#
|
|
|
|
push( @wrap, ' ' x $lmargin . join( ' ', @buff ) ) if @buff;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Insert the tag into the first line
|
|
|
|
#
|
|
|
|
substr( $wrap[0], 0, $lmargin - 1 ) = $tag;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Return the formatted array as a string
|
|
|
|
#
|
|
|
|
return join( "\n", @wrap );
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: equal
|
|
|
|
# PURPOSE: Compare two strings even if undefined
|
|
|
|
# PARAMETERS: $s1 The first string
|
|
|
|
# $s2 The second string
|
|
|
|
# RETURNS: True if both strings are undefined, false if one isn't
|
|
|
|
# defined, otherwise the result of comparing them.
|
|
|
|
# DESCRIPTION: Works on the principle that two undefined strings are equal,
|
|
|
|
# a defined and an undefined string are not, and otherwise they
|
|
|
|
# are equal if they are equal!
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub equal {
|
|
|
|
my ( $s1, $s2 ) = @_;
|
|
|
|
|
|
|
|
return 1 if ( !defined($s1) && !defined($s2) );
|
|
|
|
return 0 if ( !defined($s1) || !defined($s2) );
|
|
|
|
|
|
|
|
return ( $s1 eq $s2 );
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: clean_string
|
|
|
|
# PURPOSE: Clean a string of non-printables, newlines, multiple spaces
|
|
|
|
# PARAMETERS: $str The string to process
|
|
|
|
# RETURNS: The processed string
|
|
|
|
# DESCRIPTION: Removes leading and trailing spaces. Removes all non-printable
|
|
|
|
# characters. Removes all CR/LF sequences. Removes multiple
|
|
|
|
# spaces.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO:
|
|
|
|
#===============================================================================
|
|
|
|
sub clean_string {
|
|
|
|
my ($str) = @_;
|
|
|
|
|
|
|
|
$str =~ s/(^\s+|\s+$)//g;
|
|
|
|
$str =~ tr/[[:graph:]]//c;
|
|
|
|
$str =~ tr/\x0A\x0D/ /;
|
|
|
|
$str =~ tr/ / /s;
|
|
|
|
|
|
|
|
return $str;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: normaliseDT
|
|
|
|
# PURPOSE: Normalise an ISO8601 date for comparison, etc.
|
|
|
|
# PARAMETERS: $dt a DateTime object
|
|
|
|
# RETURNS: The DateTime object formatted as an ISO8601 string
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub normaliseDT {
|
|
|
|
my ($dt) = @_;
|
|
|
|
|
|
|
|
my $p = DateTime::Format::SQLite->new();
|
|
|
|
|
|
|
|
return (
|
|
|
|
defined($dt)
|
|
|
|
? $p->format_datetime(
|
|
|
|
DateTime::Format::ISO8601->parse_datetime($dt)
|
|
|
|
)
|
|
|
|
: undef
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: flattenArray
|
|
|
|
# PURPOSE: Turns an arrayref into a simple list in a string
|
|
|
|
# PARAMETERS: $item - the item that may be an arrayref
|
|
|
|
# RETURNS: The plain item if it's not an array otherwise the flattened
|
|
|
|
# version
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub flattenArray {
|
|
|
|
my ($item) = @_;
|
|
|
|
|
|
|
|
my $result;
|
|
|
|
if ( ref( $item ) eq 'ARRAY' ) {
|
|
|
|
$result = join(", ",@{$item});
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$result = $item;
|
|
|
|
}
|
|
|
|
|
|
|
|
return $item;
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: coalesce
|
|
|
|
# PURPOSE: To find the defined argument and return it
|
|
|
|
# PARAMETERS: Arbitrary number of arguments
|
|
|
|
# RETURNS: The first defined argument
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub coalesce {
|
|
|
|
foreach (@_) {
|
|
|
|
return $_ if defined($_);
|
|
|
|
}
|
|
|
|
return; # undef
|
|
|
|
}
|
|
|
|
|
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: emit
|
|
|
|
# PURPOSE: Print text on STDERR unless silent mode has been selected
|
|
|
|
# PARAMETERS: - Boolean indicating whether to be silent or not
|
|
|
|
# - list of arguments to 'print'
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
|
|
|
|
# a message to STDERR depending on a boolean. We need this to be
|
|
|
|
# able to make the script silent when the -silent option is
|
|
|
|
# selected
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub emit {
|
|
|
|
unless (shift) {
|
|
|
|
print STDERR @_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-14 23:13:49 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: _debug
|
|
|
|
# PURPOSE: Prints debug reports
|
|
|
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
|
|
|
# $messages... Arbitrary list of messages to print
|
|
|
|
# RETURNS: Nothing
|
|
|
|
# DESCRIPTION: Outputs messages if $active is true. It removes any trailing
|
|
|
|
# newline from each one and then adds one in the 'print' to the
|
|
|
|
# caller doesn't have to bother. Prepends each message with 'D>'
|
|
|
|
# to show it's a debug message.
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: Differs from other functions of the same name
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub _debug {
|
|
|
|
my $active = shift;
|
|
|
|
|
|
|
|
my $message;
|
|
|
|
return unless $active;
|
|
|
|
|
|
|
|
while ($message = shift) {
|
|
|
|
chomp($message);
|
|
|
|
print STDERR "D> $message\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
#=== FUNCTION ================================================================
|
|
|
|
# NAME: Options
|
|
|
|
# PURPOSE: Processes command-line options
|
|
|
|
# PARAMETERS: $optref Hash reference to hold the options
|
|
|
|
# RETURNS: Undef
|
|
|
|
# DESCRIPTION:
|
|
|
|
# THROWS: No exceptions
|
|
|
|
# COMMENTS: None
|
|
|
|
# SEE ALSO: N/A
|
|
|
|
#===============================================================================
|
|
|
|
sub Options {
|
|
|
|
my ($optref) = @_;
|
|
|
|
|
|
|
|
my @options = (
|
2023-02-19 19:54:25 +00:00
|
|
|
"help", "manpage", "debug=i", "dry-run!",
|
|
|
|
"silent!", "load", "delete", "input=s",
|
|
|
|
"scan!", "refresh!", "expire!", "report:s",
|
|
|
|
"ignore-case!", "html!", "check:s", "json:s",
|
|
|
|
"opml:s", "config=s", "out=s", "rejects:s",
|
|
|
|
"template:s",
|
2022-11-19 21:27:51 +00:00
|
|
|
);
|
|
|
|
|
|
|
|
if ( !GetOptions( $optref, @options ) ) {
|
|
|
|
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
__END__
|
|
|
|
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
# Application Documentation
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
#{{{
|
|
|
|
|
|
|
|
|
|
|
|
=head1 NAME
|
|
|
|
|
|
|
|
feedWatcher - watch a collection of podcast feeds
|
|
|
|
|
|
|
|
=head1 VERSION
|
|
|
|
|
2023-02-19 19:54:25 +00:00
|
|
|
This documentation refers to I<feedWatcher> version 0.1.4
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
|
|
|
|
=head1 USAGE
|
|
|
|
|
|
|
|
feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
|
2023-01-09 18:20:17 +00:00
|
|
|
[-check[=mode]] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
|
2022-11-19 21:27:51 +00:00
|
|
|
[-[no]silent] [-config=FILE] [-debug=N] [URL ...]
|
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
# Load URLs from a file, perform checks, save the rejects and redirect output
|
|
|
|
# to a named file. Uses an alias:
|
|
|
|
# alias isostamp='date +"%Y%m%d_%H%M%S"'
|
|
|
|
#
|
|
|
|
./feedWatcher -load=feedWatcher_dumped_URLs.txt -check=auto \
|
|
|
|
-rej=output/rejects_$(isostamp).out > output/load_$(isostamp).out 2>&1
|
2023-01-09 18:20:17 +00:00
|
|
|
|
|
|
|
# Generate Markdown output with a template writing to a named file
|
|
|
|
./feedWatcher -tem=feedWatcher_3.tpl -out=feedWatcher.mkd
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
=head1 ARGUMENTS
|
|
|
|
|
|
|
|
Arguments are optional and may consist of an arbitrarily long list of URLs to
|
|
|
|
be processed and possibly added to the database by the script.
|
|
|
|
|
|
|
|
These URLs are prepended to any which may be provided through the
|
|
|
|
B<-load=FILE> option described below. The script makes sure the URL collection
|
|
|
|
contains no duplicates.
|
|
|
|
|
|
|
|
=head1 OPTIONS
|
|
|
|
|
|
|
|
=over 8
|
|
|
|
|
|
|
|
=item B<-help>
|
|
|
|
|
|
|
|
Prints a brief help message describing the usage of the program, and then exits.
|
|
|
|
|
|
|
|
The full documentation can be displayed with the command:
|
|
|
|
|
|
|
|
perldoc -oman feedWatcher
|
|
|
|
|
|
|
|
Alternatively, use the option B<-manpage>
|
|
|
|
|
|
|
|
=item B<-manpage>
|
|
|
|
|
|
|
|
Prints the entire documentation for the script. Note that this mode uses a
|
|
|
|
simpler type of formatter than I<perldoc>.
|
|
|
|
|
|
|
|
=item B<-load=FILE>
|
|
|
|
|
|
|
|
Defines a file from which new URLs are to be added to the database. These URLs
|
|
|
|
are checked in various ways before adding to the database. If arguments are
|
|
|
|
provided when the script is invoked these URLs are appended to the argument list.
|
|
|
|
|
|
|
|
=item B<-delete=FILE>
|
|
|
|
|
|
|
|
Defines a file from which a list of URLs is to be read which are to be
|
|
|
|
deleted from the database.
|
|
|
|
|
|
|
|
Note that it is possible (though inadvisable) to both add and delete an URL in
|
|
|
|
the same run of the script. The URL will first be added (from the load file or
|
|
|
|
argument list) then deleted. This is a pointless exercise which wastes
|
|
|
|
bandwidth, so don't do it!
|
|
|
|
|
|
|
|
=item B<-[no]scan>
|
|
|
|
|
|
|
|
This option (B<-scan>) causes the URLs stored in the database to be scanned
|
|
|
|
and updated. The negated form, which is also the default behaviour of the
|
|
|
|
script, (B<-noscan>) omits the scan.
|
|
|
|
|
|
|
|
NOTE: This function is not implemented yet.
|
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
=item B<-[no]silent>
|
|
|
|
|
|
|
|
This option controls the amount of output written by the script. In
|
|
|
|
B<nosilent> mode the script reports on the processing of each URL it receives,
|
|
|
|
which can be fairly verbose. This can be turned off with this option, though
|
|
|
|
it is often wiser to redirect the output for later review rather than to
|
|
|
|
suppress it.
|
|
|
|
|
2022-11-19 21:27:51 +00:00
|
|
|
=item B<-out=FILE>
|
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
This option defines an output file to receive output from the reporting
|
2023-01-09 18:20:17 +00:00
|
|
|
functions. If the option is omitted the data is written to STDOUT, allowing it
|
2023-01-11 09:45:38 +00:00
|
|
|
to be redirected if required. See the 'Usage' section above for an example of
|
|
|
|
how transactional output can be redirected.
|
|
|
|
|
|
|
|
=item B<-check[=MODE]>
|
|
|
|
|
|
|
|
This option (B<-check[=MODE]>) controls the mode used to check
|
|
|
|
the copyright setting of the current feed and deciide whether to add it.
|
|
|
|
|
|
|
|
Possible settings are: B<auto>, B<manual> and B<none>.
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
|
|
|
|
=item B<-check=auto> or B<-check>
|
|
|
|
|
|
|
|
An automatic check is made against a series of regular expressions looking for
|
|
|
|
something in the I<copyright> field which signifies that the feed is under
|
|
|
|
a Creative Commons licence. A blank field is currently considered to denote
|
|
|
|
this type of licence.
|
|
|
|
|
|
|
|
The option may be written as B<-check> when it is interpreted as B<-check=auto>
|
|
|
|
|
|
|
|
=item B<-check=manual>
|
|
|
|
|
|
|
|
In this mode the script pauses after processing each feed to ask the script
|
|
|
|
user to check that it's OK to add it. The script reports the I<copyright>
|
|
|
|
field and requests a I<y> or I<n> response.
|
|
|
|
|
|
|
|
=item B<-check=manual>
|
|
|
|
|
|
|
|
In this mode no checks are performed.
|
|
|
|
|
|
|
|
=back
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
=item B<-report[=title]>
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
This option (B<-report[=title]>) causes a fairly simplistic report to be
|
|
|
|
generated to enable the database contents to be examined. The I<title>
|
|
|
|
argument specifies a case-sensitive feed title or component of such a title.
|
|
|
|
So, for instance B<-report=Hacker> currently reports on the batabase data
|
|
|
|
relating to the "Hacker Public Radio" feed.
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
If the argument is omitted the entire database is reported.
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
Reports consist of the details of the RSS (or Atom) channel with other
|
|
|
|
information about the site hosting the feed such as the IP address. The latest
|
|
|
|
episode in the feed is also reported.
|
2022-11-19 21:27:51 +00:00
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
Note that the feed information in the database is a snapshot made when the
|
|
|
|
feed details were last loaded. This is static information and does not get
|
|
|
|
updatedunless the feed is deleted and reloaded, or the B<-scan> function is run
|
|
|
|
(not currently available).
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
=item B<-json[=FILE]>
|
|
|
|
|
|
|
|
This option, which may be omitted, defines the location where the feed and
|
|
|
|
episode details are to be written. If omitted no JSON data is written.
|
|
|
|
|
|
|
|
If the option is given as B<-json=FILE> the data is written to the nominated
|
|
|
|
file.
|
|
|
|
|
|
|
|
If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
|
|
|
|
used.
|
|
|
|
|
|
|
|
=item B<-opml[=FILE]>
|
|
|
|
|
|
|
|
This option, which may be omitted, defines the location where the feed details
|
|
|
|
are to be written. If omitted no OPML data is written.
|
|
|
|
|
|
|
|
If the option is given as B<-opml=FILE> the data is written to the nominated
|
|
|
|
file.
|
|
|
|
|
|
|
|
If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
|
|
|
|
used.
|
|
|
|
|
2023-01-11 09:45:38 +00:00
|
|
|
=item B<-template[=FILE]>
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
This option defines the template used to generate a form of the feed data. The
|
2023-01-11 09:45:38 +00:00
|
|
|
template is written using the B<Template Toolkit> language.
|
2022-11-19 21:27:51 +00:00
|
|
|
|
|
|
|
If the file name is omitted then the script uses the file B<feedWatcher.tpl>
|
|
|
|
in the same directory as the script. If this file does not exist then the
|
|
|
|
script will exit with an error message.
|
|
|
|
|
|
|
|
For convenience B<feedWatcher.tpl> can be made a soft link which points to the
|
|
|
|
file which is the current default. This allows the development of versions
|
|
|
|
without changing the usual way this script is run.
|
|
|
|
|
|
|
|
=item B<-config=FILE>
|
|
|
|
|
|
|
|
This option allows an alternative configuration file to be used. This file
|
|
|
|
defines the location of the database, its name and the username and
|
|
|
|
password to be used to access it. This feature permits a test database to be
|
|
|
|
used, or for two more sets of feeds to be processed.
|
|
|
|
|
|
|
|
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
|
|
|
|
|
|
|
|
If the option is omitted the default file is used: B<feedWatcher.cfg>
|
|
|
|
|
|
|
|
=item B<-debug=N>
|
|
|
|
|
|
|
|
This option selects the debug level, resulting in a lot of output.
|
|
|
|
|
|
|
|
0 (the default) No debug output
|
|
|
|
1 Dumps the list of feeds found in an HTML download.
|
|
|
|
Also shows the SQL query which will result in the insertion of a new
|
|
|
|
row into the database table I<urls>.
|
|
|
|
2 Dumps the collected data which is destined to be written to the database
|
|
|
|
3 Dumps the contents of feeds during analysis
|
|
|
|
|
|
|
|
=back
|
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
|
|
|
|
A full description of the application and its features.
|
|
|
|
May include numerous subsections (i.e. =head2, =head3, etc.)
|
|
|
|
|
|
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
|
|
|
|
|
|
|
A list of every error and warning message that the application can generate
|
|
|
|
(even the ones that will "never happen"), with a full explanation of each
|
|
|
|
problem, one or more likely causes, and any suggested remedies. If the
|
|
|
|
application generates exit status codes (e.g. under Unix) then list the exit
|
|
|
|
status associated with each error.
|
|
|
|
|
|
|
|
|
|
|
|
=head1 CONFIGURATION AND ENVIRONMENT
|
|
|
|
|
|
|
|
A configuration file is used by the script. By default the name is
|
|
|
|
B<feedWatcher.cfg> (though, what is searched for is derived from the name of
|
|
|
|
the script). An alternative configuration file can be provided, specified
|
|
|
|
through the B<-config=FILE> option.
|
|
|
|
|
|
|
|
The file conforms to the format managed by the B<Config::General> module:
|
|
|
|
|
|
|
|
<database>
|
|
|
|
type = SQLite
|
|
|
|
file = feedWatcher.db
|
|
|
|
user =
|
|
|
|
password =
|
|
|
|
</database>
|
|
|
|
|
|
|
|
The B<type> value should be I<SQLite>, though it might be possible to use
|
|
|
|
other database types.
|
|
|
|
|
|
|
|
The B<file> value is the name of the database, which can include the
|
|
|
|
directory path.
|
|
|
|
|
|
|
|
The B<user> and B<password> values are not currently required, but if
|
|
|
|
authentication to the database was required it would be implemented this way.
|
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
|
|
|
|
Config::General
|
|
|
|
DBI
|
|
|
|
Data::Dumper
|
|
|
|
DateTime::Format::ISO8601
|
|
|
|
DateTime::Format::SQLite
|
|
|
|
Feed::Find
|
|
|
|
Getopt::Long
|
|
|
|
HTML::Entities
|
|
|
|
IO::Prompter
|
|
|
|
IO::Socket
|
|
|
|
JSON
|
|
|
|
LWP::UserAgent
|
|
|
|
List::MoreUtils qw{uniq}
|
|
|
|
Log::Handler
|
|
|
|
Net::DNS
|
|
|
|
Pod::Usage
|
|
|
|
SQL::Abstract
|
|
|
|
Template
|
|
|
|
Template::Filters
|
|
|
|
Try::Tiny
|
|
|
|
URI
|
|
|
|
WWW::RobotRules
|
|
|
|
XML::Feed
|
|
|
|
XML::RSS::Parser
|
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
|
|
|
|
|
|
|
There are no known bugs in this module.
|
|
|
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
|
|
|
|
Patches are welcome.
|
|
|
|
|
|
|
|
=head1 AUTHOR
|
|
|
|
|
|
|
|
Dave Morriss (Dave.Morriss@gmail.com) 2020
|
|
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT
|
|
|
|
|
|
|
|
Copyright (c) Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
|
|
|
|
|
|
|
|
This program is free software. You can redistribute it and/or modify it under
|
|
|
|
the same terms as perl itself.
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
|
|
|
#}}}
|
|
|
|
|
|
|
|
# [zo to open fold, zc to close]
|
|
|
|
|
|
|
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|