Free_Culture_Podcasts/feedWatcher

3294 lines
101 KiB
Plaintext
Raw Normal View History

2022-11-19 21:27:51 +00:00
#!/usr/bin/env perl
#===============================================================================
#
# FILE: feedWatcher
#
# USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
# [-report[=title]] [-check[=mode]] [-out=FILE]
# [-rejects[=FILE]] [-json[=FILE]] [-opml[=FILE]]
# [-template[=FILE]] [-[no]silent] [-debug=N]
2022-11-19 21:27:51 +00:00
# [URL ...]
#
# DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
# podcasts. Ken's script was inspired by Dann Washko's site at
# http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
# August 2012 where it was used to generate handouts.
#
# The script has not been developed since 2014, but is now in
# use in 2020 helping to prepare podcast information for
# a FOSDEM visit under the heading of "Free Culture Podcasts".
# See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
# (created with Pandoc) in the same directory as this script for
# details of what has been done to develop the original design.
#
2023-01-11 09:45:38 +00:00
# Further development is taking place in 2022/2023, again for
# FOSDEM.
#
2022-11-19 21:27:51 +00:00
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.1.4
2022-11-19 21:27:51 +00:00
# CREATED: 2013-12-25 12:40:33
# REVISION: 2023-01-31 20:45:23
2022-11-19 21:27:51 +00:00
#
#-------------------------------------------------------------------------------
# Released under the terms of the GNU Affero General Public License (AGPLv3)
# http://www.gnu.org/licenses/agpl-3.0.html
#-------------------------------------------------------------------------------
#
#===============================================================================
use 5.030;
use strict;
use warnings;
use utf8;
use feature qw{ postderef say signatures state };
no warnings qw{ experimental::postderef experimental::signatures } ;
2023-01-11 09:45:38 +00:00
#
# There's an issue in XML::RSS, so we're using a local version with a hack.
# It's in ./lib/ and FindBin::libs looks there to find it.
2023-01-11 09:45:38 +00:00
#
2023-01-10 20:22:47 +00:00
use FindBin::libs;
use XML::RSS;
2022-11-19 21:27:51 +00:00
use Getopt::Long;
use Pod::Usage;
use Config::General;
use List::MoreUtils qw{uniq};
use Set::Array;
2022-11-19 21:27:51 +00:00
use Log::Handler;
use Try::Tiny;
use URI;
use Net::DNS;
use IO::Socket;
use LWP::UserAgent;
use WWW::RobotRules;
2023-01-10 20:22:47 +00:00
#use XML::RSS::Parser;
2022-11-19 21:27:51 +00:00
use XML::Feed;
use XML::FeedPP; # Fall back?
2022-11-19 21:27:51 +00:00
use Feed::Find;
use Template;
use Template::Filters;
Template::Filters->use_html_entities; # Use HTML::Entities in the template
use HTML::Entities;
use HTML::Parser ();
2022-11-19 21:27:51 +00:00
use IO::Prompter;
use JSON;
use DBI;
use DateTime::Format::SQLite;
use DateTime::Format::ISO8601;
use SQL::Abstract;
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.1.4';
2022-11-19 21:27:51 +00:00
#
# Script name
#
( my $PROG = $0 ) =~ s|.*/||mx;
#
# Declarations
#
my ( $action_mode, @urls, @deletions );
my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );
my ( $sth1, $h1, $rv, $search_target, $rejectcount );
my ( $loadfile, $deletefile ) = ( '', '' );
2022-11-19 21:27:51 +00:00
my $feeds;
#
# To be written by a handler subroutine called by HTML::Parser
#
our $html_title;
2022-11-19 21:27:51 +00:00
#
# File and directory paths
#
my $basedir = "$ENV{HOME}/HPR/feed_watcher";
my $tempdir = "$basedir/tmp";
my $configfile = "$basedir/$PROG.cfg";
my $logfile = "$basedir/${PROG}.log";
my $deftemplate = "$basedir/${PROG}.tpl";
#
# Hash key map to database field names (chevron comments are to stop Perltidy
# messing with the layout)
#
#<<<
my %keymap = (
AUTHOR => 'author',
COPYRIGHT => 'copyright',
CHECKTYPE => 'check_type',
2022-11-19 21:27:51 +00:00
DESCRIPTION => 'description',
DNS => 'dns',
# ENCLOSURES => undef,
# ENCLOSURE_COUNT => undef,
FORMAT => 'feedformat',
GENERATOR => 'generator',
# HOST => undef,
HOSTUP => 'host_up',
HTTP_CONTENT_TYPE => 'content_type',
HTTP_STATUS => 'http_status',
IMAGE => 'image',
LANGUAGE => 'language',
LINK => 'link',
MODIFIED => 'modified',
# PORT => undef,
REASON_ACCEPTED => 'reason_accepted',
2022-11-19 21:27:51 +00:00
# ROBOTS => undef,
# SAVE => undef,
TITLE => 'title',
TYPE => 'urltype',
URI => 'url',
# URI_ID => undef,
);
#>>>
#
# Headers for LWP::UserAgent
#
my %headers = (
'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
. 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
. 'image/pjpeg, image/png, */*',
'Accept-Charset' => 'iso-8859-1,*,utf-8',
'Accept-Language' => 'en-US',
);
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#
# Options and arguments
#
my %options;
Options( \%options );
#
# Any arguments are taken to be URLs
#
@urls = @ARGV;
2022-11-19 21:27:51 +00:00
#
# Default help
#
pod2usage( -msg => "Version $VERSION\n", -exitval => 1, -verbose => 0 )
2022-11-19 21:27:51 +00:00
if ( $options{'help'} );
#
# Detailed help
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
2022-11-19 21:27:51 +00:00
if ( $options{'manpage'} );
#
# Collect options
#
my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );
my $cfgfile
= ( defined( $options{config} ) ? $options{config} : $configfile );
my $dry_run = ( defined( $options{'dry-run'} ) ? $options{'dry-run'} : 0 );
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
my $load = $options{'load'};
my $delete = $options{'delete'};
my $inputfile = $options{'input'};
2022-11-19 21:27:51 +00:00
2023-01-24 22:44:11 +00:00
my $scan = ( defined( $options{scan} ) ? $options{scan} : 0 );
my $refresh = ( defined( $options{refresh} ) ? $options{refresh} : 0 );
my $expire = ( defined( $options{expire} ) ? $options{expire} : 0 );
2023-01-24 22:44:11 +00:00
my $html = ( defined( $options{html} ) ? $options{html} : 0 );
my $ignore_case
= ( defined( $options{'ignore-case'} ) ? $options{'ignore-case'} : 0 );
2022-11-19 21:27:51 +00:00
my $check = $options{check};
my $outfile = $options{out};
my $rejectfile = $options{rejects};
my $report = $options{report};
my $json = $options{json};
my $opml = $options{opml};
my $template = $options{template};
2022-11-19 21:27:51 +00:00
#-------------------------------------------------------------------------------
# Validate and process options
#-------------------------------------------------------------------------------
2022-11-19 21:27:51 +00:00
#
# Sanity
2022-11-19 21:27:51 +00:00
#
die "Choose either -load or -delete, not both\n"
if (defined($load) && defined($delete));
2023-01-24 22:44:11 +00:00
die "Options -load and -delete should not be combined with -scan or -refresh\n"
if ( ( defined($load) || defined($delete) ) && ( $scan || $refresh ) );
#
# Check the -input=FILE option is used with -load or -delete and confirm the
# existence and readability of the input file if specified.
#
if ( defined($inputfile) ) {
die "Option -input=FILE must be used with -load or -delete\n"
unless ($load || $delete);
die "File in '-input=$inputfile' does not exist\n"
unless -e $inputfile;
die "File in '-input=$inputfile' is not readable\n"
unless -r $inputfile;
}
2022-11-19 21:27:51 +00:00
#
# Check the configuration file
2022-11-19 21:27:51 +00:00
#
die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );
2022-11-19 21:27:51 +00:00
#
# Determine the action mode, reading a file and/or using URLs on the command
# line.
2022-11-19 21:27:51 +00:00
#
if ($load) {
$action_mode = 'load';
$loadfile = $inputfile;
_debug(
$DEBUG > 0,
"Action mode: $action_mode",
( defined($loadfile)
? "File to load: $loadfile"
: "Load from arguments"
)
);
}
elsif ($delete) {
$action_mode = 'delete';
$deletefile = $inputfile;
_debug(
$DEBUG > 0,
"Action mode: $action_mode",
( defined($deletefile)
? "File to delete from: $deletefile"
: "Delete from arguments"
)
);
}
else {
$action_mode = 'none';
2022-11-19 21:27:51 +00:00
}
#
# The copyright checking mode defaults to 'auto' if the option has no value,
# or may be 'manual' or 'none'. If the option is not used at all it defaults
# to 'none'. It's only relevant to the 'load' action though.
#
if ( $action_mode eq 'load' ) {
# if ( $action_mode eq 'load' && defined($check) ) {
if ( defined($check) ) {
$check =~ s/(^\s+|\s+$)//g;
if ($check =~ /^$/) {
$check = "auto";
}
else {
$check = lc($check);
die "Invalid option '-check=$check'\n" .
"Values are <blank>, auto and manual\n"
unless ($check =~ /^(auto|manual|none)$/)
}
}
else {
$check = 'none';
}
emit($silent,"Copyright check mode = $check\n");
}
if ($dry_run) {
emit( $silent, "Dry run mode = On\n" );
emit( $silent, "----\n" );
}
# TODO: Does it make sense to have -load and -report, etc at the same time?
2022-11-19 21:27:51 +00:00
#
# We accept -report, meaning report everything or -report='title' to report
# just the feed with the given title (actually, the title which contains the
# given string).
2022-11-19 21:27:51 +00:00
#
if ( defined($report) ) {
if ($report =~ /^$/) {
$search_target = undef;
}
else {
$search_target = $report;
}
}
2022-11-19 21:27:51 +00:00
#
# We accept -json or -json=filename. In the former case we make a default
# name, otherwise we use the one provided.
#
if ( defined($json) ) {
if ($json =~ /^$/) {
$json = "$PROG.json";
2022-11-19 21:27:51 +00:00
}
}
#
# We accept -opml or -opml=filename. In the former case we make a default
# name, otherwise we use the one provided.
#
if ( defined($opml) ) {
if ($opml =~ /^$/) {
$opml = "$PROG.opml"
}
}
#
# Similarly we accept -template or -template=filename. In the former case we
# make a default name, otherwise we use the one provided.
#
if ( defined($template) ) {
if ($template =~ /^$/) {
$template = "$deftemplate";
}
die "Error: Unable to find template $template\n" unless -r $template;
}
#-------------------------------------------------------------------------------
2022-11-19 21:27:51 +00:00
# Load configuration data
#-------------------------------------------------------------------------------
2022-11-19 21:27:51 +00:00
my $conf = new Config::General(
-ConfigFile => $cfgfile,
-InterPolateVars => 1,
-ExtendedAccess => 1
);
my %config = $conf->getall();
#-------------------------------------------------------------------------------
2022-11-19 21:27:51 +00:00
# Connect to the database
#-------------------------------------------------------------------------------
2022-11-19 21:27:51 +00:00
my $dbtype = $config{database}->{type};
my $dbfile = $config{database}->{file};
my $dbuser = $config{database}->{user};
my $dbpwd = $config{database}->{password};
my $dbh
= DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
{ AutoCommit => 1, sqlite_unicode => 1, } )
or die $DBI::errstr;
#
# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
# foreign key constraints to function)
#
$dbh->do('PRAGMA foreign_keys = ON');
#
# Check we have something to do. NOTE: this check is a bit early because we
# haven't read the input file yet, if there is one.
2022-11-19 21:27:51 +00:00
#
my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
my $work = (
( ( scalar(@urls) > 0 || defined($inputfile) )
&& $action_mode =~ /load|delete/
)
|| ( defined($report)
|| defined($json)
|| defined($opml)
|| defined($template) )
|| ( ( $scan || $refresh ) && $rows > 0 )
2022-11-19 21:27:51 +00:00
);
unless ($work) {
print STDERR "Nothing to do!\n";
exit;
}
2022-11-19 21:27:51 +00:00
#-------------------------------------------------------------------------------
2023-01-27 20:33:31 +00:00
# Set up logging keeping the default log layout except for the date. Enable
# 'utf-8' mode (documented in Log::Handler::Output::File).
2022-11-19 21:27:51 +00:00
#-------------------------------------------------------------------------------
my $LOG = Log::Handler->new();
$LOG->add(
file => {
timeformat => "%Y/%m/%d %H:%M:%S",
filename => $logfile,
minlevel => 0,
maxlevel => 7,
2023-01-27 20:33:31 +00:00
'utf-8' => 1,
2022-11-19 21:27:51 +00:00
}
);
#-------------------------------------------------------------------------------
# Open the output file (or STDOUT) - we may need the date to do it
#-------------------------------------------------------------------------------
my $outfh;
if ($outfile) {
open( $outfh, ">:encoding(UTF-8)", $outfile )
or warn "Unable to open $outfile for writing: $!";
}
else {
open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
or warn "Unable to initialise for writing: $!";
}
#-------------------------------------------------------------------------------
# Open the rejects file if requested, otherwise we don't write reject data
#-------------------------------------------------------------------------------
my $rejectfh;
if ($rejectfile) {
if ($rejectfile =~ /^$/) {
$rejectfile = "${PROG}_rejected_URLs.txt";
}
open( $rejectfh, ">:encoding(UTF-8)", $rejectfile )
or warn "Unable to open $rejectfile for writing: $!";
$rejectcount = 0;
}
2022-11-19 21:27:51 +00:00
#
# Set up a robot.txt rules parser
#
$rules = WWW::RobotRules->new($robot_name);
#-------------------------------------------------------------------------------
# Check the mode we are in and prepare to load or delete according to the
# answer.
#-------------------------------------------------------------------------------
if ($action_mode eq 'load') {
2022-11-19 21:27:51 +00:00
#
# Slurp the load file into @urls if the file is provided
2022-11-19 21:27:51 +00:00
#
if ($loadfile) {
#
# Load the input file
#
open( my $in, '<:encoding(utf8)', $loadfile )
or die "$PROG : failed to open load file '$loadfile' : $!\n";
chomp( my @loaded = <$in> );
close($in)
or warn "$PROG : failed to close load file '$loadfile' : $!\n";
2022-11-19 21:27:51 +00:00
$LOG->info("Loaded URLs from $loadfile");
2022-11-19 21:27:51 +00:00
#
# Add the loaded URLs to the array
#
push( @urls, @loaded );
}
2022-11-19 21:27:51 +00:00
#
# Now, we either have URLs from the command line, or from the load file (or
# both), so we process these.
2022-11-19 21:27:51 +00:00
#
# It's a loop because 'loadUrls' might find some more URLs by scanning HTML
# URLs if given them. If it does we replace @urls with the found URLs and
# go again. When there's nothing returned the loop stops.
# ----
# NOTE: This seems dirty, but all the 'while' is testing is whether the array
# contains anything or not. It's not iterating over it or anything, which would
# be messy!
2022-11-19 21:27:51 +00:00
#
while (@urls) {
#
# Remove duplicates, finish if it deletes them all!
#
@urls = uniq(@urls);
last unless @urls;
2022-11-19 21:27:51 +00:00
#
# Remove any commented out lines, finish if it deletes them all!
#
@urls = grep {!/^\s*#/} @urls;
last unless @urls;
2022-11-19 21:27:51 +00:00
$LOG->info( "Loading ", scalar(@urls), " URLs to the database" );
2022-11-19 21:27:51 +00:00
#
# Load these URLs as appropriate, returning any more that we find by
# following HTML urls. We overwrite the original list and start all over
# again.
#
@urls = loadUrls( $dbh, \@urls, $rules, \%keymap, $dry_run );
}
}
elsif ( $action_mode eq 'delete' ) {
2022-11-19 21:27:51 +00:00
#
# Process the delete file if there is one
2022-11-19 21:27:51 +00:00
#
if ($deletefile) {
#
# Load the delete file
#
open( my $del, '<:encoding(utf8)', $deletefile )
or die "$PROG : failed to open load file '$deletefile' : $!\n";
chomp( @deletions = <$del> );
close($del)
or warn
"$PROG : failed to close delete file '$deletefile' : $!\n";
2022-11-19 21:27:51 +00:00
#
# Add the loaded URLs to the array
#
push( @urls, @deletions );
2022-11-19 21:27:51 +00:00
#
# Remove duplicates
#
@urls = uniq(@urls);
}
2022-11-19 21:27:51 +00:00
if (@urls) {
#
# TODO: check that these URLs are actually in the database! Seems
# silly to report "Failed to delete" when it's not there anyway!
#
#
# There are URLs to delete. Process them one by one.
#
if ($dry_run) {
emit( $silent,
"Would have deleted "
. scalar(@urls)
. " URLs (after checking)\n" );
}
else {
my @missing;
#
# Check the URLs exist
#
$sth1 = $dbh->prepare(q{SELECT id from urls WHERE url = ?});
foreach my $rec (@urls) {
$rv = $sth1->execute($rec);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h1 = $sth1->fetchrow_hashref;
unless ($h1) {
emit( $silent, "Could not find URL $rec in the database\n" );
$LOG->warning(
"Failed to delete '$rec'; not in the database");
push( @missing, $rec );
}
}
#
# Remove the missing URLs from @urls
#
if (@missing) {
my $sa1 = Set::Array->new(@urls);
my $sa2 = Set::Array->new(@missing);
@urls = $sa1->difference($sa2);
}
#
# If nothing is left we're done
#
unless (@urls) {
warn "No URLs left after cleaning\n";
}
else {
#
# Delete what's left after cleaning
#
$sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
foreach my $rec (@urls) {
$rv = $sth1->execute($rec);
if ( $dbh->err ) {
warn $dbh->errstr;
}
if ( $rv != 0 ) {
emit( $silent, "Deleted $rec ($rv rows)\n" );
$LOG->info("Deleted URL '$rec' from the database");
}
else {
emit( $silent, "Failed to delete $rec\n" );
$LOG->warning(
"Failed to delete '$rec' from the database");
}
}
2022-11-19 21:27:51 +00:00
}
}
}
}
#-------------------------------------------------------------------------------
# Perform a database scan if requested
# TODO: Needs to be developed; does nothing at the moment.
2022-11-19 21:27:51 +00:00
#-------------------------------------------------------------------------------
if ($scan) {
2023-01-24 22:44:11 +00:00
warn "Refresh is not implemented yet\n";
#$LOG->warning( "Scan is not fully implemented yet" );
2022-11-20 22:49:57 +00:00
# Testing. Processes the first two feeds
2023-01-24 22:44:11 +00:00
# TODO: Currently broken
#scanDB($dbh, \%keymap, $dry_run);
}
#-------------------------------------------------------------------------------
# Perform a feed refresh
#-------------------------------------------------------------------------------
if ($refresh) {
warn "Refresh is not implemented yet";
2022-11-19 21:27:51 +00:00
}
#-------------------------------------------------------------------------------
# Report all or selected database contents if requested
#-------------------------------------------------------------------------------
if ( defined($report) ) {
if ( defined($search_target) ) {
#
# Reporting a specific title
#
my @matches = searchTitle( $dbh, $search_target, $ignore_case );
2022-11-19 21:27:51 +00:00
if (@matches) {
#
# Too many matches!
#
if ( $#matches > 1 ) {
say "Multiple matches:\n";
say "- ", join( "\n- ", @matches );
say "\nTry again with a different title";
}
else {
#
# One match, report it
#
reportFeed( findFeed( $dbh, $matches[0] ), $outfh );
}
}
else {
say "Feed not found with target '$search_target'";
}
}
else {
#
# Reporting everything
#
reportDB( $dbh, $outfh );
$LOG->info("Report generated");
}
}
#
# Do any image repairs we need, but temporarily
#
if ($json || $opml || $template) {
$feeds = collectData($dbh);
#
# Clean up the image references (until we have a proper fix)
#
for (my $i = 0; $i < scalar(@$feeds); $i++) {
my $f = $feeds->[$i];
if (defined($f->{urls_image}) && $f->{urls_image} =~ /^ARRAY\([^)]+\)/) {
$f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
$LOG->info( "Fixed image for $f->{urls_url}" );
}
}
}
#-------------------------------------------------------------------------------
# Output all feeds to a JSON file if requested
#-------------------------------------------------------------------------------
if ($json) {
my $js = JSON->new->utf8->canonical->pretty;
open( my $out, '>:encoding(UTF-8)', $json )
or die "Unable to open output file $json $!\n";
print $out $js->encode($feeds), "\n";
close($out);
emit ( $silent, "JSON data written to $json\n" );
$LOG->info( "JSON data written to $json" );
}
#-------------------------------------------------------------------------------
# Requesting the -opml option means to dump the entire feed table as OPML
#-------------------------------------------------------------------------------
if ($opml) {
require XML::OPML;
open( my $out, '>:encoding(UTF-8)', $opml )
or die "Unable to open output file $opml $!\n";
#
# Start building OPML. Mandatory attributes are 'type', 'text' and
# 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
# attribute is poorly defined; this module uses 'rss' as the type and
# 'RSS' as the value of the 'version' attribute. This is not linked to the
# type of the feed.
#
my $OPML = XML::OPML->new(version => '1.1');
#
# Create the 'head' element
#
my $DT = normaliseDT(DateTime->now());
$OPML->head(
title => 'Free Culture Podcasts',
dateCreated => $DT,
dateModified => $DT,
);
#
# Walk the feeds generating OPML
#
for (my $i = 0; $i < scalar(@$feeds); $i++) {
my $f = $feeds->[$i];
$OPML->add_outline(
title => $f->{urls_title},
text => $f->{urls_title},
description => $f->{urls_description},
xmlUrl => $f->{urls_url},
htmlUrl => $f->{urls_link},
);
}
#
# Output OPML to the nominated file
#
print $out $OPML->as_string;
close($out);
emit ( $silent, "OPML data written to $opml\n" );
$LOG->info( "OPML data written to $opml" );
}
#-------------------------------------------------------------------------------
# Fill and print the TT² template if requested
2022-11-19 21:27:51 +00:00
#-------------------------------------------------------------------------------
if ($template) {
my $tt = Template->new(
{ ABSOLUTE => 1,
ENCODING => 'utf8',
INCLUDE_PATH => $basedir,
}
);
#
# Make the structure the template needs
#
my $vars = {
feeds => $feeds,
};
# print Dumper($vars),"\n";
my $document;
$tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
|| die $tt->error(), "\n";
print $outfh $document;
emit ( $silent, "Data processed with $template\n" );
$LOG->info( "Data processed with $template" );
}
#$dbh->disconnect;
if ($rejectfile) {
emit( $silent,
"Number of rejected URLs written to $rejectfile is $rejectcount\n" );
}
2022-11-19 21:27:51 +00:00
exit;
#=== FUNCTION ================================================================
# NAME: loadUrls
# PURPOSE: To load URLs read from the input file (and the arguments) into
# the database
2022-11-19 21:27:51 +00:00
# PARAMETERS: $dbh database handle
# $new_urls arrayref containing URLs
# $rules WWW::RobotRules object
# $keymap hashref containing a map of key names to
# database field names
# $dry_run Boolean, set if in dry-run mode
2022-11-19 21:27:51 +00:00
# RETURNS: Any new URLs discovered by investigating non-feed URLs.
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub loadUrls {
my ( $dbh, $new_urls, $rules, $keymap, $dry_run ) = @_;
2022-11-19 21:27:51 +00:00
my ( $stream, $feed );
my ( %uridata, $roboturl, @found_urls );
#
# Work through the list of URLs
#
foreach my $rec (@$new_urls) {
2022-11-19 21:27:51 +00:00
%uridata = ();
#
# By default save the collected data
#
$uridata{SAVE} = 1;
#
# Check that we have a valid URL. We don't save them if they are
# invalid ($uridata{SAVE} is set to 0 in the routine).
#
my $uri = validateURI($rec,\%uridata);
next unless $uri;
#
# Check whether we already have this URI in the database
#
if (dbSearch(
$dbh, 'SELECT * FROM urls WHERE url = ?',
$uridata{URI}
)
)
{
emit( $silent, "$uri is already in the database\n" );
$uridata{SAVE} = 0;
next;
}
#
# Check the hostname resolves in DNS
#
if ( checkDNS( $uri, \%uridata ) ) {
$uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
emit( $silent, "DNS: $uridata{DNS}\n" );
}
else {
emit( $silent, "$uri has no DNS entry\n" );
$uridata{SAVE} = 0;
next;
}
#
# Check the server is available. Don't save if it's not (probably will
# not get the feed anyway).
#
if ( serverUp( $uri, \%uridata ) ) {
emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
}
else {
emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
$uridata{SAVE} = 0;
next;
}
#
# Check the site's robots.txt. If there's a block then don't save.
#
if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
emit( $silent, "Check of robots.txt rules succeeded\n" );
}
else {
if ( $uridata{ROBOTS} =~ /404/ ) {
emit( $silent, "Search for robots.txt file failed\n" );
}
else {
emit( $silent, "Check of robots.txt rules blocks access\n" );
2022-11-19 21:27:51 +00:00
$uridata{SAVE} = 0;
next;
}
}
#
# Look for the HTTP content type. Don't save if the request failed.
#
if ( checkContentType( $uri, \%uridata, \%headers, \@found_urls, $LOG ) ) {
emit( $silent, "HTTP request to check type OK\n" );
2022-11-19 21:27:51 +00:00
}
else {
emit( $silent, "HTTP request failed\n" );
$uridata{SAVE} = 0;
next;
}
#
# Parse the feed
#
if ( $uridata{TYPE} eq 'Feed' ) {
$LOG->info('Processing feed: ',$uridata{URI});
$stream = getFeed( $uridata{URI} );
if ($stream) {
$feed = parseFeed( $uridata{URI}, $stream );
unless ( $feed ) {
$uridata{SAVE} = 0;
emit( $silent, "Feed did not parse $uridata{URI}\n" );
$LOG->warning('Feed did not parse: ',$uridata{URI});
2022-11-19 21:27:51 +00:00
next;
}
#
# Provide a means of examining the returned feed object. There
# are some weird behaviours in there (partly because of the
# weirdness of RSS and poor adherence to what standards there
# are).
#
_debug( $DEBUG > 2, Dumper($feed));
2022-11-19 21:27:51 +00:00
storeFeed($feed,\%uridata);
#
# Perform a check on the copyright. The routine sets
2022-11-19 21:27:51 +00:00
# $uridata{SAVE} = 0 if the copyright is not acceptable.
#
2023-01-24 22:44:11 +00:00
$uridata{CHECKTYPE} = $check;
if ( $check ne 'none' ) {
unless (checkCopyright( $check, \%uridata )) {
#
# Rejected, write URL to a file if requested
#
if ($rejectfile) {
printf $rejectfh "%s\n", $uridata{URI};
$rejectcount++;
}
next;
}
2022-11-19 21:27:51 +00:00
}
}
else {
emit($silent, "Problem collecting feed");
$uridata{SAVE} = 0;
next;
}
}
elsif ( $uridata{TYPE} eq 'HTML' ) {
#
# TODO Save the URL and find any feeds it contains
#
if (@found_urls) {
emit( $silent,
"Type $uridata{TYPE} contained "
. scalar(@found_urls)
2023-01-10 20:22:47 +00:00
. " feeds; queued to be checked for inclusion\n" );
2022-11-19 21:27:51 +00:00
}
else {
emit( $silent,
"Type $uridata{TYPE} contained no feeds; skipped\n" );
}
next;
}
else {
emit( $silent,
"Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
$uridata{SAVE} = 0;
next;
}
}
continue {
# { This 'continue' block is executed for each iteration or 'next' }
#
# Decide whether to save what we have collected
#
if ( $uridata{SAVE} ) {
if ( addURI( $dbh, \%uridata, $keymap, $dry_run ) ) {
2022-11-19 21:27:51 +00:00
emit( $silent, "$uridata{URI} added to the database\n" );
$LOG->info("$uridata{TYPE} ",$uridata{URI},' added to the database');
#
# Get the id the database allocated for the row we added
#
$uridata{URI_ID}
= $dbh->last_insert_id( undef, undef, 'urls', undef );
#
# Add any enclosures we found
#
if ( defined( $uridata{ENCLOSURE_COUNT} )
&& $uridata{ENCLOSURE_COUNT} > 0 )
{
if ( addEnclosures( $dbh, \%uridata, $dry_run ) ) {
2022-11-19 21:27:51 +00:00
emit( $silent, $uridata{ENCLOSURE_COUNT},
" enclosures for $uridata{URI} added to the database\n"
);
}
}
}
else {
emit( $silent,
"$uridata{URI} was not added to the database\n" );
$LOG->info("$uridata{TYPE} ",$uridata{URI},' not added to the database');
}
}
#
# Dump what we have if requested
#
_debug( $DEBUG > 1, Dumper( \%uridata ) );
2022-11-19 21:27:51 +00:00
emit( $silent, '-' x 80, "\n" );
}
#
# Return any further urls we found
#
return @found_urls;
}
#=== FUNCTION ================================================================
# NAME: searchTitle
# PURPOSE: Search the database for a feed with a given title
# PARAMETERS: $dbh database handle
# $target search target
# $ignore_case Boolean controlling whether it's a caseles
# search
2022-11-19 21:27:51 +00:00
# RETURNS: A list of titles
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub searchTitle {
my ($dbh, $target, $ignore_case) = @_;
2022-11-19 21:27:51 +00:00
my ( $sql1, $sql2, $sth, $rv, $h );
my ( $count, @result );
$sql1 = q{
SELECT count(*) AS count
FROM urls
WHERE urltype = 'Feed'
AND title REGEXP ?
};
$sql2 = q{
SELECT title
FROM urls
WHERE urltype = 'Feed'
AND title REGEXP ?
ORDER BY title
};
#
# Handle caseless searches
#
$target = ($ignore_case ? '(?i)' : '') . $target;
2022-11-19 21:27:51 +00:00
#
# Count the number of matches
#
$sth = $dbh->prepare($sql1);
$rv = $sth->execute($target);
2022-11-19 21:27:51 +00:00
if ( $dbh->err ) {
warn $dbh->errstr;
return;
}
#
# Proceed if we have 1 or more matches
#
if ( $h = $sth->fetchrow_hashref ) {
$count = $h->{count};
$sth->finish;
if ( $count >= 1 ) {
$sth = $dbh->prepare($sql2);
$rv = $sth->execute($target);
2022-11-19 21:27:51 +00:00
if ( $dbh->err ) {
warn $dbh->errstr;
return;
}
while ( $h = $sth->fetchrow_hashref ) {
push(@result, $h->{title});
}
}
return @result;
}
else {
return;
}
}
2023-01-24 22:44:11 +00:00
#=== FUNCTION ================================================================
# NAME: refreshFeeds
# PURPOSE: To refresh the episodes on all feeds
# PARAMETERS: $dbh database handle
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub refreshFeeds {
my ($dbh) = @_;
my ( $sql1, $sth1, $rv1, $h1 );
my ( $aref, @urls, $DT, $stream, $feed );
my ( %uridata, $encref, $enc_changes );
#
# Query to return all feed URLs
#
$sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};
$sth1 = $dbh->prepare($sql1);
$rv1 = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
#
# Collect everything as an arrayref pointing to a bunch of arrayrefs
# containing the column details requested
#
$aref = $sth1->fetchall_arrayref;
#
# Extract just the URL strings
#
@urls = map { $_->[1] } @{$aref};
#
# Loop through the feed URLs
#
foreach my $url (@urls) {
#
# Get the feed as XML
#
$stream = getFeed($url);
next unless $stream;
#
# Parse the feed as an XML::Feed object
#
$feed = parseFeed($url,$stream);
next unless $feed;
#
# Turn the enclosures in the feed into an array of anonymous hashes
#
$encref = extractEnclosures($feed);
}
}
2022-11-19 21:27:51 +00:00
#=== FUNCTION ================================================================
# NAME: scanDB
# PURPOSE: To scan the URLs in the database and update the stored data
# PARAMETERS: $dbh database handle
# $keymap hashref containing a map of key names to
# database field names
# $dry_run Boolean, set if in dry-run mode
2022-11-19 21:27:51 +00:00
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub scanDB {
my ($dbh, $keymap, $dry_run) = @_;
2022-11-19 21:27:51 +00:00
# TODO: dry-run mode not implemented here yet
2022-11-19 21:27:51 +00:00
my ( $sql1, $sth1, $rv1, $h1 );
my ( $aref, @urls, $DT, $stream, $feed );
my ( %uridata, $urichanges, $enc_changes );
#
# Query to return all feed URLs
#
$sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};
$sth1 = $dbh->prepare($sql1);
$rv1 = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
#
# Collect everything as an arrayref pointing to a bunch of arrayrefs
# containing the column details requested
#
$aref = $sth1->fetchall_arrayref;
#
# Extract just the URL strings
#
@urls = map { $_->[1] } @{$aref};
#
# Now process these URLs from the database one at a time
#
# $sql1 = q{SELECT * FROM urls WHERE url = ?};
# $sth1 = $dbh->prepare($sql1);
my $count = 0;
foreach my $url (@urls) {
%uridata = ();
$count++;
scanFeed($dbh,$url,\%uridata);
# #
# # Record the scan for this URL
# #
# $DT = normaliseDT( DateTime->now() );
# $uridata{SCANNED_ON} = $DT;
#
# #
# # Default to OK
# #
# $uridata{SCAN_OK} = 1;
#
# emit( $silent, "Scanning '$url'\n" );
# $rv1 = $sth1->execute($url);
# if ( $dbh->err ) {
# warn $dbh->errstr;
# }
# $h1 = $sth1->fetchrow_hashref;
# emit( $silent, $h1->{title}, "\n" );
#
# #
# # The URL should be valid already from the point at which it was added
# # to the database, but if we do this we get a canonical version (it
# # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
# # it. It also sets $uridata{URI}, which is useful.)
# #
# my $uri = validateURI( $url, \%uridata );
#
# #
# # Check the hostname resolves in DNS
# #
# if ( checkDNS( $uri, \%uridata ) ) {
# $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
# emit( $silent, "DNS: $uridata{DNS}\n" );
# }
# else {
# emit( $silent, "$uri has no DNS entry\n" );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Check the server is available
# #
# if ( serverUp( $uri, \%uridata ) ) {
# emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
# }
# else {
# emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Look for the HTTP content type.
# #
# if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
# emit( $silent, "HTTP request OK\n" );
# }
# else {
# emit( $silent, "HTTP request failed\n" );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Note: not doing the robots.txt check since it was done at load time
# #
#
# #
# # We know we have a feed, so go get it
# #
# $stream = getFeed( $uridata{URI} );
# if ($stream) {
# #
# # Parse the feed. The routine generates its own error messages
# #
# $feed = parseFeed( $uridata{URI}, $stream );
# unless ( $feed ) {
# $uridata{SCAN_OK} = 0;
# next;
# }
#
# #
# # Save the important feed components in the %uridata hash
# #
# print Dumper($feed), "\n" if ( $DEBUG > 2 );
# storeFeed( $feed, \%uridata );
# }
# else {
# emit( $silent, "Problem collecting feed" );
# $uridata{SCAN_OK} = 0;
# next;
# }
#
}
continue {
# { This 'continue' block is executed for each iteration or 'next' }
#
# If the scan went OK then perform a comparison between the new feed
# data and that which is stored
#
if ( $uridata{SCAN_OK} ) {
$urichanges = updateURI( $dbh, \%uridata, $keymap );
$LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
$urichanges )
if $urichanges > 0;
# TODO Update enclosures
}
else {
emit( $silent, "$uridata{URI} was not scanned successfully\n" );
$LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
}
#
# Finished scanning this feed
#
emit( $silent, '-' x 80, "\n" );
#
# NOTE: Temporarily stop after the first N feeds
#
last if $count == 2;
}
return;
}
#=== FUNCTION ================================================================
# NAME: scanFeed
# PURPOSE: Performs a scan on a single feed
# PARAMETERS: $dbh database handle
# $url feed URL to look up
# $uridata hashref to the hash of data collected from the
# feed
# RETURNS: True (1) if all the steps worked, otherwise false (0)
2022-11-19 21:27:51 +00:00
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub scanFeed {
my ( $dbh, $url, $uridata ) = @_;
my ( $sql, $sth, $rv, $h );
my ( $DT, $stream, $feed );
my ( $urichanges, $enc_changes );
$sql = q{SELECT * FROM urls WHERE url = ?};
$sth = $dbh->prepare($sql);
#
# Record the scan for this URL
#
$DT = normaliseDT( DateTime->now() );
$uridata->{SCANNED_ON} = $DT;
#
# Default to OK
#
$uridata->{SCAN_OK} = 1;
emit( $silent, "Scanning '$url'\n" );
$rv = $sth->execute($url);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h = $sth->fetchrow_hashref;
emit( $silent, $h->{title}, "\n" );
#
# The URL should be valid already from the point at which it was added
# to the database, but if we do this we get a canonical version (it
# sets $uridata->{SAVE} which makes no sense here, but we'll just ignore
# it. It also sets $uridata->{URI}, which is useful.)
#
my $uri = validateURI( $url, $uridata );
#
# Check the hostname resolves in DNS
#
if ( checkDNS( $uri, $uridata ) ) {
$uridata->{DNS} = join( ", ", @{ $uridata->{DNS} } );
emit( $silent, "DNS: $uridata->{DNS}\n" );
}
else {
emit( $silent, "$uri has no DNS entry\n" );
$uridata->{SCAN_OK} = 0;
return 0;
}
#
# Check the server is available
#
if ( serverUp( $uri, $uridata ) ) {
emit( $silent, sprintf( "Host: %s is up\n", $uridata->{HOST} ) );
}
else {
emit( $silent, sprintf( "Host: %s is down\n", $uridata->{HOST} ) );
$uridata->{SCAN_OK} = 0;
return 0;
}
#
# Look for the HTTP content type.
#
if ( checkContentType( $uri, $uridata, \%headers, $LOG ) ) {
emit( $silent, "HTTP request OK\n" );
}
else {
emit( $silent, "HTTP request failed\n" );
$uridata->{SCAN_OK} = 0;
return 0;
}
#
# Note: not doing the robots.txt check since it was done at load time
#
#
# We know we have a feed, so go get it
#
$stream = getFeed( $uridata->{URI} );
if ($stream) {
#
# Parse the feed. The routine generates its own error messages
#
$feed = parseFeed( $uridata->{URI}, $stream );
unless ( $feed ) {
$uridata->{SCAN_OK} = 0;
emit( $silent, "Feed did not parse $uridata->{URI}\n" );
$LOG->warning('Feed did not parse: ',$uridata->{URI});
2022-11-19 21:27:51 +00:00
return 0;
}
#
# Save the important feed components in the %uridata hash
#
_debug( $DEBUG > 2, Dumper( $feed ) );
2022-11-19 21:27:51 +00:00
storeFeed( $feed, $uridata );
}
else {
emit( $silent, "Problem collecting feed" );
$uridata->{SCAN_OK} = 0;
return 0;
}
return 1;
}
#=== FUNCTION ================================================================
# NAME: reportDB
# PURPOSE: To generate a printed report from the database
# PARAMETERS: $dbh database handle
# $fh output file handle
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: reportFeed generates a standardised report
# SEE ALSO: N/A
#===============================================================================
sub reportDB {
my ($dbh, $fh) = @_;
#
# Query to report the contents of the 'all_episodes' view with the latest
# associated episode
#
my $sql = q{
SELECT urls.id, ae.*,
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
FROM all_episodes ae
JOIN urls ON (ae.urls_id = urls.id)
GROUP by urls.id
HAVING ae.urls_urltype = 'Feed'
ORDER BY urls_title
};
my $sth1 = $dbh->prepare($sql);
my $rv = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
while ( my $h1 = $sth1->fetchrow_hashref() ) {
reportFeed( $h1, $fh );
}
$sth1->finish;
return;
}
#=== FUNCTION ================================================================
# NAME: findFeed
# PURPOSE: Find and return a single feed by title
# PARAMETERS: $dbh database handle
# $target search target
# RETURNS: Hashref indexing the feed
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub findFeed {
my ( $dbh, $target ) = @_;
my ( $sql, $sth, $rv, $h );
#
# Query to report the contents of the 'all_episodes' view with the latest
# associated episode
#
$sql = q{
SELECT urls.id, ae.*,
max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
FROM all_episodes ae
JOIN urls ON (ae.urls_id = urls.id)
GROUP by urls.id
HAVING ae.urls_urltype = 'Feed'
AND ae.urls_title = ?
};
$sth = $dbh->prepare($sql);
$rv = $sth->execute($target);
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h = $sth->fetchrow_hashref();
return $h;
}
#=== FUNCTION ================================================================
# NAME: reportFeed
# PURPOSE: Reports a single feed from the database
# PARAMETERS: $feed a hash from the database containing the
# details of a particular feed
# $fh output file handle
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub reportFeed {
my ($feed, $fh) = @_;
my $lwidth = 15;
2022-11-19 21:27:51 +00:00
#
# Hash for converting database keys to labels for the report and arrays
# for controlling the sequence.
#
state ( %keys, @seq1, @seq2 );
%keys = (
'id' => 'DB Primary key',
'latest_ep' => 'Latest episode date',
'ep_author' => 'Author',
'ep_byte_length' => 'Byte length',
'ep_category' => 'Category',
'ep_enclosure' => 'URL',
'ep_ep_id' => 'ID',
'ep_id' => 'Key',
'ep_issued' => 'Issued on',
'ep_last_update' => 'Last updated',
'ep_link' => 'Permalink',
'ep_mime_type' => 'MIME type',
'ep_modified' => 'Modified on',
'ep_source' => 'Source',
'ep_title' => 'Title',
'ep_urls_id' => 'URL key',
'urls_author' => 'Author',
'urls_check_type' => 'Check type',
'urls_content_type' => 'Content type',
'urls_copyright' => 'Copyright',
'urls_description' => 'Description',
'urls_dns' => 'DNS',
'urls_feedformat' => 'Feed format',
'urls_generator' => 'Generator',
'urls_host_up' => 'Host up',
'urls_http_status' => 'HTTP status',
'urls_id' => 'Key',
'urls_image' => 'Image',
'urls_language' => 'Language',
'urls_last_update' => 'Last updated',
'urls_link' => 'Link',
'urls_modified' => 'Modified on',
'urls_reason_accepted' => 'Reason accepted',
'urls_status' => 'Status',
'urls_summary' => 'Summary',
'urls_title' => 'Title',
'urls_url' => 'Feed URL',
'urls_urltype' => 'URL type',
'urls_parent_id' => 'Parent ID',
'urls_child_count' => 'Child count',
2022-11-19 21:27:51 +00:00
);
2023-01-24 22:44:11 +00:00
2022-11-19 21:27:51 +00:00
@seq1 = (
'urls_title',
'urls_url',
'urls_feedformat',
'urls_link',
'urls_author',
'urls_content_type',
'urls_copyright',
'urls_check_type',
'urls_reason_accepted',
2022-11-19 21:27:51 +00:00
'urls_description',
'urls_summary',
2022-11-19 21:27:51 +00:00
'urls_dns',
'urls_generator',
'urls_host_up',
'urls_http_status',
'urls_image',
'urls_language',
'urls_last_update',
'urls_modified',
'urls_parent_id',
'urls_child_count',
'urls_status',
2022-11-19 21:27:51 +00:00
);
2023-01-24 22:44:11 +00:00
2022-11-19 21:27:51 +00:00
@seq2 = (
'ep_title',
'ep_enclosure',
'ep_category',
'ep_link',
'ep_author',
'ep_byte_length',
'ep_ep_id',
'ep_mime_type',
'ep_issued',
'ep_last_update',
'ep_modified',
'ep_source',
);
if ($feed) {
print $fh "Channel:\n";
foreach my $key (@seq1) {
2023-01-24 22:44:11 +00:00
#
# Format the feed description, summary and copyright with a left
# margin using textFormat. Everything else gets a simpler layout.
2023-01-24 22:44:11 +00:00
#
if ($key =~ /^urls_(description|summary|copyright)$/) {
2023-01-24 22:44:11 +00:00
printf $fh "%s\n",
textFormat(
coalesce( $feed->{$key}, '--' ),
sprintf( " %-*s:", $lwidth, $keys{$key} ),
'L', $lwidth + 4, 80
);
}
else {
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
coalesce( $feed->{$key}, '--' );
}
2022-11-19 21:27:51 +00:00
}
print $fh "\nLatest episode:\n";
foreach my $key (@seq2) {
printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
coalesce( $feed->{$key}, '--' );
}
print $fh "\n";
}
return;
}
#=== FUNCTION ================================================================
# NAME: collectData
# PURPOSE: Collects data from the database for generating a report
# PARAMETERS: $dbh database handle
# RETURNS: An array of hashrefs from the query
# DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
# table and the 'all_episodes' view, showing the date of the
# latest episode. The result is an array of rows, each
# represented as a hash, all sorted by the feed title.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub collectData {
my ($dbh) = @_;
#
# Query to report only the feeds from the contents of the 'urls' table
# with the details of the latest episode
2022-11-19 21:27:51 +00:00
#
my $sql = q{
SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
FROM all_episodes ae
JOIN urls ON (ae.urls_id = urls.id)
GROUP by urls.id
HAVING ae.urls_urltype = 'Feed'
ORDER BY ae.urls_title
};
my $sth1 = $dbh->prepare($sql);
my $rv = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
#
# Return everything as an arrayref of hashrefs
#
my $tbl_ary_ref = $sth1->fetchall_arrayref({});
$sth1->finish;
return $tbl_ary_ref;
}
#=== FUNCTION ================================================================
# NAME: countRows
# PURPOSE: To count the rows in a table
# PARAMETERS: $dbh database handle
# $sql SQL expression to use
# RETURNS: Number of rows found (note that zero is returned as 0E0)
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub countRows {
my ( $dbh, $sql ) = @_;
my $sth1 = $dbh->prepare($sql);
my $rv = $sth1->execute();
if ( $dbh->err ) {
warn $dbh->errstr;
}
my $h1 = $sth1->fetch;
$sth1->finish;
return @$h1[0];
}
#=== FUNCTION ================================================================
# NAME: dbSearch
# PURPOSE: To perform a simple search in the database
# PARAMETERS: $dbh database handle
# $sql SQL expression to use (expected to be
# a SELECT)
# @args arguments for the 'execute'
# RETURNS: True (1) if the row exists, otherwise false (0).
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
# SQL without incurring the overhead of repeated 'prepare'
# calls. Only the first row is fetched (we expect there to be
# only one) and the success or failure is determined by its
# existence.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub dbSearch {
my ( $dbh, $sql, @args ) = @_;
my $sth1 = $dbh->prepare_cached($sql);
my $rv = $sth1->execute(@args);
if ( $dbh->err ) {
warn $dbh->errstr;
}
my $h1 = $sth1->fetchrow_hashref();
$sth1->finish;
return defined($h1);
}
#=== FUNCTION ================================================================
# NAME: execSQL
# PURPOSE: To perform a non-SELECT query
# PARAMETERS: $dbh database handle
# $sql SQL expression to use
# $dry_run Boolean, set if in dry-run mode
2022-11-19 21:27:51 +00:00
# @args arguments for the 'execute'
# RETURNS: True (1) if the query succeeded, otherwise false (0).
# DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
# SQL without incurring the overhead of repeated 'prepare'
# calls.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub execSQL {
my ( $dbh, $sql, $dry_run, @args ) = @_;
2022-11-19 21:27:51 +00:00
my ( $sth1, $rv );
if ($dry_run) {
emit($silent,"Dry-run: Would have run SQL '$sql'\n");
return;
}
$sth1 = $dbh->prepare_cached($sql);
try {
$rv = $sth1->execute(@args);
2022-11-19 21:27:51 +00:00
}
catch {
warn "Problem with query '$sql'\n";
if ( $dbh->err ) {
warn '** ' . $dbh->errstr;
}
};
2022-11-19 21:27:51 +00:00
$sth1->finish;
return $rv;
}
#=== FUNCTION ================================================================
# NAME: validateURI
# PURPOSE: Checks a URL for validity
# PARAMETERS: $rec the URL handed to the script
# $uridata hashref containing data for this URI
# RETURNS: A URI object if valid otherwise undef
# DESCRIPTION: The URL string is validated with the URI module. A canonical
# string version is stored in the hash referenced by $uridata
# hash if valid otherwise the URL is marked as invalid.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub validateURI {
my ( $rec, $uridata ) = @_;
#
# Check that we have a valid URL. We don't save them if they are
# invalid.
#
my $uri = URI->new( $rec, 'http' );
if ( $uri->scheme ) {
emit( $silent, "URI $uri is valid\n" );
$uridata->{URI} = $uri->canonical->as_string;
return $uri;
}
else {
emit( $silent, "URI $uri is not valid\n" );
$uridata->{SAVE} = 0;
return;
}
}
#=== FUNCTION ================================================================
# NAME: checkDNS
# PURPOSE: Looks up a host DNS entry
# PARAMETERS: $uri URI object
# $uridata hashref containing data for this URI
# RETURNS: True (1) if the DNS query was successful, otherwise false (0)
# DESCRIPTION: The host name is extracted from the URI (and stored). The
# hostname is searched for in the DNS and if successful, an
# array of addresses from the 'A' records is built. This is
# sorted and stored in the hash referenced by $uridata.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkDNS {
my ( $uri, $uridata ) = @_;
my @adds;
my $hostname = $uri->host;
$uridata->{HOST} = $hostname;
my $res = Net::DNS::Resolver->new;
my $query = $res->search($hostname);
if ($query) {
foreach my $rr ( $query->answer ) {
next unless $rr->type eq "A";
push( @adds, $rr->address );
}
@adds = sort(@adds);
$uridata->{DNS} = \@adds;
return 1;
}
else {
warn "Query failed: ", $res->errorstring, "\n";
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: serverUp
# PURPOSE: Checks that a given host is responding
# PARAMETERS: $uri URI object
# $uridata hashref containing data for this URI
# RETURNS: True (1) if the host responds to a TCP connection, false (0)
# otherwise
# DESCRIPTION: Given an URL parses out the hostname and the port (defaulting
# to the appropriate default for the scheme, such as 80 for
# http). Attempts to connect to this host and port. If the
# connect fails then details are written to the data structure
# pointed to by $uridata.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub serverUp {
my ( $uri, $uridata ) = @_;
my ( $hostname, $port, $socket );
$hostname = $uri->host();
$port = $uri->port();
$uridata->{PORT} = $port;
$socket = IO::Socket::INET->new(
Proto => "tcp",
PeerAddr => $hostname,
PeerPort => $port,
Reuse => 1,
Timeout => 10
);
if ($socket) {
$socket->close;
$uridata->{HOSTUP} = 1;
return 1;
}
else {
$uridata->{HOSTUP} = 0;
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: robotRulesOK
# PURPOSE: To check the intended URL against the site's robots.txt rules
# PARAMETERS: $uri URI object
# $rules WWW::RobotRules object
# $uridata hashref containing data for this URI
# RETURNS: True (1) if the GET of the robots.txt file succeeded and the
# rules allow the URI object to be fetched, false (0) otherwise.
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub robotRulesOK {
my ( $uri, $rules, $uridata ) = @_;
#
# Build the URL of robots.txt
#
my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';
my $robots_txt;
#
# Use LWP::UserAgent to get the feed and handle errors
#
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->agent("$PROG/$VERSION");
my $response = $ua->get($roboturl);
if ( $response->is_success ) {
$uridata->{ROBOTS} = 'Found';
$robots_txt = $response->decoded_content;
$rules->parse( $roboturl, $robots_txt );
return $rules->allowed("$uri");
}
else {
$uridata->{ROBOTS} = $response->status_line;
warn "Failed to get $roboturl\n";
warn $response->status_line . "\n";
return; # undef
}
}
#=== FUNCTION ================================================================
# NAME: checkContentType
# PURPOSE: Check the content_type of the url
# PARAMETERS: $uri URI object
# $uridata hashref containing data for this URI
# $headers hashref containing query headers
# $children arrayref to return any 'child' feeds we might
# find; they are appended to the array
# $log Log::Handler object
# RETURNS: True (1) if all was well, otherwise false (0)
# DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
# for the URI object using the LWP::UserAgent head method. Then
# we examine the 'content-type' header looking for the string
# 'xml' or 'html' in it. The former denotes a feed, and the
# latter a normal HTML page.
2022-11-20 22:49:57 +00:00
# We use Feed::Find if the page is HTML in order to find any
2022-11-19 21:27:51 +00:00
# "child" feeds which we really want to process if we find any.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkContentType {
my ( $uri, $uridata, $headers, $children, $log ) = @_;
my @feeds;
my $content;
2022-11-19 21:27:51 +00:00
$uridata->{HTTP_STATUS} = 'Unknown';
my $browser = LWP::UserAgent->new or return 0;
$browser->timeout(10);
$browser->agent("$PROG/$VERSION");
2022-11-19 21:27:51 +00:00
#my $response = $browser->head( $uri->as_string, %{$headers} )
my $response = $browser->get( $uri->as_string, %{$headers} )
2022-11-19 21:27:51 +00:00
or return 0;
$uridata->{HTTP_STATUS} = $response->status_line;
if ( $response->is_success ) {
$uridata->{HTTP_CONTENT_TYPE} = $response->header('content-type');
#
# Decode the content-type we received
#
if ( $uridata->{HTTP_CONTENT_TYPE} =~ m|xml|i ) {
$uridata->{TYPE} = 'Feed';
}
elsif ( $uridata->{HTTP_CONTENT_TYPE} =~ m|html|i ) {
$uridata->{TYPE} = 'HTML';
}
else {
$uridata->{TYPE} = 'Unknown';
}
#
# Sometimes perfectly valid feeds misleadingly return text/html so we
# try to spot such cases here and adjust the internal type accordingly
#
if ( $uridata->{TYPE} eq 'HTML' ) {
@feeds = Feed::Find->find( $uri->as_string );
if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
emit( $silent, "Feed found with wrong content-type\n" );
$uridata->{TYPE} = 'Feed';
}
else {
#
# This is HTML and we found 'child' feeds of some kind
2022-11-19 21:27:51 +00:00
#
emit( $silent, "Found ", scalar(@feeds),
" feeds within this HTML page\n" );
_debug( $DEBUG > 0, Dumper( \@feeds ) );
2022-11-19 21:27:51 +00:00
push(@{$children}, @feeds);
}
}
#
# TODO: Get the title for an HTML page by parsing the source
#
if ( $uridata->{TYPE} eq 'HTML' ) {
unless ($uridata->{TITLE}) {
$content = $response->decoded_content;
$uridata->{TITLE} = getHTMLTitle($uri->as_string, $content);
}
}
2022-11-19 21:27:51 +00:00
$log->info( "URL content classified as: ", $uridata->{TYPE} );
emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
return 1;
}
else {
$uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: getURL
# PURPOSE: Download the contents of an URL
# PARAMETERS: $url URL of the page to download
# RETURNS: String representation of the contents or undef if the
# download failed.
# DESCRIPTION: Issues a GET on the URL. If successful the contents are
# decoded and returned, otherwise undef is returned.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub getURL {
my ($url) = @_;
#
# Use LWP::UserAgent to get the feed and handle errors
#
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->agent("$PROG/$VERSION");
my $response = $ua->get($url);
my $content;
if ( $response->is_success ) {
$content = $response->decoded_content;
return $content;
}
else {
warn "Failed to get $url\n";
warn $response->status_line, "\n";
return; # undef
}
}
2022-11-19 21:27:51 +00:00
#=== FUNCTION ================================================================
# NAME: getFeed
# PURPOSE: Download the contents of a feed URL
# PARAMETERS: $feed_url URL of the feed to download
# RETURNS: String representation of the feed contents or undef if the
# download failed.
# DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
# need not be). If successful the contents are decoded and
# returned, otherwise undef is returned.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub getFeed {
my ($feed_url) = @_;
#
# Use LWP::UserAgent to get the feed and handle errors
#
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->agent("$PROG/$VERSION");
my $response = $ua->get($feed_url);
my $feed_content;
if ( $response->is_success ) {
$feed_content = $response->decoded_content;
return $feed_content;
}
else {
warn "Failed to get $feed_url\n";
warn $response->status_line, "\n";
return; # undef
}
}
#=== FUNCTION ================================================================
# NAME: getHTMLTitle
# PURPOSE: Parse an HTML page to get data. At the moment this is just the
# title in the header section.
# PARAMETERS: $url URL of the page; only used for messages and as
# a fallback title.
# $content Decoded content of the HTML page
# RETURNS: The title, if there is one, otherwise the URL
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub getHTMLTitle {
my ($url, $content) = @_;
#
# Is this effectively a local subroutine?
#
my $start_handler = sub {
#
# Ignore any tags which are not 'title'
#
return if shift ne "title";
#
# Define more handlers if we have a title. One to collect the title string
# and the other to abort the parse on encountering the end of the title.
#
my $self = shift;
$self->handler(text => sub { $main::html_title = shift }, "dtext");
$self->handler(
end => sub {
shift->eof if shift eq "title";
},
"tagname,self"
);
};
#
# Create the HTML::Parser object
#
my $p = HTML::Parser->new(api_version => 3);
#
# Define a 'start' handler. When called with 'title' as the 'tagname' it
# creates a 'text' event handler to print the text and an 'end' handler to
# quit when the closing '</title' tag is found.
#
#$p->handler(start => \&start_handler, "tagname,self");
$p->handler(start => $start_handler, "tagname,self");
#
# Invoke the parser on the string containing the returned HTML.
#
$p->parse($content);
return $main::html_title // $url;
}
2022-11-19 21:27:51 +00:00
#=== FUNCTION ================================================================
# NAME: parseFeed
# PURPOSE: Parse a podcast feed that has already been downloaded
# PARAMETERS: $feed_url URL of the feed previously downloaded
# $feed_content String containing the content of the feed, for
# parsing
# RETURNS: An XML::Feed object containing the parsed feed or undef if the
# parse failed
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub parseFeed {
my ( $feed_url, $feed_content ) = @_;
my $feed;
#
# Catch errors, returning from the subroutine with 'undef' if any are
# triggered.
#
try {
$feed = XML::Feed->parse( \$feed_content );
unless ($feed) {
#
# Something went wrong. Abort this feed
#
warn "Warning: Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
return; # undef
}
2022-11-19 21:27:51 +00:00
}
catch {
warn "Warning: Failed to parse $feed_url: $_\n";
return;
};
2022-11-19 21:27:51 +00:00
return $feed;
}
#=== FUNCTION ================================================================
# NAME: storeFeed
# PURPOSE: Stores feed attributes in a hash
# PARAMETERS: $feed XML::Feed object returned from parsing the
# feed
# $uridata hashref containing data for this URI
# RETURNS: Nothing
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub storeFeed {
my ($feed, $uridata) = @_;
( $uridata->{TITLE} = $feed->title ) =~ s/(^\s+|\s+$)//g;
( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;
$uridata->{LINK} = $feed->link;
$uridata->{DESCRIPTION} = clean_string( $feed->tagline );
$uridata->{AUTHOR} = $feed->author;
$uridata->{MODIFIED} = normaliseDT( $feed->modified );
$uridata->{IMAGE} = flattenArray( $feed->image );
$uridata->{COPYRIGHT} = $feed->copyright;
$uridata->{GENERATOR} = $feed->generator;
$uridata->{LANGUAGE} = $feed->language;
#print coalesce($feed->webMaster,'No webMaster'),"\n";
$uridata->{ENCLOSURES} = extractEnclosures($feed);
$uridata->{ENCLOSURE_COUNT}
= scalar( @{ $uridata->{ENCLOSURES} } );
}
#=== FUNCTION ================================================================
# NAME: checkCopyright
# PURPOSE: Ask the user to check, or applies simple rules, to accept or
# reject a feed based on the copyright
# PARAMETERS: $checkmode the mode string: auto, manual or none
# $uridata reference to the hash containing details of
# this feed
# RETURNS: 1 (true) if the feed is to be added, 0 (false) if not
2022-11-19 21:27:51 +00:00
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkCopyright {
my ($checkmode, $uridata) = @_;
2022-11-19 21:27:51 +00:00
my ( $copyright, $re, $decision, $reason );
$LOG->info("Checking copyright of feed (mode: $checkmode)");
2022-11-19 21:27:51 +00:00
if ( $checkmode eq 'manual' ) {
#
# Prompt the user, failing gracefully if there's
# a problem. If the user types 'Y' or 'y' we accept the
# feed, otherwise we do not (thus a blank return = 'no').
#
try {
printf STDERR
"Feed '%s' has the copyright string:\n%s\n",
$uridata->{TITLE},
coalesce( $uridata->{COPYRIGHT}, '' );
$decision = prompt(
-in => *STDIN,
-out => *STDERR,
-prompt => 'OK to add this feed?',
-style => 'bold red underlined',
-yes
);
}
catch {
warn "Problem processing copyright decision: $_";
$decision = 0;
};
2023-01-24 22:44:11 +00:00
#
# If accepted we want a reason
#
if ($decision) {
try {
$reason = prompt(
-in => *STDIN,
-out => *STDERR,
-prompt => 'Please give a reason for this decision:',
-style => 'bold red underlined',
-default => 'No reason given'
);
}
catch {
warn "Problem processing reason for copyright decision: $_";
$reason = 'Error while processing the reason';
};
$uridata->{REASON_ACCEPTED} = $reason;
}
2022-11-19 21:27:51 +00:00
}
else {
#
# Automatic mode.
# Careful. Un-escaped spaces are ignored in the regex
#
$re = qr{(
\bCC\b|
\bCreative\ Commons\b|
creativecommons.org|
Attribution.NonCommercial.No.?Derivatives?
)}xmi;
2022-11-19 21:27:51 +00:00
$decision = 0;
$copyright = coalesce( $uridata->{COPYRIGHT}, '' );
emit( $silent, "Copyright: '$copyright'\n" );
$LOG->info("Copyright: '$copyright'");
if ( $copyright eq '' || $copyright =~ /$re/ ) {
$decision = 1;
}
}
2022-11-19 21:27:51 +00:00
#
# Take action on the decision (or default)
#
$uridata->{SAVE} = $decision;
if ($decision) {
emit( $silent, "Feed added\n" );
$LOG->info('Copyright OK');
return 1;
}
else {
emit( $silent, "Feed not added\n" );
$LOG->info('Copyright not OK');
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: parseRSS
# PURPOSE: Attempt to parse a feed as RSS
# PARAMETERS: $feed_url URL of the feed previously downloaded
# $feed_content String containing the content of the feed, for
# parsing
# RETURNS: An XML::RSS object containing the parsed feed or undef if the
# parse failed
# DESCRIPTION: ** Incomplete **
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub parseRSS {
my ( $feed_url, $feed_content ) = @_;
my $rss = XML::RSS->parse(\$feed_content);
unless ($rss) {
}
}
#=== FUNCTION ================================================================
# NAME: addURI
# PURPOSE: Adds the data for a URI to the database
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# $keymap hashref containing a map of key names to
# database field names
# $dry_run Boolean, set if in dry-run mode
2022-11-19 21:27:51 +00:00
# RETURNS: True (1) if the insert succeeded, false (0) otherwise
# DESCRIPTION: The hash keys are defined as an array to make it easy to slice
# the hash, and the SQL is defined internally using the size of
2022-11-19 21:27:51 +00:00
# the key array as a guide to the number of '?' placeholders.
# These are passed to execSQL to do the work.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub addURI {
my ( $dbh, $uridata, $keymap, $dry_run ) = @_;
2022-11-19 21:27:51 +00:00
my @keys = (
'URI', 'DNS',
'HOSTUP', 'HTTP_STATUS',
'HTTP_CONTENT_TYPE', 'TYPE',
'FORMAT', 'TITLE',
'DESCRIPTION', 'AUTHOR',
'MODIFIED', 'LINK',
'IMAGE', 'COPYRIGHT',
'CHECKTYPE', 'REASON_ACCEPTED',
2022-11-19 21:27:51 +00:00
'GENERATOR', 'LANGUAGE',
);
my $sql
= 'INSERT INTO urls ('
. join( ",", @{$keymap}{@keys} ) . ') '
. 'VALUES('
. join( ',', ('?') x scalar(@keys) ) . ')';
_debug( $DEBUG > 0, "addURI query: $sql");
2022-11-19 21:27:51 +00:00
return execSQL( $dbh, $sql, $dry_run, @{$uridata}{@keys} );
2022-11-19 21:27:51 +00:00
}
#=== FUNCTION ================================================================
# NAME: extractEnclosures
# PURPOSE: Builds an array of hashes containing enclosure data from an
# XML::Feed object
# PARAMETERS: $feed XML::Feed object
# RETURNS: A reference to the array of anonymous hashes built from the
# feed.
# DESCRIPTION: The XML::Feed object is expected to contain an array of
# entries. These are converted to hashes, references to which
# are stored in an array. The two DateTime components are
# converted to ISO8601 dates. If there is an enclosure then its
# elements are saved. Note that there could be multiple
# enclosures per item, but XML::Feed does not cater for them
# unless explicitly requested. We do not deal with such a case
# here.
2022-11-19 21:27:51 +00:00
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub extractEnclosures {
my ($feed) = @_;
my @encs;
foreach my $entry ( $feed->entries ) {
my %ent;
$ent{title} = $entry->title;
$ent{base} = $entry->base;
$ent{link} = $entry->link;
$ent{category} = join( ", ", $entry->category );
$ent{author} = $entry->author;
$ent{id} = $entry->id;
$ent{issued} = normaliseDT( $entry->issued );
$ent{modified} = normaliseDT( $entry->modified );
#
# Work around a bug in XML::Feed where the source method is only
# available for Atom feeds. TODO report this.
#
if ( $entry->isa('XML::Feed::Entry::Format::Atom') ) {
$ent{source} = $entry->source;
}
my ($enclosure) = $entry->enclosure;
if ( defined($enclosure) ) {
$ent{url} = $enclosure->url;
$ent{type} = $enclosure->type;
$ent{length} = $enclosure->length;
}
push( @encs, \%ent );
}
return \@encs;
}
#=== FUNCTION ================================================================
# NAME: addEnclosures
# PURPOSE: Adds episodes extracted from a feed into the database
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# including an arrayref of hashrefs of episode
# data
# $dry_run Boolean, set if in dry-run mode
2022-11-19 21:27:51 +00:00
# RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
# DESCRIPTION: The SQL is defined internally and the hash keys are defined as
# an array to make it easy to slice the hash. The enclosures (or
# more correctly, feed items) are present in the hash as an
# array of anonymous hashes. These are processed one at a time
# and inserted into the database. A count of the number of
# successful inserts is kept. This is compared with the number
# of enclosures to determine the boolean value to return.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub addEnclosures {
my ( $dbh, $uridata, $dry_run ) = @_;
2022-11-19 21:27:51 +00:00
my $sql = q{INSERT INTO episodes
(urls_id, link, enclosure, title, author, category, source, ep_id,
issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};
my @keys = (
'link', 'url', 'title', 'author', 'category', 'source',
'id', 'issued', 'modified', 'length', 'type'
);
my $successes = 0;
foreach my $enc ( @{ $uridata->{ENCLOSURES} } ) {
if ( execSQL( $dbh, $sql, $dry_run, $uridata->{URI_ID}, @{$enc}{@keys} ) ) {
2022-11-19 21:27:51 +00:00
$successes++;
}
else {
emit( $silent, "Failed to add episode $enc->{url}\n" );
}
}
return ( $successes == $uridata->{ENCLOSURE_COUNT} );
}
#=== FUNCTION ================================================================
# NAME: updateURI
# PURPOSE: Compare the data in a hash with that in the database
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# including an arrayref of hashrefs of episode
# data
# $keymap hashref containing a map of key names to
# database field names
# $dry_run Boolean, set if in dry-run mode
2022-11-19 21:27:51 +00:00
# RETURNS: The number of changes made
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub updateURI {
my ( $dbh, $uridata, $keymap, $dry_run ) = @_;
2022-11-19 21:27:51 +00:00
# TODO: dry-run mode not implemented here yet
2022-11-19 21:27:51 +00:00
my ( $sql1, $sth1, $rv1, $h1 );
my ( %fieldvals, %where );
my ( $diffs, $updates ) = ( 0, 0 );
my @keys = (
'URI', 'DNS',
'HOSTUP', 'HTTP_STATUS',
'HTTP_CONTENT_TYPE', 'TYPE',
'FORMAT', 'TITLE',
'DESCRIPTION', 'AUTHOR',
'MODIFIED', 'LINK',
'IMAGE', 'COPYRIGHT',
'GENERATOR', 'LANGUAGE',
);
#
# Get the row from the urls table
#
$sql1 = q{SELECT * FROM urls WHERE url = ?};
$sth1 = $dbh->prepare($sql1);
$rv1 = $sth1->execute( $uridata->{URI} );
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h1 = $sth1->fetchrow_hashref;
for my $key (@keys) {
unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
$diffs++;
$fieldvals{$key} = $uridata->{$key};
#
# Temporary report
#
print "Difference: ($key)\n";
print " Feed: ", coalesce($uridata->{$key},''), "\n";
print " Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n";
}
}
if ($diffs > 0) {
#
# Prepare to use SQL::Abstract
#
my $sql = SQL::Abstract->new();
#
# Build the update statement
#
$where{id} = $h1->{id};
my ( $stmt, @bindvals )
= $sql->update( 'urls', \%fieldvals, \%where );
# Temporary
print "Statement: $stmt\n";
print "Bind values: ", join(",",@bindvals),"\n";
#
# Perform the updates
#
$sth1 = $dbh->prepare($stmt);
$sth1->execute(@bindvals);
if ( $dbh->err ) {
warn "Processing $h1->{url}\n", $dbh->errstr;
}
else {
emit($silent, "Updated $h1->{url}\n");
$updates++;
}
}
return $updates;
}
#=== FUNCTION ================================================================
# NAME: updateEnclosures
# PURPOSE: Update the enclosures stored with a feed URL
# PARAMETERS: $dbh database handle
# $uridata hashref containing data for the current URI
# including an arrayref of hashrefs of episode
# data
# RETURNS: The number of changes made
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub updateEnclosures {
my ( $dbh, $uridata ) = @_;
# TODO: This doesn't acttually update annything!
2022-11-19 21:27:51 +00:00
my ( $sql1, $sth1, $rv1, $h1 );
my ( %fieldvals, %where );
my ( $diffs, $updates ) = ( 0, 0 );
#
# Get the enclosures associated with this feed
#
$sql1 = q{
SELECT * FROM episodes
WHERE urls_id = (SELECT id FROM urls WHERE title = ?)
};
$sth1 = $dbh->prepare($sql1);
$rv1 = $sth1->execute( $uridata->{URI} );
if ( $dbh->err ) {
warn $dbh->errstr;
}
$h1 = $sth1->fetchrow_hashref;
}
2023-01-24 22:44:11 +00:00
#=== FUNCTION ================================================================
# NAME: textFormat
# PURPOSE: Formats a block of text in an indented, wrapped style with
# a label in the left column
# PARAMETERS: $text The text to be formatted, as a scalar string
# $tag The label to be added to the left of the top
# line
# $align Tag alignment, 'L' for left, otherwise right
# $lmargin Width of the left margin (assumed to be big
# enough for the tag)
# $textwidth The width of all of the text plus left margin
# (i.e. the right margin)
# RETURNS: The formatted result as a string
# DESCRIPTION: Chops the incoming text into words (thereby removing any
# formatting). Removes any leading spaces. Loops through the
# wordlist building them into lines of the right length to fit
# between the left and right margins. Saves the lines in an
# array. Adds the tag to the first line with the alignment
# requested then returns the array joined into a string.
# THROWS: No exceptions
# COMMENTS: Inspired by Text::Format but *much* simpler. In fact T::F is
# a nasty thing to have to use; I couldn't get it to do what
# this routine does.
# TODO Make the routine more resilient to silly input values.
# SEE ALSO:
#===============================================================================
sub textFormat {
my ( $text, $tag, $align, $lmargin, $textwidth ) = @_;
my ( $width, $word );
my ( @words, @buff, @wrap );
#
# Build the tag early. If there's no text we'll just return the tag.
#
$tag = sprintf( "%*s",
( $align =~ /L/i ? ( $lmargin - 1 ) * -1 : $lmargin - 1 ), $tag );
return $tag unless $text;
$text =~ s/(^\s+|\s+$)//g;
return $tag unless $text;
#
# Chop up the incoming text removing leading spaces
#
@words = split( /\s+/, $text );
shift(@words) if ( @words && $words[0] eq '' );
#
# Compute the width of the active text
#
$width = $textwidth - $lmargin;
#
# Format the words into lines with a blank left margin
#
while ( defined( $word = shift(@words) ) ) {
if ( length( join( ' ', @buff, $word ) ) < $width ) {
push( @buff, $word );
}
else {
push( @wrap, ' ' x $lmargin . join( ' ', @buff ) );
@buff = ($word);
}
}
#
# Append any remainder
#
push( @wrap, ' ' x $lmargin . join( ' ', @buff ) ) if @buff;
#
# Insert the tag into the first line
#
substr( $wrap[0], 0, $lmargin - 1 ) = $tag;
#
# Return the formatted array as a string
#
return join( "\n", @wrap );
}
2022-11-19 21:27:51 +00:00
#=== FUNCTION ================================================================
# NAME: equal
# PURPOSE: Compare two strings even if undefined
# PARAMETERS: $s1 The first string
# $s2 The second string
# RETURNS: True if both strings are undefined, false if one isn't
# defined, otherwise the result of comparing them.
# DESCRIPTION: Works on the principle that two undefined strings are equal,
# a defined and an undefined string are not, and otherwise they
# are equal if they are equal!
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub equal {
my ( $s1, $s2 ) = @_;
return 1 if ( !defined($s1) && !defined($s2) );
return 0 if ( !defined($s1) || !defined($s2) );
return ( $s1 eq $s2 );
}
#=== FUNCTION ================================================================
# NAME: clean_string
# PURPOSE: Clean a string of non-printables, newlines, multiple spaces
# PARAMETERS: $str The string to process
# RETURNS: The processed string
# DESCRIPTION: Removes leading and trailing spaces. Removes all non-printable
# characters. Removes all CR/LF sequences. Removes multiple
# spaces.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO:
#===============================================================================
sub clean_string {
my ($str) = @_;
$str =~ s/(^\s+|\s+$)//g;
$str =~ tr/[[:graph:]]//c;
$str =~ tr/\x0A\x0D/ /;
$str =~ tr/ / /s;
return $str;
}
#=== FUNCTION ================================================================
# NAME: normaliseDT
# PURPOSE: Normalise an ISO8601 date for comparison, etc.
# PARAMETERS: $dt a DateTime object
# RETURNS: The DateTime object formatted as an ISO8601 string
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub normaliseDT {
my ($dt) = @_;
my $p = DateTime::Format::SQLite->new();
return (
defined($dt)
? $p->format_datetime(
DateTime::Format::ISO8601->parse_datetime($dt)
)
: undef
);
}
#=== FUNCTION ================================================================
# NAME: flattenArray
# PURPOSE: Turns an arrayref into a simple list in a string
# PARAMETERS: $item - the item that may be an arrayref
# RETURNS: The plain item if it's not an array otherwise the flattened
# version
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub flattenArray {
my ($item) = @_;
my $result;
if ( ref( $item ) eq 'ARRAY' ) {
$result = join(", ",@{$item});
}
else {
$result = $item;
}
return $item;
}
#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub coalesce {
foreach (@_) {
return $_ if defined($_);
}
return; # undef
}
#=== FUNCTION ================================================================
# NAME: emit
# PURPOSE: Print text on STDERR unless silent mode has been selected
# PARAMETERS: - Boolean indicating whether to be silent or not
# - list of arguments to 'print'
# RETURNS: Nothing
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
# a message to STDERR depending on a boolean. We need this to be
# able to make the script silent when the -silent option is
# selected
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub emit {
unless (shift) {
print STDERR @_;
}
}
#=== FUNCTION ================================================================
# NAME: _debug
# PURPOSE: Prints debug reports
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
# $messages... Arbitrary list of messages to print
# RETURNS: Nothing
# DESCRIPTION: Outputs messages if $active is true. It removes any trailing
# newline from each one and then adds one in the 'print' to the
# caller doesn't have to bother. Prepends each message with 'D>'
# to show it's a debug message.
# THROWS: No exceptions
# COMMENTS: Differs from other functions of the same name
# SEE ALSO: N/A
#===============================================================================
sub _debug {
my $active = shift;
my $message;
return unless $active;
while ($message = shift) {
chomp($message);
print STDERR "D> $message\n";
}
}
2022-11-19 21:27:51 +00:00
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "manpage", "debug=i", "dry-run!",
"silent!", "load", "delete", "input=s",
"scan!", "refresh!", "expire!", "report:s",
"ignore-case!", "html!", "check:s", "json:s",
"opml:s", "config=s", "out=s", "rejects:s",
"template:s",
2022-11-19 21:27:51 +00:00
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
feedWatcher - watch a collection of podcast feeds
=head1 VERSION
This documentation refers to I<feedWatcher> version 0.1.4
2022-11-19 21:27:51 +00:00
=head1 USAGE
feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
[-check[=mode]] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
2022-11-19 21:27:51 +00:00
[-[no]silent] [-config=FILE] [-debug=N] [URL ...]
2023-01-11 09:45:38 +00:00
# Load URLs from a file, perform checks, save the rejects and redirect output
# to a named file. Uses an alias:
# alias isostamp='date +"%Y%m%d_%H%M%S"'
#
./feedWatcher -load=feedWatcher_dumped_URLs.txt -check=auto \
-rej=output/rejects_$(isostamp).out > output/load_$(isostamp).out 2>&1
# Generate Markdown output with a template writing to a named file
./feedWatcher -tem=feedWatcher_3.tpl -out=feedWatcher.mkd
2022-11-19 21:27:51 +00:00
=head1 ARGUMENTS
Arguments are optional and may consist of an arbitrarily long list of URLs to
be processed and possibly added to the database by the script.
These URLs are prepended to any which may be provided through the
B<-load=FILE> option described below. The script makes sure the URL collection
contains no duplicates.
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message describing the usage of the program, and then exits.
The full documentation can be displayed with the command:
perldoc -oman feedWatcher
Alternatively, use the option B<-manpage>
=item B<-manpage>
Prints the entire documentation for the script. Note that this mode uses a
simpler type of formatter than I<perldoc>.
=item B<-load=FILE>
Defines a file from which new URLs are to be added to the database. These URLs
are checked in various ways before adding to the database. If arguments are
provided when the script is invoked these URLs are appended to the argument list.
=item B<-delete=FILE>
Defines a file from which a list of URLs is to be read which are to be
deleted from the database.
Note that it is possible (though inadvisable) to both add and delete an URL in
the same run of the script. The URL will first be added (from the load file or
argument list) then deleted. This is a pointless exercise which wastes
bandwidth, so don't do it!
=item B<-[no]scan>
This option (B<-scan>) causes the URLs stored in the database to be scanned
and updated. The negated form, which is also the default behaviour of the
script, (B<-noscan>) omits the scan.
NOTE: This function is not implemented yet.
2023-01-11 09:45:38 +00:00
=item B<-[no]silent>
This option controls the amount of output written by the script. In
B<nosilent> mode the script reports on the processing of each URL it receives,
which can be fairly verbose. This can be turned off with this option, though
it is often wiser to redirect the output for later review rather than to
suppress it.
2022-11-19 21:27:51 +00:00
=item B<-out=FILE>
2023-01-11 09:45:38 +00:00
This option defines an output file to receive output from the reporting
functions. If the option is omitted the data is written to STDOUT, allowing it
2023-01-11 09:45:38 +00:00
to be redirected if required. See the 'Usage' section above for an example of
how transactional output can be redirected.
=item B<-check[=MODE]>
This option (B<-check[=MODE]>) controls the mode used to check
the copyright setting of the current feed and deciide whether to add it.
Possible settings are: B<auto>, B<manual> and B<none>.
=over 4
=item B<-check=auto> or B<-check>
An automatic check is made against a series of regular expressions looking for
something in the I<copyright> field which signifies that the feed is under
a Creative Commons licence. A blank field is currently considered to denote
this type of licence.
The option may be written as B<-check> when it is interpreted as B<-check=auto>
=item B<-check=manual>
In this mode the script pauses after processing each feed to ask the script
user to check that it's OK to add it. The script reports the I<copyright>
field and requests a I<y> or I<n> response.
=item B<-check=manual>
In this mode no checks are performed.
=back
2022-11-19 21:27:51 +00:00
2023-01-11 09:45:38 +00:00
=item B<-report[=title]>
2022-11-19 21:27:51 +00:00
2023-01-11 09:45:38 +00:00
This option (B<-report[=title]>) causes a fairly simplistic report to be
generated to enable the database contents to be examined. The I<title>
argument specifies a case-sensitive feed title or component of such a title.
So, for instance B<-report=Hacker> currently reports on the batabase data
relating to the "Hacker Public Radio" feed.
2022-11-19 21:27:51 +00:00
2023-01-11 09:45:38 +00:00
If the argument is omitted the entire database is reported.
2022-11-19 21:27:51 +00:00
2023-01-11 09:45:38 +00:00
Reports consist of the details of the RSS (or Atom) channel with other
information about the site hosting the feed such as the IP address. The latest
episode in the feed is also reported.
2022-11-19 21:27:51 +00:00
2023-01-11 09:45:38 +00:00
Note that the feed information in the database is a snapshot made when the
feed details were last loaded. This is static information and does not get
updatedunless the feed is deleted and reloaded, or the B<-scan> function is run
(not currently available).
2022-11-19 21:27:51 +00:00
=item B<-json[=FILE]>
This option, which may be omitted, defines the location where the feed and
episode details are to be written. If omitted no JSON data is written.
If the option is given as B<-json=FILE> the data is written to the nominated
file.
If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
used.
=item B<-opml[=FILE]>
This option, which may be omitted, defines the location where the feed details
are to be written. If omitted no OPML data is written.
If the option is given as B<-opml=FILE> the data is written to the nominated
file.
If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
used.
2023-01-11 09:45:38 +00:00
=item B<-template[=FILE]>
2022-11-19 21:27:51 +00:00
This option defines the template used to generate a form of the feed data. The
2023-01-11 09:45:38 +00:00
template is written using the B<Template Toolkit> language.
2022-11-19 21:27:51 +00:00
If the file name is omitted then the script uses the file B<feedWatcher.tpl>
in the same directory as the script. If this file does not exist then the
script will exit with an error message.
For convenience B<feedWatcher.tpl> can be made a soft link which points to the
file which is the current default. This allows the development of versions
without changing the usual way this script is run.
=item B<-config=FILE>
This option allows an alternative configuration file to be used. This file
defines the location of the database, its name and the username and
password to be used to access it. This feature permits a test database to be
used, or for two more sets of feeds to be processed.
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
If the option is omitted the default file is used: B<feedWatcher.cfg>
=item B<-debug=N>
This option selects the debug level, resulting in a lot of output.
0 (the default) No debug output
1 Dumps the list of feeds found in an HTML download.
Also shows the SQL query which will result in the insertion of a new
row into the database table I<urls>.
2 Dumps the collected data which is destined to be written to the database
3 Dumps the contents of feeds during analysis
=back
=head1 DESCRIPTION
A full description of the application and its features.
May include numerous subsections (i.e. =head2, =head3, etc.)
=head1 DIAGNOSTICS
A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.
=head1 CONFIGURATION AND ENVIRONMENT
A configuration file is used by the script. By default the name is
B<feedWatcher.cfg> (though, what is searched for is derived from the name of
the script). An alternative configuration file can be provided, specified
through the B<-config=FILE> option.
The file conforms to the format managed by the B<Config::General> module:
<database>
type = SQLite
file = feedWatcher.db
user =
password =
</database>
The B<type> value should be I<SQLite>, though it might be possible to use
other database types.
The B<file> value is the name of the database, which can include the
directory path.
The B<user> and B<password> values are not currently required, but if
authentication to the database was required it would be implemented this way.
=head1 DEPENDENCIES
Config::General
DBI
Data::Dumper
DateTime::Format::ISO8601
DateTime::Format::SQLite
Feed::Find
Getopt::Long
HTML::Entities
IO::Prompter
IO::Socket
JSON
LWP::UserAgent
List::MoreUtils qw{uniq}
Log::Handler
Net::DNS
Pod::Usage
SQL::Abstract
Template
Template::Filters
Try::Tiny
URI
WWW::RobotRules
XML::Feed
XML::RSS::Parser
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com) 2020
=head1 LICENCE AND COPYRIGHT
Copyright (c) Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
This program is free software. You can redistribute it and/or modify it under
the same terms as perl itself.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker