forked from HPR/hpr-tools
		
	
		
			
				
	
	
		
			2196 lines
		
	
	
		
			66 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			2196 lines
		
	
	
		
			66 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env perl
 | 
						|
#===============================================================================
 | 
						|
#
 | 
						|
#         FILE: feedWatcher
 | 
						|
#
 | 
						|
#        USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
 | 
						|
#               [-[no]report] [-[no]check] [-out=FILE] [-json[=FILE]]
 | 
						|
#               [-opml[=FILE]] [-template[=FILE]] [-[no]silent] [-debug=N]
 | 
						|
#               [URL ...]
 | 
						|
#
 | 
						|
#  DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
 | 
						|
#               podcasts. Ken's script was inspired by Dann Washko's site at
 | 
						|
#               http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
 | 
						|
#               August 2012 where it was used to generate handouts.
 | 
						|
#
 | 
						|
#               The script has not been developed since 2014, but is now in
 | 
						|
#               use in 2020 helping to prepare podcast information for
 | 
						|
#               a FOSDEM visit under the heading of "Free Culture Podcasts".
 | 
						|
#               See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
 | 
						|
#               (created with Pandoc) in the same directory as this script for
 | 
						|
#               details of what has been done to develop the original design.
 | 
						|
#
 | 
						|
#      OPTIONS: ---
 | 
						|
# REQUIREMENTS: ---
 | 
						|
#         BUGS: ---
 | 
						|
#        NOTES: ---
 | 
						|
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | 
						|
#      VERSION: 0.0.14
 | 
						|
#      CREATED: 2013-12-25 12:40:33
 | 
						|
#     REVISION: 2020-02-18 22:45:28
 | 
						|
#
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
# Released under the terms of the GNU Affero General Public License (AGPLv3)
 | 
						|
# http://www.gnu.org/licenses/agpl-3.0.html
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
#
 | 
						|
#===============================================================================
 | 
						|
 | 
						|
use 5.030;
 | 
						|
use strict;
 | 
						|
use warnings;
 | 
						|
use utf8;
 | 
						|
 | 
						|
use Getopt::Long;
 | 
						|
use Pod::Usage;
 | 
						|
use Config::General;
 | 
						|
use List::MoreUtils qw{uniq};
 | 
						|
use Log::Handler;
 | 
						|
 | 
						|
use Try::Tiny;
 | 
						|
 | 
						|
use URI;
 | 
						|
use Net::DNS;
 | 
						|
use IO::Socket;
 | 
						|
use LWP::UserAgent;
 | 
						|
use WWW::RobotRules;
 | 
						|
use XML::RSS::Parser;
 | 
						|
use XML::Feed;
 | 
						|
use Feed::Find;
 | 
						|
 | 
						|
use Template;
 | 
						|
use Template::Filters;
 | 
						|
Template::Filters->use_html_entities;    # Use HTML::Entities in the template
 | 
						|
 | 
						|
use HTML::Entities;
 | 
						|
 | 
						|
use IO::Prompter;
 | 
						|
 | 
						|
use JSON;
 | 
						|
 | 
						|
use DBI;
 | 
						|
use DateTime::Format::SQLite;
 | 
						|
use DateTime::Format::ISO8601;
 | 
						|
use SQL::Abstract;
 | 
						|
 | 
						|
use Data::Dumper;
 | 
						|
 | 
						|
#
# Version number (manually incremented on each release)
#
our $VERSION = '0.0.14';

#
# Script name: $0 with any leading directory path stripped off
#
( my $PROG = $0 ) =~ s|.*/||mx;

#
# Working storage shared across the script
#
my ( @new_urls, @deletions );

# Robot identity used when querying robots.txt files
my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );

# Reusable statement handle, row hashref and DBI return value
my ( $sth1, $h1, $rv );

# Arrayref of feed records collected from the database for output stages
my $feeds;

#
# File and directory paths, all rooted under the base directory
#
my $basedir     = "$ENV{HOME}/HPR/feed_watcher";
my $tempdir     = "$basedir/tmp";
my $configfile  = "$basedir/$PROG.cfg";
my $logfile     = "$basedir/${PROG}.log";
my $deftemplate = "$basedir/${PROG}.tpl";
						|
#
# Map from the keys used in the per-URL data hash (%uridata) to the
# corresponding column names in the 'urls' table. The commented-out keys
# are collected during processing but have no database column. (The
# chevron comments stop Perltidy from reformatting the layout.)
#
                                                #<<<
my %keymap = (
    AUTHOR            => 'author',
    COPYRIGHT         => 'copyright',
    DESCRIPTION       => 'description',
    DNS               => 'dns',
#   ENCLOSURES        => undef,
#   ENCLOSURE_COUNT   => undef,
    FORMAT            => 'feedformat',
    GENERATOR         => 'generator',
#   HOST              => undef,
    HOSTUP            => 'host_up',
    HTTP_CONTENT_TYPE => 'content_type',
    HTTP_STATUS       => 'http_status',
    IMAGE             => 'image',
    LANGUAGE          => 'language',
    LINK              => 'link',
    MODIFIED          => 'modified',
#   PORT              => undef,
#   ROBOTS            => undef,
#   SAVE              => undef,
    TITLE             => 'title',
    TYPE              => 'urltype',
    URI               => 'url',
#   URI_ID            => undef,
);
                                                #>>>
 | 
						|
 | 
						|
#
# Request headers passed to LWP::UserAgent; a browser-like identity to
# avoid being rejected by sites that block unknown agents
#
my %headers = (
    'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
        . 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
    'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
        . 'image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);

#
# Enable UTF-8 output on the standard streams
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
 | 
						|
 | 
						|
#
# Parse command-line options into %options (Options() wraps Getopt::Long)
#
my %options;
Options( \%options );

#
# Any remaining arguments are taken to be URLs to process
#
@new_urls = @ARGV;

#
# Brief help: -help prints the usage summary and exits
#
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 )
    if ( $options{'help'} );

#
# Detailed help: -manpage prints the full POD and exits
#
pod2usage(
    -msg       => "$PROG version $VERSION\n",
    -verbose   => 2,
    -exitval   => 1,
    -noperldoc => 0
) if ( $options{'manpage'} );

#
# Collect option values, applying defaults where none were given
#
my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );

my $cfgfile
    = ( defined( $options{config} ) ? $options{config} : $configfile );
my $silent     = ( defined( $options{silent} ) ? $options{silent} : 0 );
my $loadfile   = $options{'load'};
my $deletefile = $options{'delete'};

my $scan   = ( defined( $options{scan} )   ? $options{scan}   : 0 );
my $report = ( defined( $options{report} ) ? $options{report} : 0 );
my $check  = ( defined( $options{check} )  ? $options{check}  : 0 );

my $outfile  = $options{out};
my $json     = $options{json};
my $opml     = $options{opml};
my $template = $options{template};
 | 
						|
 | 
						|
#
# The configuration file must exist before we go any further
#
die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );

#
# The load file, if given, must exist and be readable
#
if ($loadfile) {
    die "File $loadfile does not exist\n"  unless -e $loadfile;
    die "File $loadfile is not readable\n" unless -r $loadfile;
}

#
# The delete file, if given, must exist and be readable
#
if ($deletefile) {
    die "File $deletefile does not exist\n"  unless -e $deletefile;
    die "File $deletefile is not readable\n" unless -r $deletefile;
}

#
# We accept -json or -json=filename. In the former case (empty value) we
# make a default name, otherwise we use the one provided.
#
if ( defined($json) && $json =~ /^$/ ) {
    $json = "$PROG.json";
}

#
# We accept -opml or -opml=filename. In the former case (empty value) we
# make a default name, otherwise we use the one provided.
#
if ( defined($opml) && $opml =~ /^$/ ) {
    $opml = "$PROG.opml";
}

#
# Similarly we accept -template or -template=filename, defaulting the
# name when none was given; either way the file must be readable.
#
if ( defined($template) ) {
    $template = "$deftemplate" if ( $template =~ /^$/ );

    die "Error: Unable to find template $template\n" unless -r $template;
}
 | 
						|
 | 
						|
#
# Load configuration data.
#
# NOTE: Config::General->new replaces the original indirect-object call
# ('new Config::General(...)'), which Perl can parse ambiguously.
#
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#
# Connect to the database; connection details come from the <database>
# section of the configuration file
#
my $dbtype = $config{database}->{type};
my $dbfile = $config{database}->{file};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
my $dbh
    = DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
    { AutoCommit => 1, sqlite_unicode => 1, } )
    or die $DBI::errstr;

#
# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
# foreign key constraints to function)
#
$dbh->do('PRAGMA foreign_keys = ON');
 | 
						|
 | 
						|
#
# Check we have something to do: either new URLs to add, input files to
# process, or — provided the 'urls' table actually contains rows — a
# scan/report/JSON/OPML/template request.
#
# BUGFIX: the original expression relied on '&&' binding tighter than
# '||', so only '-scan' was gated on "$rows > 0" even though the
# bracketing showed all the output options were meant to be. The explicit
# parentheses below gate all five on a non-empty table, matching the
# original grouping's intent.
#
my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
my $work = (
    scalar(@new_urls) > 0
        || defined($loadfile)
        || defined($deletefile)
        || ( (      $report
                 || defined($json)
                 || defined($opml)
                 || defined($template)
                 || $scan
             )
             && $rows > 0
           )
);

die "Nothing to do!\n" unless $work;
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
# Set up logging, keeping the default log layout except for the date format
#-------------------------------------------------------------------------------
my $LOG = Log::Handler->new();

$LOG->add(
    file => {
        timeformat => "%Y/%m/%d %H:%M:%S",
        filename   => $logfile,
        minlevel   => 0,
        maxlevel   => 7,
    }
);

#-------------------------------------------------------------------------------
# Open the output file, or duplicate STDOUT when no -out=FILE was given;
# both handles are opened in UTF-8 mode
#-------------------------------------------------------------------------------
my $outfh;
if ($outfile) {
    open( $outfh, ">:encoding(UTF-8)", $outfile )
        or warn "Unable to open $outfile for writing: $!";
}
else {
    open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
        or warn "Unable to initialise for writing: $!";
}

#
# Set up a robots.txt rules parser, identifying ourselves as name/version
#
$rules = WWW::RobotRules->new($robot_name);
 | 
						|
 | 
						|
#
# Slurp the load file if there is one, appending its URLs (one per line)
# to the list collected from the command line
#
if ($loadfile) {
    open( my $in, '<:encoding(utf8)', $loadfile )
        or die "$PROG : failed to open load file '$loadfile' : $!\n";
    chomp( my @loaded = <$in> );
    close($in)
        or warn "$PROG : failed to close load file '$loadfile' : $!\n";

    $LOG->info("Loaded URLs from $loadfile");

    push( @new_urls, @loaded );
}
 | 
						|
 | 
						|
#
# Now, we either have URLs from the command line, or from the load file,
# so process them
#
if (@new_urls) {
    #
    # Remove duplicates (uniq keeps the first occurrence of each)
    #
    @new_urls = uniq(@new_urls);

    $LOG->info( "Adding ", scalar(@new_urls), " URLs to the database" );

    #
    # Load these URLs as appropriate, returning any more that we find by
    # following HTML urls.
    #
    @new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );

    #
    # Now process any URLs that came back. Since we are explicitly looking
    # for feeds we can assume that's what we have so don't need to recurse
    # again.
    #
    # TODO
}
 | 
						|
 | 
						|
#
# Process the delete file if there is one
#
if ($deletefile) {
    #
    # Load the delete file (one URL per line)
    #
    open( my $del, '<:encoding(utf8)', $deletefile )
        or die "$PROG : failed to open load file '$deletefile' : $!\n";
    chomp( @deletions = <$del> );
    close($del)
        or warn "$PROG : failed to close load file '$deletefile' : $!\n";

    #
    # Remove duplicates
    #
    @deletions = uniq(@deletions);

    $LOG->info( "Deleting ", scalar(@deletions), " URLs from the database" );

    #
    # Delete the URLs one by one, reporting the outcome of each
    #
    if (@deletions) {
        $sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
        foreach my $rec (@deletions) {
            $rv = $sth1->execute($rec);
            warn $dbh->errstr if ( $dbh->err );

            # DBI returns '0E0' (numerically zero but true) when no rows
            # matched, so the numeric test below distinguishes the cases
            if ( $rv != 0 ) {
                emit( $silent, "Deleted $rec ($rv rows)\n" );
            }
            else {
                emit( $silent, "Failed to delete $rec\n" );
            }
        }
    }
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
# Perform a database scan if requested
#-------------------------------------------------------------------------------
if ($scan) {
    $LOG->info("Scan is not fully implemented yet");
    scanDB( $dbh, \%keymap );
}

#-------------------------------------------------------------------------------
# Report database contents if requested, writing to the selected handle
#-------------------------------------------------------------------------------
if ($report) {
    reportDB( $dbh, $outfh );
    $LOG->info("Report generated");
}
 | 
						|
 | 
						|
#
# Collect the feed data for the output stages and do any image repairs we
# need, but temporarily
#
if ( $json || $opml || $template ) {
    $feeds = collectData($dbh);

    #
    # Clean up the image references (until we have a proper fix): strip
    # any stringified arrayref ("ARRAY(0x...)") prepended to the value
    #
    foreach my $f ( @{$feeds} ) {
        if ( defined( $f->{urls_image} )
            && $f->{urls_image} =~ /^ARRAY\([^)]+\)/ )
        {
            $f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
            $LOG->info("Fixed image for $f->{urls_url}");
        }
    }
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
# Output all feeds to a JSON file if requested (UTF-8, sorted keys,
# pretty-printed)
#-------------------------------------------------------------------------------
if ($json) {
    my $js = JSON->new->utf8->canonical->pretty;

    open( my $out, '>:encoding(UTF-8)', $json )
        or die "Unable to open output file $json $!\n";
    print $out $js->encode($feeds), "\n";
    close($out);

    emit( $silent, "JSON data written to $json\n" );
    $LOG->info("JSON data written to $json");
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
# Requesting the -opml option means to dump the entire feed table as OPML
#-------------------------------------------------------------------------------
if ($opml) {
    require XML::OPML;

    open( my $out, '>:encoding(UTF-8)', $opml )
        or die "Unable to open output file $opml $!\n";

    #
    # Start building OPML. Mandatory attributes are 'type', 'text' and
    # 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
    # attribute is poorly defined; this module uses 'rss' as the type and
    # 'RSS' as the value of the 'version' attribute. This is not linked to
    # the type of the feed.
    #
    my $OPML = XML::OPML->new( version => '1.1' );

    #
    # Create the 'head' element, stamping it with the current time
    #
    my $DT = normaliseDT( DateTime->now() );
    $OPML->head(
        title        => 'Free Culture Podcasts',
        dateCreated  => $DT,
        dateModified => $DT,
    );

    #
    # Walk the feeds generating one 'outline' element per feed
    #
    foreach my $f ( @{$feeds} ) {
        $OPML->add_outline(
            title       => $f->{urls_title},
            text        => $f->{urls_title},
            description => $f->{urls_description},
            xmlUrl      => $f->{urls_url},
            htmlUrl     => $f->{urls_link},
        );
    }

    #
    # Output OPML to the nominated file
    #
    print $out $OPML->as_string;

    close($out);

    emit( $silent, "OPML data written to $opml\n" );
    $LOG->info("OPML data written to $opml");
}
 | 
						|
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
# Fill and print the template if requested
#-------------------------------------------------------------------------------
if ($template) {
    my $tt = Template->new(
        {   ABSOLUTE     => 1,
            ENCODING     => 'utf8',
            INCLUDE_PATH => $basedir,
        }
    );

    #
    # Build the data structure the template expects
    #
    my $vars = { feeds => $feeds };

#   print Dumper($vars),"\n";

    #
    # Expand the template into a string, then write it to the output handle
    #
    my $document;
    $tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
        || die $tt->error(), "\n";
    print $outfh $document;

    emit( $silent, "Data processed with $template\n" );
    $LOG->info("Data processed with $template");
}

#$dbh->disconnect;

exit;
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: loadUrls
#      PURPOSE: To load URLs read from the input file into the database
#   PARAMETERS: $dbh            database handle
#               $new_urls       arrayref containing URLs
#               $rules          WWW::RobotRules object
#               $keymap         hashref containing a map of key names to
#                               database field names
#      RETURNS: Currently nothing. The plan is to return any new URLs
#               discovered by investigating non-feed URLs.
#  DESCRIPTION: Each URL is validated, checked for duplication against the
#               database, resolved in DNS, probed to see whether its host
#               is up, vetted against the site's robots.txt and examined
#               for its HTTP content type. URLs of type 'Feed' are then
#               downloaded, parsed and (when -check is active) vetted for
#               an acceptable copyright. The 'continue' block performs the
#               database insertion for every URL whose SAVE flag survived
#               the checks.
#       THROWS: No exceptions
#     COMMENTS: Also uses the file-scoped $silent, $check, $DEBUG, %headers
#               and $LOG variables.
#     SEE ALSO: N/A
#===============================================================================
sub loadUrls {
    my ( $dbh, $new_urls, $rules, $keymap ) = @_;

    my ( $stream, $feed );
    my %uridata;

    #
    # Work through the list of URLs.
    #
    # BUGFIX: the original loop iterated the file-scoped @new_urls array,
    # silently ignoring the $new_urls argument (it only worked because the
    # sole caller passed \@new_urls); we now use the arrayref parameter.
    # The unused '$roboturl' local has also been removed.
    #
    foreach my $rec ( @{$new_urls} ) {
        %uridata = ();

        #
        # By default save the collected data
        #
        $uridata{SAVE} = 1;

        #
        # Check that we have a valid URL. We don't save them if they are
        # invalid ($uridata{SAVE} is set to 0 in the routine).
        #
        my $uri = validateURI( $rec, \%uridata );
        next unless $uri;

        #
        # Check whether we already have this URI in the database
        #
        if (dbSearch(
                $dbh, 'SELECT * FROM urls WHERE url = ?',
                $uridata{URI}
            )
            )
        {
            emit( $silent, "$uri is already in the database\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the hostname resolves in DNS. On success checkDNS leaves an
        # arrayref of addresses in $uridata{DNS}, flattened to a string here.
        #
        if ( checkDNS( $uri, \%uridata ) ) {
            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
            emit( $silent, "DNS: $uridata{DNS}\n" );
        }
        else {
            emit( $silent, "$uri has no DNS entry\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the server is available. Don't save if it's not (probably
        # will not get the feed anyway).
        #
        if ( serverUp( $uri, \%uridata ) ) {
            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
        }
        else {
            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the site's robots.txt. A missing file (404 in
        # $uridata{ROBOTS}) is tolerated; an explicit block means we don't
        # save.
        #
        if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
            emit( $silent, "Check of robots.txt rules succeeded\n" );
        }
        else {
            if ( $uridata{ROBOTS} =~ /404/ ) {
                emit( $silent, "Search for robots.txt file failed\n" );
            }
            else {
                emit( $silent, "Check of robots.txt rules failed\n" );
                $uridata{SAVE} = 0;
                next;
            }
        }

        #
        # Look for the HTTP content type. Don't save if the request failed.
        #
        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
            emit( $silent, "HTTP request OK\n" );
        }
        else {
            emit( $silent, "HTTP request failed\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Process the URL according to the type derived from its content
        #
        if ( $uridata{TYPE} eq 'Feed' ) {
            $LOG->info( 'Processing feed: ', $uridata{URI} );
            $stream = getFeed( $uridata{URI} );
            if ($stream) {
                $feed = parseFeed( $uridata{URI}, $stream );
                unless ($feed) {
                    $uridata{SAVE} = 0;
                    next;
                }

                #
                # Provide a means of examining the returned feed object.
                # There are some weird behaviours in there (partly because
                # of the weirdness of RSS and poor adherence to what
                # standards there are).
                #
                print Dumper($feed), "\n" if ( $DEBUG > 2 );

                storeFeed( $feed, \%uridata );

                #
                # Perform a check on the copyright. Routine sets
                # $uridata{SAVE} = 0 if the copyright is not acceptable.
                #
                if ($check) {
                    next unless checkCopyright( \%uridata );
                }
            }
            else {
                emit( $silent, "Problem collecting feed" );
                $uridata{SAVE} = 0;
                next;
            }
        }
        elsif ( $uridata{TYPE} eq 'HTML' ) {
            #
            # TODO Save the URL and find any feeds it contains
            #
            emit( $silent,
                "Type $uridata{TYPE} not implemented yet; skipped\n" );
            next;
        }
        else {
            emit( $silent,
                "Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
            $uridata{SAVE} = 0;
            next;
        }

    }
    continue {
        # { This 'continue' block is executed for each iteration or 'next' }

        #
        # Decide whether to save what we have collected
        #
        if ( $uridata{SAVE} ) {
            if ( addURI( $dbh, \%uridata, $keymap ) ) {
                emit( $silent, "$uridata{URI} added to the database\n" );
                $LOG->info( 'Feed ', $uridata{URI},
                    ' added to the database' );

                #
                # Get the id the database allocated for the row we added
                #
                $uridata{URI_ID}
                    = $dbh->last_insert_id( undef, undef, 'urls', undef );

                #
                # Add any enclosures we found
                #
                if ( defined( $uridata{ENCLOSURE_COUNT} )
                    && $uridata{ENCLOSURE_COUNT} > 0 )
                {
                    if ( addEnclosures( $dbh, \%uridata ) ) {
                        emit( $silent, $uridata{ENCLOSURE_COUNT},
                            " enclosures for $uridata{URI} added to the database\n"
                        );
                    }
                }
            }
            else {
                emit( $silent,
                    "$uridata{URI} was not added to the database\n" );
                $LOG->info( 'Feed ', $uridata{URI},
                    ' not added to the database' );
            }
        }

        #
        # Dump what we have if requested
        #
        print Dumper( \%uridata ), "\n" if ( $DEBUG > 1 );

        emit( $silent, '-' x 80, "\n" );
    }

    #
    # Temporary, until returning discovered URLs is implemented
    #
    return;    # undef

}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: scanDB
#      PURPOSE: To scan the URLs in the database and update the stored data
#   PARAMETERS: $dbh            database handle
#               $keymap         hashref containing a map of key names to
#                               database field names
#      RETURNS: Nothing
#  DESCRIPTION: Selects all URLs of type 'Feed' from the 'urls' table, then
#               re-checks each one in turn: URL validity, DNS resolution,
#               server reachability and HTTP content type, before downloading
#               and parsing the feed itself. Results accumulate in %uridata;
#               the loop's 'continue' block (run on every iteration including
#               'next') writes successful scans back via updateURI, or logs
#               the failure. Uses the file globals $silent, %headers, $LOG
#               and $DEBUG.
#       THROWS: No exceptions
#     COMMENTS: Contains a temporary development limit ('last if $count == 2')
#               which stops after the first two feeds.
#     SEE ALSO: N/A
#===============================================================================
sub scanDB {
    my ($dbh, $keymap) = @_;

    # NOTE(review): $enc_changes is declared but never used in this routine
    # (enclosure updates are still a TODO below)
    my ( $sql1, $sth1, $rv1, $h1 );
    my ( $aref, @urls, $DT, $stream, $feed );
    my ( %uridata, $urichanges, $enc_changes );

    #
    # Query to return all feed URLs
    #
    $sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' order by title};

    $sth1 = $dbh->prepare($sql1);
    $rv1  = $sth1->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Collect everything as an arrayref pointing to a bunch of arrayrefs
    # containing the column details requested
    #
    $aref = $sth1->fetchall_arrayref;

    #
    # Extract just the URL strings (column 1 of each row; column 0 is 'id')
    #
    @urls = map { $_->[1] } @{$aref};

    #
    # Now process these URLs from the database one at a time
    #
    $sql1 = q{SELECT * FROM urls WHERE url = ?};
    $sth1 = $dbh->prepare($sql1);

    my $count = 0;

    foreach my $url (@urls) {
        # Reset the per-feed results; the 'continue' block below reads them
        %uridata = ();

        $count++;

        #
        # Record the scan (date/time) for this URL
        #
        $DT = normaliseDT( DateTime->now() );
        $uridata{SCANNED_ON} = $DT;

        #
        # Default to OK; any failed check below clears this and skips to the
        # 'continue' block via 'next'
        #
        $uridata{SCAN_OK} = 1;

        emit( $silent, "Scanning '$url'\n" );
        $rv1 = $sth1->execute($url);
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }
        $h1 = $sth1->fetchrow_hashref;
        emit( $silent, $h1->{title}, "\n" );

        #
        # The URL should be valid already from the point at which it was added
        # to the database, but if we do this we get a canonical version (it
        # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
        # it. It also sets $uridata{URI}, which is useful.)
        #
        my $uri = validateURI( $url, \%uridata );

        #
        # Check the hostname resolves in DNS; on success flatten the address
        # list into a comma-separated string for storage
        #
        if ( checkDNS( $uri, \%uridata ) ) {
            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
            emit( $silent, "DNS: $uridata{DNS}\n" );
        }
        else {
            emit( $silent, "$uri has no DNS entry\n" );
            $uridata{SCAN_OK} = 0;
            next;
        }

        #
        # Check the server is available (TCP connect to the URL's port)
        #
        if ( serverUp( $uri, \%uridata ) ) {
            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
        }
        else {
            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
            $uridata{SCAN_OK} = 0;
            next;
        }

        #
        # Look for the HTTP content type.
        #
        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
            emit( $silent, "HTTP request OK\n" );
        }
        else {
            emit( $silent, "HTTP request failed\n" );
            $uridata{SCAN_OK} = 0;
            next;
        }

        #
        # Note: not doing the robots.txt check since it was done at load time
        #

        #
        # We know we have a feed, so go get it
        #
        $stream = getFeed( $uridata{URI} );
        if ($stream) {
            #
            # Parse the feed. The routine generates its own error messages
            #
            $feed = parseFeed( $uridata{URI}, $stream );
            unless ( $feed ) {
                $uridata{SCAN_OK} = 0;
                next;
            }

            #
            # Save the important feed components in the %uridata hash
            #
            print Dumper($feed), "\n" if ( $DEBUG > 2 );
            storeFeed( $feed, \%uridata );
        }
        else {
            emit( $silent, "Problem collecting feed" );
            $uridata{SCAN_OK} = 0;
            next;
        }

    }
    continue {
        # { This 'continue' block is executed for each iteration or 'next' }

        #
        # If the scan went OK then perform a comparison between the new feed
        # data and that which is stored
        #
        if ( $uridata{SCAN_OK} ) {
                $urichanges = updateURI( $dbh, \%uridata, $keymap );
                $LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
                    $urichanges )
                    if $urichanges > 0;

                # TODO Update enclosures
        }
        else {
            emit( $silent, "$uridata{URI} was not scanned successfully\n" );
            $LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
        }

        #
        # Finished scanning this feed
        #
        emit( $silent, '-' x 80, "\n" );

        #
        # Temporarily stop after the first N feeds
        # NOTE(review): development limiter — remove before production use
        #
        last if $count == 2;
    }

    return;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: reportDB
#      PURPOSE: To generate a printed report from the database
#   PARAMETERS: $dbh            database handle
#               $fh             output file handle
#      RETURNS: Nothing
#  DESCRIPTION: Runs a fixed query joining the 'urls' table to the
#               'all_episodes' view, computing the date of the latest episode
#               per feed, then writes every column of every returned row to
#               the given file handle as right-aligned "key: value" lines,
#               with a blank line between rows.
#       THROWS: No exceptions
#     COMMENTS: Perhaps we can get fancy with stored reports in the future
#     SEE ALSO: N/A
#===============================================================================
sub reportDB {
    my ( $dbh, $fh ) = @_;

    #
    # Query to report the contents of the 'urls' table with the number of
    # associated episodes and the most recent episode date per feed
    #
    my $sql = q{
        SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
            FROM all_episodes ae
            JOIN urls ON (ae.urls_id = urls.id)
            GROUP by urls.id
            HAVING ae.urls_urltype = 'Feed'
    };

    my $sth = $dbh->prepare($sql);
    $sth->execute();
    warn $dbh->errstr if $dbh->err;

    #
    # Print each row as its keys in sorted order, substituting the string
    # 'undef' for NULL column values; a blank line separates rows
    #
    while ( my $row = $sth->fetchrow_hashref() ) {
        printf {$fh} "%20s: %s\n", $_, coalesce( $row->{$_}, 'undef' )
            for sort keys %{$row};
        print {$fh} "\n";
    }

    $sth->finish;

    return;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: collectData
#      PURPOSE: Collects data from the database for generating a report
#   PARAMETERS: $dbh            database handle
#      RETURNS: An arrayref of hashrefs, one per row of the query
#  DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
#               table and the 'all_episodes' view, showing the date of the
#               latest episode. The result is an array of rows, each
#               represented as a hash, all sorted by the feed title.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub collectData {
    my ($dbh) = @_;

    #
    # Query to report the contents of the 'urls' table with the details of the
    # latest episode, ordered by feed title
    #
    my $sql = q{
        SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
            FROM all_episodes ae
            JOIN urls ON (ae.urls_id = urls.id)
            GROUP by urls.id
            HAVING ae.urls_urltype = 'Feed'
            ORDER BY ae.urls_title
    };

    my $sth = $dbh->prepare($sql);
    $sth->execute();
    warn $dbh->errstr if $dbh->err;

    #
    # Fetch everything as an arrayref of hashrefs (the '{}' argument selects
    # hashref rows) and hand it back to the caller
    #
    my $rows = $sth->fetchall_arrayref( {} );

    $sth->finish;

    return $rows;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: countRows
#      PURPOSE: To count the rows in a table
#   PARAMETERS: $dbh            database handle
#               $sql            SQL expression to use
#      RETURNS: Number of rows found (note that zero is returned as 0E0), or
#               undef if the query returned no row at all
#  DESCRIPTION: Prepares and executes the given SQL (expected to be a
#               "SELECT count(*) ..." style query) and returns the first
#               column of the first fetched row.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub countRows {
    my ( $dbh, $sql ) = @_;

    my $sth1 = $dbh->prepare($sql);
    my $rv   = $sth1->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    my $h1 = $sth1->fetch;
    $sth1->finish;

    #
    # Guard against a query returning no rows: 'fetch' then yields undef and
    # dereferencing it (as the previous '@$h1[0]' slice did) would die under
    # 'use strict'. Return undef in that case instead.
    #
    return ( defined($h1) ? $h1->[0] : undef );
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: dbSearch
#      PURPOSE: To perform a simple search in the database
#   PARAMETERS: $dbh            database handle
#               $sql            SQL expression to use (expected to be
#                               a SELECT)
#               @args           arguments for the 'execute'
#      RETURNS: True (1) if the row exists, otherwise false (0).
#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
#               SQL without incurring the overhead of repeated 'prepare'
#               calls. Only the first row is fetched (we expect there to be
#               only one) and the success or failure is determined by its
#               existence.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub dbSearch {
    my ( $dbh, $sql, @args ) = @_;

    my $sth1 = $dbh->prepare_cached($sql);
    my $rv   = $sth1->execute(@args);
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    my $h1 = $sth1->fetchrow_hashref();
    $sth1->finish;

    #
    # Return the documented 1/0 rather than the bare result of 'defined',
    # which yields the empty string (not 0) for the false case
    #
    return ( defined($h1) ? 1 : 0 );
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: execSQL
#      PURPOSE: To perform a non-SELECT query
#   PARAMETERS: $dbh            database handle
#               $sql            SQL expression to use
#               @args           arguments for the 'execute'
#      RETURNS: True (1) if the query succeeded, otherwise false (0).
#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
#               SQL without incurring the overhead of repeated 'prepare'
#               calls. Returns whatever the 'execute' call returned.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub execSQL {
    my ( $dbh, $sql, @args ) = @_;

    my $sth = $dbh->prepare_cached($sql);

    #
    # Run the query, reporting (but not aborting on) any database error
    #
    my $result = $sth->execute(@args);
    warn $dbh->errstr if $dbh->err;

    $sth->finish;

    return $result;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: validateURI
#      PURPOSE: Checks a URL for validity
#   PARAMETERS: $rec            the URL handed to the script
#               $uridata        hashref containing data for this URI
#      RETURNS: A URI object if valid otherwise undef
#  DESCRIPTION: The URL string is validated with the URI module (defaulting
#               the scheme to 'http'). On success the canonical string form
#               is stored under the 'URI' key of the hash referenced by
#               $uridata; on failure the 'SAVE' key is set to 0 so the URL
#               will not be stored.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub validateURI {
    my ( $rec, $uridata ) = @_;

    #
    # Build a URI object; a missing scheme means the string did not parse as
    # a usable URL, in which case we flag it as not to be saved
    #
    my $uri = URI->new( $rec, 'http' );

    unless ( $uri->scheme ) {
        emit( $silent, "URI $uri is not valid\n" );
        $uridata->{SAVE} = 0;
        return;
    }

    emit( $silent, "URI $uri is valid\n" );
    $uridata->{URI} = $uri->canonical->as_string;
    return $uri;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: checkDNS
#      PURPOSE: Looks up a host DNS entry
#   PARAMETERS: $uri            URI object
#               $uridata        hashref containing data for this URI
#      RETURNS: True (1) if the DNS query was successful, otherwise false (0)
#  DESCRIPTION: The host name is extracted from the URI and stored under the
#               'HOST' key. The hostname is then looked up in the DNS; on
#               success an arrayref of the addresses from the 'A' records is
#               stored under the 'DNS' key of the hash referenced by $uridata.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub checkDNS {
    my ( $uri, $uridata ) = @_;

    my $hostname = $uri->host;
    $uridata->{HOST} = $hostname;

    my $resolver = Net::DNS::Resolver->new;
    my $reply    = $resolver->search($hostname);

    unless ($reply) {
        warn "Query failed: ", $resolver->errorstring, "\n";
        return 0;
    }

    #
    # Keep only the 'A' records from the answer section and store their
    # addresses
    #
    my @addresses
        = map { $_->address } grep { $_->type eq "A" } $reply->answer;
    $uridata->{DNS} = \@addresses;

    return 1;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: serverUp
#      PURPOSE: Checks that a given host is responding
#   PARAMETERS: $uri            URI object
#               $uridata        hashref containing data for this URI
#      RETURNS: True (1) if the host responds to a TCP connection, false (0)
#               otherwise
#  DESCRIPTION: Extracts the hostname and port from the URI (the port
#               defaults to the scheme's standard one, such as 80 for http)
#               and attempts a TCP connection with a 10 second timeout. The
#               port is stored under 'PORT' and the outcome under 'HOSTUP' in
#               the hash referenced by $uridata.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub serverUp {
    my ( $uri, $uridata ) = @_;

    my $hostname = $uri->host();
    my $port     = $uri->port();
    $uridata->{PORT} = $port;

    #
    # Attempt the connection; a defined socket means the host accepted it
    #
    my $socket = IO::Socket::INET->new(
        Proto    => "tcp",
        PeerAddr => $hostname,
        PeerPort => $port,
        Reuse    => 1,
        Timeout  => 10
    );

    unless ($socket) {
        $uridata->{HOSTUP} = 0;
        return 0;
    }

    $socket->close;
    $uridata->{HOSTUP} = 1;
    return 1;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: robotRulesOK
#      PURPOSE: To check the intended URL against the site's robots.txt rules
#   PARAMETERS: $uri            URI object
#               $rules          WWW::RobotRules object
#               $uridata        hashref containing data for this URI
#      RETURNS: True (1) if the GET of the robots.txt file succeeded and the
#               rules allow the URI object to be fetched, false (0) otherwise.
#  DESCRIPTION: Builds the robots.txt URL for the URI's host and fetches it
#               with a 10 second timeout. On success the file is fed to the
#               rules object and the result of its 'allowed' check is
#               returned; the 'ROBOTS' key of $uridata records either 'Found'
#               or the failing HTTP status line.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub robotRulesOK {
    my ( $uri, $rules, $uridata ) = @_;

    #
    # Build the URL of robots.txt
    #
    my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';

    #
    # Use LWP::UserAgent to get the file and handle errors
    #
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent("$PROG/$VERSION");

    my $response = $ua->get($roboturl);

    unless ( $response->is_success ) {
        $uridata->{ROBOTS} = $response->status_line;
        warn "Failed to get $roboturl\n";
        warn $response->status_line . "\n";
        return; # undef
    }

    $uridata->{ROBOTS} = 'Found';
    $rules->parse( $roboturl, $response->decoded_content );
    return $rules->allowed("$uri");
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: checkContentType
#      PURPOSE: Check the content_type of the url
#   PARAMETERS: $uri            URI object
#               $uridata        hashref containing data for this URI
#               $headers        hashref containing query headers
#               $log            Log::Handler object
#      RETURNS: True (1) if all was well, otherwise false (0)
#  DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
#               for the URI object using the LWP::UserAgent head method. Then
#               we examine the 'content-type' header looking for the string
#               'xml' or 'html' in it. The former denotes a feed, and the
#               latter a normal HTML page. Results are stored under the
#               'HTTP_STATUS', 'HTTP_CONTENT_TYPE' and 'TYPE' keys of the
#               hash referenced by $uridata.
#       THROWS: No exceptions
#     COMMENTS: Uses the file globals $silent and $DEBUG.
#     SEE ALSO: N/A
#===============================================================================
sub checkContentType {
    my ( $uri, $uridata, $headers, $log ) = @_;

    my @feeds;

    # Default the status in case we bail out before getting a response
    $uridata->{HTTP_STATUS} = 'Unknown';

    my $browser = LWP::UserAgent->new or return 0;

    # HEAD request only: we want the headers, not the body
    my $response = $browser->head( $uri->as_string, %{$headers} )
        or return 0;

    $uridata->{HTTP_STATUS} = $response->status_line;

    if ( $response->is_success ) {
        $uridata->{HTTP_CONTENT_TYPE} = $response->header('content-type');

        #
        # Decode the content-type we received: anything mentioning 'xml' is
        # treated as a feed, 'html' as a web page, otherwise unknown
        #
        if ( $uridata->{HTTP_CONTENT_TYPE} =~ m|xml|i ) {
            $uridata->{TYPE} = 'Feed';
        }
        elsif ( $uridata->{HTTP_CONTENT_TYPE} =~ m|html|i ) {
            $uridata->{TYPE} = 'HTML';
        }
        else {
            $uridata->{TYPE} = 'Unknown';
        }

        #
        # Sometimes perfectly valid feeds misleadingly return text/html so we
        # try to spot such cases here and adjust the internal type accordingly.
        # If Feed::Find discovers exactly one feed and it is this very URL,
        # then the URL itself is a feed despite its content-type.
        #
        if ( $uridata->{TYPE} eq 'HTML' ) {
            @feeds = Feed::Find->find( $uri->as_string );
            if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
                emit( $silent, "Feed found with wrong content-type\n" );
                $uridata->{TYPE} = 'Feed';
            }
            else {
                emit( $silent, "Found ", scalar(@feeds),
                    " feeds within this HTML page\n" );
                print Dumper( \@feeds ), "\n" if $DEBUG > 0;
            }
        }

        $log->info( "URL content classified as: ", $uridata->{TYPE} );
        emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
        return 1;
    }
    else {
        # Failed request: mark both the content type and internal type unknown
        $uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
        return 0;
    }
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: getFeed
#      PURPOSE: Download the contents of a feed URL
#   PARAMETERS: $feed_url       URL of the feed to download
#      RETURNS: String representation of the feed contents or undef if the
#               download failed.
#  DESCRIPTION: Issues a GET on the URL (with a 10 second timeout) which is
#               expected to be a feed, but need not be. If successful the
#               decoded content is returned, otherwise a warning is issued
#               and undef is returned.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO:
#===============================================================================
sub getFeed {
    my ($feed_url) = @_;

    #
    # Use LWP::UserAgent to get the feed and handle errors
    #
    my $agent = LWP::UserAgent->new;
    $agent->timeout(10);
    $agent->agent("$PROG/$VERSION");

    my $response = $agent->get($feed_url);

    unless ( $response->is_success ) {
        warn "Failed to get $feed_url\n";
        warn $response->status_line, "\n";
        return; # undef
    }

    return $response->decoded_content;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: parseFeed
#      PURPOSE: Parse a podcast feed that has already been downloaded
#   PARAMETERS: $feed_url       URL of the feed previously downloaded
#               $feed_content   String containing the content of the feed, for
#                               parsing
#      RETURNS: An XML::Feed object containing the parsed feed or undef if the
#               parse failed
#  DESCRIPTION: Hands the downloaded content to XML::Feed. On failure a
#               warning (including XML::Feed's error string and the feed URL)
#               is issued and undef is returned.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub parseFeed {
    my ( $feed_url, $feed_content ) = @_;

    my $feed = XML::Feed->parse( \$feed_content );
    return $feed if $feed;

    #
    # Something went wrong. Abort this feed
    #
    warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
    return; # undef
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: storeFeed
#      PURPOSE: Stores feed attributes in a hash
#   PARAMETERS: $feed           XML::Feed object returned from parsing the
#                               feed
#               $uridata        hashref containing data for this URI
#      RETURNS: Nothing
#  DESCRIPTION: Copies the interesting parts of the parsed feed (title,
#               format, link, description, author, modification date, image,
#               copyright, generator, language) into the hash referenced by
#               $uridata, along with the list of enclosures and their count.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub storeFeed {
    my ($feed, $uridata) = @_;

    # Store title and format with leading/trailing whitespace stripped.
    # NOTE(review): if either accessor returns undef the s/// will raise an
    # "uninitialized value" warning — confirm feeds always provide these
    ( $uridata->{TITLE}  = $feed->title ) =~ s/(^\s+|\s+$)//g;
    ( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;

    # Remaining attributes are stored as-is, apart from the description
    # (cleaned), the modification date (normalised) and the image (flattened)
    $uridata->{LINK}        = $feed->link;
    $uridata->{DESCRIPTION} = clean_string( $feed->tagline );
    $uridata->{AUTHOR}      = $feed->author;
    $uridata->{MODIFIED}    = normaliseDT( $feed->modified );
    $uridata->{IMAGE}       = flattenArray( $feed->image );
    $uridata->{COPYRIGHT}   = $feed->copyright;
    $uridata->{GENERATOR}   = $feed->generator;
    $uridata->{LANGUAGE}    = $feed->language;
    #print coalesce($feed->webMaster,'No webMaster'),"\n";

    # Collect the enclosures (media attachments) and record how many we found
    $uridata->{ENCLOSURES} = extractEnclosures($feed);
    $uridata->{ENCLOSURE_COUNT}
        = scalar( @{ $uridata->{ENCLOSURES} } );

}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
#         NAME: checkCopyright
#      PURPOSE: Ask the user to check the copyright of a feed
#   PARAMETERS: $uridata        hashref containing data for this URI
#      RETURNS: True (1) if the user accepted the feed, false (0) otherwise
#  DESCRIPTION: Shows the feed's title and copyright string on STDERR, then
#               interactively asks the user whether the feed should be added.
#               The decision is stored in $uridata->{SAVE} and also returned.
#               Any error during the prompt is treated as a 'no'.
#       THROWS: No exceptions
#     COMMENTS: Uses the file globals $LOG and $silent.
#     SEE ALSO: N/A
#===============================================================================
sub checkCopyright {
    my ($uridata) = @_;

    my $decision;
    $LOG->info('Checking copyright of feed');

    #
    # Prompt the user, failing gracefully if there's
    # a problem. If the user types 'Y' or 'y' we accept the
    # feed, otherwise we do not (thus a blank return = 'no').
    #
    try {
        printf STDERR
            "Feed '%s' has the copyright string:\n%s\n",
            $uridata->{TITLE},
            coalesce( $uridata->{COPYRIGHT}, '' );
        # NOTE(review): '-yes' presumably makes prompt() return true only on
        # a y/Y answer — confirm against the IO::Prompter documentation
        $decision = prompt(
            -in     => *STDIN,
            -prompt => 'OK to add this feed?',
            -style  => 'bold red underlined',
            -yes
        );
    }
    catch {
        # Any failure in the prompt machinery defaults to rejection
        warn "Problem processing copyright decision: $_";
        $decision = 0;
    };

    #
    # Take action on the decision (or default)
    #
    $uridata->{SAVE} = $decision;
    if ($decision) {
        emit( $silent, "Feed added\n" );
        $LOG->info('Copyright OK');
        return 1;
    }
    else {
        emit( $silent, "Feed not added\n" );
        $LOG->info('Copyright not OK');
        return 0;
    }
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: parseRSS
 | 
						|
#      PURPOSE: Attempt to parse a feed as RSS
 | 
						|
#   PARAMETERS: $feed_url       URL of the feed previously downloaded
 | 
						|
#               $feed_content   String containing the content of the feed, for
 | 
						|
#                               parsing
 | 
						|
#      RETURNS: An XML::RSS object containing the parsed feed or undef if the
 | 
						|
#               parse failed
 | 
						|
#  DESCRIPTION: ** Incomplete **
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub parseRSS {
    my ( $feed_url, $feed_content ) = @_;

    #
    # XML::RSS->parse dies on malformed XML, so trap any failure in an
    # eval and return undef (as documented) rather than letting the
    # exception propagate to the caller.
    #
    my $rss = eval { XML::RSS->parse( \$feed_content ) };
    if ( $@ || !$rss ) {
        warn "Failed to parse $feed_url as RSS: $@" if $@;
        return;
    }

    #
    # Return the parsed feed object (the original fell off the end of
    # the sub and returned nothing useful).
    #
    return $rss;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: addURI
 | 
						|
#      PURPOSE: Adds the data for a URI to the database
 | 
						|
#   PARAMETERS: $dbh            database handle
 | 
						|
#               $uridata        hashref containing data for the current URI
 | 
						|
#               $keymap         hashref containing a map of key names to
 | 
						|
#                               database field names
 | 
						|
#      RETURNS: True (1) if the insert succeeded, false (0) otherwise
 | 
						|
#  DESCRIPTION: The hash keys are defined as an array to make it easy to slice
 | 
						|
#               the hash and the SQL is defined internally using the size of
 | 
						|
#               the key array as a guide to the number of '?' placeholders.
 | 
						|
#               These are passed to execSQL to do the work.
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub addURI {
    my ( $dbh, $uridata, $keymap ) = @_;

    #
    # The hash keys to store, in the order their placeholders are bound
    #
    my @keys = qw(
        URI DNS HOSTUP HTTP_STATUS HTTP_CONTENT_TYPE TYPE FORMAT TITLE
        DESCRIPTION AUTHOR MODIFIED LINK IMAGE COPYRIGHT GENERATOR LANGUAGE
    );

    #
    # Map the keys to their database column names via a hash slice of
    # $keymap, and build one '?' placeholder per column
    #
    my $columns      = join( ",", @{$keymap}{@keys} );
    my $placeholders = join( ',', ('?') x scalar(@keys) );

    my $sql = "INSERT INTO urls ($columns) VALUES($placeholders)";

    print "addURI query: $sql\n" if $DEBUG > 0;

    #
    # Bind the corresponding values from $uridata (hash slice, same key
    # order as the columns) and let execSQL do the work
    #
    return execSQL( $dbh, $sql, @{$uridata}{@keys} );
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: extractEnclosures
 | 
						|
#      PURPOSE: Builds an array of hashes containing enclosure data from an
 | 
						|
#               XML::Feed object
 | 
						|
#   PARAMETERS: $feed           XML::Feed object
 | 
						|
#      RETURNS: A reference to the array of anonymous hashes built from the
 | 
						|
#               feed.
 | 
						|
#  DESCRIPTION: The XML::Feed object is expected to contain an array of
 | 
						|
#               entries. These are converted to hashes, references to which
 | 
						|
#               are stored in an array. The two DateTime components are
 | 
						|
#               converted to ISO8601 dates. If there is an enclosure then its
 | 
						|
#               elements are saved.  Note that there could be multiple
 | 
						|
#               enclosures, but XML::Feed does not cater for them unless
 | 
						|
#               explicitly requested. We do not deal with such a case here.
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub extractEnclosures {
    my ($feed) = @_;

    my @episodes;

    for my $item ( $feed->entries ) {
        #
        # Collect the simple attributes of this feed entry, normalising
        # the two DateTime values to ISO8601/SQLite-style strings
        #
        my %record = (
            title    => $item->title,
            base     => $item->base,
            link     => $item->link,
            category => join( ", ", $item->category ),
            author   => $item->author,
            id       => $item->id,
            issued   => normaliseDT( $item->issued ),
            modified => normaliseDT( $item->modified ),
        );

        #
        # Work around a bug in XML::Feed where the source method is only
        # available for Atom feeds. TODO report this.
        #
        $record{source} = $item->source
            if $item->isa('XML::Feed::Entry::Format::Atom');

        #
        # Record the enclosure, if any. There could be several, but
        # XML::Feed only returns multiples when explicitly asked, and we
        # do not deal with that case here.
        #
        my ($enc) = $item->enclosure;
        if ( defined($enc) ) {
            $record{url}    = $enc->url;
            $record{type}   = $enc->type;
            $record{length} = $enc->length;
        }

        push( @episodes, \%record );
    }

    return \@episodes;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: addEnclosures
 | 
						|
#      PURPOSE: Adds episodes extracted from a feed into the database
 | 
						|
#   PARAMETERS: $dbh            database handle
 | 
						|
#               $uridata        hashref containing data for the current URI
 | 
						|
#                               including an arrayref of hashrefs of episode
 | 
						|
#                               data
 | 
						|
#      RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
 | 
						|
#  DESCRIPTION: The SQL is defined internally and the hash keys are defined as
 | 
						|
#               an array to make it easy to slice the hash. The enclosures (or
 | 
						|
#               more correctly, feed items) are present in the hash as an
 | 
						|
#               array of anonymous hashes. These are processed one at a time
 | 
						|
#               and inserted into the database. A count of the number of
 | 
						|
#               successful inserts is kept. This is compared with the number
 | 
						|
#               of enclosures to determine the boolean value to return.
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub addEnclosures {
    my ( $dbh, $uridata ) = @_;

    my $sql = q{INSERT INTO episodes
        (urls_id, link, enclosure, title, author, category, source, ep_id,
        issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};

    #
    # Episode hash keys, in the same order as the columns above (after
    # the leading urls_id, which comes from the parent URI record)
    #
    my @keys = (
        'link', 'url',    'title',    'author', 'category', 'source',
        'id',   'issued', 'modified', 'length', 'type'
    );

    my $inserted = 0;

    #
    # Insert each enclosure in turn, counting the successes and
    # reporting any failures as we go
    #
    for my $episode ( @{ $uridata->{ENCLOSURES} } ) {
        my $ok = execSQL( $dbh, $sql, $uridata->{URI_ID}, @{$episode}{@keys} );
        if ($ok) {
            $inserted++;
        }
        else {
            emit( $silent, "Failed to add episode $episode->{url}\n" );
        }
    }

    #
    # True only if every enclosure was stored
    #
    return ( $inserted == $uridata->{ENCLOSURE_COUNT} );
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: updateURI
 | 
						|
#      PURPOSE: Compare the data in a hash with that in the database
 | 
						|
#   PARAMETERS: $dbh            database handle
 | 
						|
#               $uridata        hashref containing data for the current URI
 | 
						|
#                               including an arrayref of hashrefs of episode
 | 
						|
#                               data
 | 
						|
#               $keymap         hashref containing a map of key names to
 | 
						|
#                               database field names
 | 
						|
#      RETURNS: The number of changes made
 | 
						|
#  DESCRIPTION:
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub updateURI {
    my ( $dbh, $uridata, $keymap ) = @_;

    my ( $sql1, $sth1, $rv1, $h1 );
    my ( %fieldvals, %where );
    my ( $diffs, $updates ) = ( 0, 0 );
    my @keys = (
        'URI',               'DNS',
        'HOSTUP',            'HTTP_STATUS',
        'HTTP_CONTENT_TYPE', 'TYPE',
        'FORMAT',            'TITLE',
        'DESCRIPTION',       'AUTHOR',
        'MODIFIED',          'LINK',
        'IMAGE',             'COPYRIGHT',
        'GENERATOR',         'LANGUAGE',
    );

    #
    # Get the row from the urls table
    #
    $sql1 = q{SELECT * FROM urls WHERE url = ?};
    $sth1 = $dbh->prepare($sql1);
    $rv1  = $sth1->execute( $uridata->{URI} );
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    $h1 = $sth1->fetchrow_hashref;

    #
    # If the URL is not in the database there is nothing to compare
    # against (and dereferencing $h1 below would die), so report the
    # fact and return zero changes
    #
    unless ( defined($h1) ) {
        warn "No urls row found for $uridata->{URI}; nothing to update\n";
        return 0;
    }

    #
    # Compare each feed value with the stored row. Note that %fieldvals
    # must be keyed by the *database column name* (via $keymap), not the
    # internal key name, so that SQL::Abstract builds an UPDATE against
    # real columns (consistent with the column mapping used in addURI)
    #
    for my $key (@keys) {
        unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
            $diffs++;
            $fieldvals{ $keymap->{$key} } = $uridata->{$key};

            #
            # Temporary report
            #
            print "Difference: ($key)\n";
            print "  Feed:     ", coalesce($uridata->{$key},''), "\n";
            print "  Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n";
        }
    }

    if ($diffs > 0) {
        #
        # Prepare to use SQL::Abstract
        #
        my $sql = SQL::Abstract->new();

        #
        # Build the update statement, targeting the row by primary key
        #
        $where{id} = $h1->{id};
        my ( $stmt, @bindvals )
            = $sql->update( 'urls', \%fieldvals, \%where );

        # Temporary
        print "Statement: $stmt\n";
        print "Bind values: ", join(",",@bindvals),"\n";

        #
        # Perform the updates
        #
        $sth1 = $dbh->prepare($stmt);
        $sth1->execute(@bindvals);
        if ( $dbh->err ) {
            warn "Processing $h1->{url}\n", $dbh->errstr;
        }
        else {
            emit($silent, "Updated $h1->{url}\n");
            $updates++;
        }

    }

    return $updates;

}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: updateEnclosures
 | 
						|
#      PURPOSE: Update the enclosures stored with a feed URL
 | 
						|
#   PARAMETERS: $dbh            database handle
 | 
						|
#               $uridata        hashref containing data for the current URI
 | 
						|
#                               including an arrayref of hashrefs of episode
 | 
						|
#                               data
 | 
						|
#      RETURNS: The number of changes made
 | 
						|
#  DESCRIPTION: 
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub updateEnclosures {
    my ( $dbh, $uridata ) = @_;

    my ( $sql1, $sth1, $rv1, $h1 );
    my ( %fieldvals, %where );
    my ( $diffs, $updates ) = ( 0, 0 );

    #
    # Get the enclosures associated with this feed. The parent 'urls'
    # row is located by its URL to match the bind value below (the
    # original selected on 'title' while binding the URI, so it could
    # never find the right row)
    #
    $sql1 = q{
        SELECT * FROM episodes
        WHERE urls_id = (SELECT id FROM urls WHERE url = ?)
    };
    $sth1 = $dbh->prepare($sql1);
    $rv1  = $sth1->execute( $uridata->{URI} );
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    $h1 = $sth1->fetchrow_hashref;

    #
    # TODO: the comparison and update of the stored episodes is not yet
    # implemented, so no changes are ever made here
    #
    return $updates;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: equal
 | 
						|
#      PURPOSE: Compare two strings even if undefined
 | 
						|
#   PARAMETERS: $s1             The first string
 | 
						|
#               $s2             The second string
 | 
						|
#      RETURNS: True if both strings are undefined, false if one isn't
 | 
						|
#               defined, otherwise the result of comparing them.
 | 
						|
#  DESCRIPTION: Works on the principle that two undefined strings are equal,
 | 
						|
#               a defined and an undefined string are not, and otherwise they
 | 
						|
#               are equal if they are equal!
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO:
 | 
						|
#===============================================================================
 | 
						|
sub equal {
    my ( $first, $second ) = @_;

    my $first_defined  = defined $first;
    my $second_defined = defined $second;

    # Two undefined strings are equal; a defined and an undefined
    # string are not; otherwise compare them as strings
    return 1 if ( !$first_defined && !$second_defined );
    return 0 unless ( $first_defined && $second_defined );

    return ( $first eq $second );
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: clean_string
 | 
						|
#      PURPOSE: Clean a string of non-printables, newlines, multiple spaces
 | 
						|
#   PARAMETERS: $str            The string to process
 | 
						|
#      RETURNS: The processed string
 | 
						|
#  DESCRIPTION: Removes leading and trailing spaces. Removes all non-printable
 | 
						|
#               characters. Removes all CR/LF sequences. Removes multiple
 | 
						|
#               spaces.
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO:
 | 
						|
#===============================================================================
 | 
						|
sub clean_string {
    my ($str) = @_;

    # Be defensive about undefined input rather than warning and
    # mutating an undef
    return $str unless defined($str);

    # Remove leading and trailing whitespace
    $str =~ s/(^\s+|\s+$)//g;

    # Turn CR/LF into spaces *before* stripping non-printables, so line
    # breaks become word separators rather than vanishing
    $str =~ tr/\x0A\x0D/ /;

    # Remove remaining non-printable characters. The original used
    # tr/[[:graph:]]//c, but tr lists are literal characters (POSIX
    # classes are not honoured) and without /d nothing was deleted, so
    # non-printables were left in place
    $str =~ s/[^[:print:]]//g;

    # Squeeze runs of spaces down to a single space
    $str =~ tr/ / /s;

    return $str;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: normaliseDT
 | 
						|
#      PURPOSE: Normalise an ISO8601 date for comparison, etc.
 | 
						|
#   PARAMETERS: $dt             a DateTime object
 | 
						|
#      RETURNS: The DateTime object formatted as an ISO8601 string
 | 
						|
#  DESCRIPTION:
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub normaliseDT {
    my ($dt) = @_;

    # Deliberately 'return undef' rather than a bare 'return' so that
    # callers in list context still receive a single element, matching
    # the original ternary's ': undef' branch
    return undef unless defined($dt);

    # Parse the incoming ISO8601 date and re-format it in SQLite style
    my $formatter = DateTime::Format::SQLite->new();
    my $parsed    = DateTime::Format::ISO8601->parse_datetime($dt);

    return $formatter->format_datetime($parsed);
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: flattenArray
 | 
						|
#      PURPOSE: Turns an arrayref into a simple list in a string
 | 
						|
#   PARAMETERS: $item - the item that may be an arrayref
 | 
						|
#      RETURNS: The plain item if it's not an array otherwise the flattened
 | 
						|
#               version
 | 
						|
#  DESCRIPTION:
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub flattenArray {
    my ($item) = @_;

    #
    # Join an arrayref's elements into a comma-separated string;
    # anything else passes through unchanged
    #
    my $result;
    if ( ref($item) eq 'ARRAY' ) {
        $result = join( ", ", @{$item} );
    }
    else {
        $result = $item;
    }

    #
    # Return the computed result (the original returned $item, so the
    # flattened string was thrown away)
    #
    return $result;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: coalesce
 | 
						|
#      PURPOSE: To find the defined argument and return it
 | 
						|
#   PARAMETERS: Arbitrary number of arguments
 | 
						|
#      RETURNS: The first defined argument
 | 
						|
#  DESCRIPTION:
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub coalesce {
    # Scan the arguments in order and hand back the first one that is
    # defined; an empty 'return' yields undef if none are
    for my $candidate (@_) {
        return $candidate if defined $candidate;
    }
    return;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: emit
 | 
						|
#      PURPOSE: Print text on STDERR unless silent mode has been selected
 | 
						|
#   PARAMETERS: - Boolean indicating whether to be silent or not
 | 
						|
#               - list of arguments to 'print'
 | 
						|
#      RETURNS: Nothing
 | 
						|
#  DESCRIPTION: This is a wrapper around 'print' to determine whether to send
 | 
						|
#               a message to STDERR depending on a boolean. We need this to be
 | 
						|
#               able to make the script silent when the -silent option is
 | 
						|
#               selected
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub emit {
    # First argument is the silent flag; the rest is the message.
    # Print to STDERR only when not silenced.
    my $quiet = shift;
    print STDERR @_ unless $quiet;
}
 | 
						|
 | 
						|
#===  FUNCTION  ================================================================
 | 
						|
#         NAME: Options
 | 
						|
#      PURPOSE: Processes command-line options
 | 
						|
#   PARAMETERS: $optref     Hash reference to hold the options
 | 
						|
#      RETURNS: Undef
 | 
						|
#  DESCRIPTION:
 | 
						|
#       THROWS: No exceptions
 | 
						|
#     COMMENTS: None
 | 
						|
#     SEE ALSO: N/A
 | 
						|
#===============================================================================
 | 
						|
sub Options {
    my ($optref) = @_;

    #
    # Recognised options; results land in the hash referenced by
    # $optref, keyed by option name
    #
    my @options = (
        "help",   "manpage",  "debug=i", "silent!",
        "load=s", "delete=s", "scan!",   "report!",
        "check!", "json:s",   "opml:s",  "config=s",
        "out=s",  "template:s",
    );

    #
    # On a parse failure show brief usage (with the version) and exit
    #
    GetOptions( $optref, @options )
        or pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );

    return;
}
 | 
						|
 | 
						|
__END__
 | 
						|
 | 
						|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | 
						|
#  Application Documentation
 | 
						|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | 
						|
#{{{
 | 
						|
 | 
						|
 | 
						|
=head1 NAME
 | 
						|
 | 
						|
feedWatcher - watch a collection of podcast feeds
 | 
						|
 | 
						|
=head1 VERSION
 | 
						|
 | 
						|
This documentation refers to I<feedWatcher> version 0.0.14
 | 
						|
 | 
						|
 | 
						|
=head1 USAGE
 | 
						|
 | 
						|
 feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
 | 
						|
    [-[no]check] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
 | 
						|
    [-[no]silent] [-config=FILE] [-debug=N] [URL ...]
 | 
						|
 | 
						|
=head1 ARGUMENTS
 | 
						|
 | 
						|
Arguments are optional and may consist of an arbitrarily long list of URLs to
 | 
						|
be processed and possibly added to the database by the script.
 | 
						|
 | 
						|
These URLs are prepended to any which may be provided through the
 | 
						|
B<-load=FILE> option described below. The script makes sure the URL collection
 | 
						|
contains no duplicates.
 | 
						|
 | 
						|
=head1 OPTIONS
 | 
						|
 | 
						|
=over 8
 | 
						|
 | 
						|
=item B<-help>
 | 
						|
 | 
						|
Prints a brief help message describing the usage of the program, and then exits.
 | 
						|
 | 
						|
The full documentation can be displayed with the command:
 | 
						|
 | 
						|
    perldoc -oman feedWatcher
 | 
						|
 | 
						|
Alternatively, use the option B<-manpage>
 | 
						|
 | 
						|
=item B<-manpage>
 | 
						|
 | 
						|
Prints the entire documentation for the script. Note that this mode uses a
 | 
						|
simpler type of formatter than I<perldoc>.
 | 
						|
 | 
						|
=item B<-load=FILE>
 | 
						|
 | 
						|
Defines a file from which new URLs are to be added to the database. These URLs
 | 
						|
are checked in various ways before adding to the database. If arguments are
 | 
						|
provided when the script is invoked these URLs are appended to the argument list.
 | 
						|
 | 
						|
=item B<-delete=FILE>
 | 
						|
 | 
						|
Defines a file from which a list of URLs is to be read which are to be
 | 
						|
deleted from the database.
 | 
						|
 | 
						|
Note that it is possible (though inadvisable) to both add and delete an URL in
 | 
						|
the same run of the script. The URL will first be added (from the load file or
 | 
						|
argument list) then deleted. This is a pointless exercise which wastes
 | 
						|
bandwidth, so don't do it!
 | 
						|
 | 
						|
=item B<-[no]scan>
 | 
						|
 | 
						|
This option (B<-scan>) causes the URLs stored in the database to be scanned
 | 
						|
and updated. The negated form, which is also the default behaviour of the
 | 
						|
script, (B<-noscan>) omits the scan.
 | 
						|
 | 
						|
NOTE: This function is not implemented yet.
 | 
						|
 | 
						|
=item B<-out=FILE>
 | 
						|
 | 
						|
This option defines an output file to receive any output. If the option is
 | 
						|
omitted the data is written to STDOUT, allowing it to be redirected if
 | 
						|
required.
 | 
						|
 | 
						|
=item B<-[no]check>
 | 
						|
 | 
						|
This option (B<-check>) causes each feed which is being added to be checked
with the script user to confirm that it's OK to add it. The script reports the
I<copyright> field and requests a I<y> or I<n> response.
 | 
						|
 | 
						|
=item B<-[no]report>
 | 
						|
 | 
						|
This option (B<-report>) causes a report of the contents of the database to be
 | 
						|
generated.  The negated form, which is also the default behaviour of the
 | 
						|
script, (B<-noreport>) omits the report.
 | 
						|
 | 
						|
NOTE: The report is currently very simple.
 | 
						|
 | 
						|
=item B<-json[=FILE]>
 | 
						|
 | 
						|
This option, which may be omitted, defines the location where the feed and
 | 
						|
episode details are to be written. If omitted no JSON data is written.
 | 
						|
 | 
						|
If the option is given as B<-json=FILE> the data is written to the nominated
 | 
						|
file.
 | 
						|
 | 
						|
If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
 | 
						|
used.
 | 
						|
 | 
						|
=item B<-opml[=FILE]>
 | 
						|
 | 
						|
This option, which may be omitted, defines the location where the feed details
 | 
						|
are to be written. If omitted no OPML data is written.
 | 
						|
 | 
						|
If the option is given as B<-opml=FILE> the data is written to the nominated
 | 
						|
file.
 | 
						|
 | 
						|
If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
 | 
						|
used.
 | 
						|
 | 
						|
=item B<-template=FILE>
 | 
						|
 | 
						|
This option defines the template used to generate a form of the feed data. The
 | 
						|
template is written using the B<Template> toolkit language.
 | 
						|
 | 
						|
If the file name is omitted then the script uses the file B<feedWatcher.tpl>
 | 
						|
in the same directory as the script. If this file does not exist then the
 | 
						|
script will exit with an error message.
 | 
						|
 | 
						|
For convenience B<feedWatcher.tpl> can be made a soft link which points to the
 | 
						|
file which is the current default. This allows the development of versions
 | 
						|
without changing the usual way this script is run.
 | 
						|
 | 
						|
=item B<-config=FILE>
 | 
						|
 | 
						|
This option allows an alternative configuration file to be used. This file
 | 
						|
defines the location of the database, its name and the username and
 | 
						|
password to be used to access it. This feature permits a test database to be
 | 
						|
used, or for two or more sets of feeds to be processed.
 | 
						|
 | 
						|
See the CONFIGURATION AND ENVIRONMENT section below for the file format.
 | 
						|
 | 
						|
If the option is omitted the default file is used: B<feedWatcher.cfg>
 | 
						|
 | 
						|
=item B<-debug=N>
 | 
						|
 | 
						|
This option selects the debug level, resulting in a lot of output.
 | 
						|
 | 
						|
    0   (the default) No debug output
 | 
						|
    1   Dumps the list of feeds found in an HTML download.
 | 
						|
        Also shows the SQL query which will result in the insertion of a new
 | 
						|
        row into the database table I<urls>.
 | 
						|
    2   Dumps the collected data which is destined to be written to the database
 | 
						|
    3   Dumps the contents of feeds during analysis
 | 
						|
 | 
						|
=back
 | 
						|
 | 
						|
=head1 DESCRIPTION
 | 
						|
 | 
						|
A full description of the application and its features.
 | 
						|
May include numerous subsections (i.e. =head2, =head3, etc.)
 | 
						|
 | 
						|
 | 
						|
=head1 DIAGNOSTICS
 | 
						|
 | 
						|
A list of every error and warning message that the application can generate
 | 
						|
(even the ones that will "never happen"), with a full explanation of each
 | 
						|
problem, one or more likely causes, and any suggested remedies. If the
 | 
						|
application generates exit status codes (e.g. under Unix) then list the exit
 | 
						|
status associated with each error.
 | 
						|
 | 
						|
 | 
						|
=head1 CONFIGURATION AND ENVIRONMENT
 | 
						|
 | 
						|
A full explanation of any configuration system(s) used by the application,
 | 
						|
including the names and locations of any configuration files, and the
 | 
						|
meaning of any environment variables or properties that can be set. These
 | 
						|
descriptions must also include details of any configuration language used
 | 
						|
 | 
						|
 | 
						|
=head1 DEPENDENCIES
 | 
						|
 | 
						|
    Config::General
 | 
						|
    DBI
 | 
						|
    Data::Dumper
 | 
						|
    DateTime::Format::ISO8601
 | 
						|
    DateTime::Format::SQLite
 | 
						|
    Feed::Find
 | 
						|
    Getopt::Long
 | 
						|
    HTML::Entities
 | 
						|
    IO::Socket
 | 
						|
    JSON
 | 
						|
    LWP::UserAgent
 | 
						|
    List::MoreUtils
 | 
						|
    Log::Handler
 | 
						|
    Net::DNS
 | 
						|
    Pod::Usage
 | 
						|
    Template
 | 
						|
    Template::Filters
 | 
						|
    URI
 | 
						|
    WWW::RobotRules
 | 
						|
    XML::Feed
 | 
						|
    XML::RSS::Parser
 | 
						|
 | 
						|
=head1 INCOMPATIBILITIES
 | 
						|
 | 
						|
A list of any modules that this module cannot be used in conjunction with.
 | 
						|
This may be due to name conflicts in the interface, or competition for
 | 
						|
system or program resources, or due to internal limitations of Perl
 | 
						|
(for example, many modules that use source code filters are mutually
 | 
						|
incompatible).
 | 
						|
 | 
						|
 | 
						|
=head1 BUGS AND LIMITATIONS
 | 
						|
 | 
						|
There are no known bugs in this module.
 | 
						|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
 | 
						|
Patches are welcome.
 | 
						|
 | 
						|
=head1 AUTHOR
 | 
						|
 | 
						|
Dave Morriss (Dave.Morriss@gmail.com) 2020
 | 
						|
 | 
						|
=head1 LICENCE AND COPYRIGHT
 | 
						|
 | 
						|
Copyright (c) Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
 | 
						|
 | 
						|
This program is free software. You can redistribute it and/or modify it under
 | 
						|
the same terms as perl itself.
 | 
						|
 | 
						|
=cut
 | 
						|
 | 
						|
#}}}
 | 
						|
 | 
						|
# [zo to open fold, zc to close]
 | 
						|
 | 
						|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 |