forked from HPR/hpr-tools
		
	
		
			
	
	
		
			2196 lines
		
	
	
		
			66 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			2196 lines
		
	
	
		
			66 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| 
								 | 
							
								#!/usr/bin/env perl
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								#         FILE: feedWatcher
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								#        USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
							 | 
						||
| 
								 | 
							
								#               [-[no]report] [-[no]check] [-out=FILE] [-json[=FILE]]
							 | 
						||
| 
								 | 
							
								#               [-opml[=FILE]] [-template[=FILE]] [-[no]silent] [-debug=N]
							 | 
						||
| 
								 | 
							
								#               [URL ...]
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
							 | 
						||
| 
								 | 
							
								#               podcasts. Ken's script was inspired by Dann Washko's site at
							 | 
						||
| 
								 | 
							
								#               http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
							 | 
						||
| 
								 | 
							
								#               August 2012 where it was used to generate handouts.
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								#               The script has not been developed since 2014, but is now in
							 | 
						||
| 
								 | 
							
								#               use in 2020 helping to prepare podcast information for
							 | 
						||
| 
								 | 
							
								#               a FOSDEM visit under the heading of "Free Culture Podcasts".
							 | 
						||
| 
								 | 
							
								#               See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
							 | 
						||
| 
								 | 
							
								#               (created with Pandoc) in the same directory as this script for
							 | 
						||
| 
								 | 
							
								#               details of what has been done to develop the original design.
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								#      OPTIONS: ---
							 | 
						||
| 
								 | 
							
								# REQUIREMENTS: ---
							 | 
						||
| 
								 | 
							
								#         BUGS: ---
							 | 
						||
| 
								 | 
							
								#        NOTES: ---
							 | 
						||
| 
								 | 
							
								#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
							 | 
						||
| 
								 | 
							
								#      VERSION: 0.0.14
							 | 
						||
| 
								 | 
							
								#      CREATED: 2013-12-25 12:40:33
							 | 
						||
| 
								 | 
							
								#     REVISION: 2020-02-18 22:45:28
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Released under the terms of the GNU Affero General Public License (AGPLv3)
							 | 
						||
| 
								 | 
							
								# http://www.gnu.org/licenses/agpl-3.0.html
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use 5.030;
							 | 
						||
| 
								 | 
							
								use strict;
							 | 
						||
| 
								 | 
							
								use warnings;
							 | 
						||
| 
								 | 
							
								use utf8;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use Getopt::Long;
							 | 
						||
| 
								 | 
							
								use Pod::Usage;
							 | 
						||
| 
								 | 
							
								use Config::General;
							 | 
						||
| 
								 | 
							
								use List::MoreUtils qw{uniq};
							 | 
						||
| 
								 | 
							
								use Log::Handler;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use Try::Tiny;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use URI;
							 | 
						||
| 
								 | 
							
								use Net::DNS;
							 | 
						||
| 
								 | 
							
								use IO::Socket;
							 | 
						||
| 
								 | 
							
								use LWP::UserAgent;
							 | 
						||
| 
								 | 
							
								use WWW::RobotRules;
							 | 
						||
| 
								 | 
							
								use XML::RSS::Parser;
							 | 
						||
| 
								 | 
							
								use XML::Feed;
							 | 
						||
| 
								 | 
							
								use Feed::Find;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use Template;
							 | 
						||
| 
								 | 
							
								use Template::Filters;
							 | 
						||
| 
								 | 
							
								Template::Filters->use_html_entities;    # Use HTML::Entities in the template
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use HTML::Entities;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use IO::Prompter;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use JSON;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use DBI;
							 | 
						||
| 
								 | 
							
								use DateTime::Format::SQLite;
							 | 
						||
| 
								 | 
							
								use DateTime::Format::ISO8601;
							 | 
						||
| 
								 | 
							
								use SQL::Abstract;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								use Data::Dumper;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Version number (manually incremented)
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								our $VERSION = '0.0.14';
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Script name
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								( my $PROG = $0 ) =~ s|.*/||mx;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Declarations
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my ( @new_urls, @deletions );
							 | 
						||
| 
								 | 
							
								my ( $rules,    $robot_name ) = ( undef, "$PROG/$VERSION" );
							 | 
						||
| 
								 | 
							
								my ( $sth1, $h1, $rv );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								my $feeds;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# File and directory paths
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my $basedir     = "$ENV{HOME}/HPR/feed_watcher";
							 | 
						||
| 
								 | 
							
								my $tempdir     = "$basedir/tmp";
							 | 
						||
| 
								 | 
							
								my $configfile  = "$basedir/$PROG.cfg";
							 | 
						||
| 
								 | 
							
								my $logfile     = "$basedir/${PROG}.log";
							 | 
						||
| 
								 | 
							
								my $deftemplate = "$basedir/${PROG}.tpl";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Hash key map to database field names (chevron comments are to stop Perltidy
							 | 
						||
| 
								 | 
							
								# messing with the layout)
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								                                                #<<<
							 | 
						||
| 
								 | 
							
								my %keymap = (
							 | 
						||
| 
								 | 
							
								    AUTHOR            => 'author',
							 | 
						||
| 
								 | 
							
								    COPYRIGHT         => 'copyright',
							 | 
						||
| 
								 | 
							
								    DESCRIPTION       => 'description',
							 | 
						||
| 
								 | 
							
								    DNS               => 'dns',
							 | 
						||
| 
								 | 
							
								#   ENCLOSURES        => undef,
							 | 
						||
| 
								 | 
							
								#   ENCLOSURE_COUNT   => undef,
							 | 
						||
| 
								 | 
							
								    FORMAT            => 'feedformat',
							 | 
						||
| 
								 | 
							
								    GENERATOR         => 'generator',
							 | 
						||
| 
								 | 
							
								#   HOST              => undef,
							 | 
						||
| 
								 | 
							
								    HOSTUP            => 'host_up',
							 | 
						||
| 
								 | 
							
								    HTTP_CONTENT_TYPE => 'content_type',
							 | 
						||
| 
								 | 
							
								    HTTP_STATUS       => 'http_status',
							 | 
						||
| 
								 | 
							
								    IMAGE             => 'image',
							 | 
						||
| 
								 | 
							
								    LANGUAGE          => 'language',
							 | 
						||
| 
								 | 
							
								    LINK              => 'link',
							 | 
						||
| 
								 | 
							
								    MODIFIED          => 'modified',
							 | 
						||
| 
								 | 
							
								#   PORT              => undef,
							 | 
						||
| 
								 | 
							
								#   ROBOTS            => undef,
							 | 
						||
| 
								 | 
							
								#   SAVE              => undef,
							 | 
						||
| 
								 | 
							
								    TITLE             => 'title',
							 | 
						||
| 
								 | 
							
								    TYPE              => 'urltype',
							 | 
						||
| 
								 | 
							
								    URI               => 'url',
							 | 
						||
| 
								 | 
							
								#   URI_ID            => undef,
							 | 
						||
| 
								 | 
							
								);
							 | 
						||
| 
								 | 
							
								                                                #>>>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Headers for LWP::UserAgent
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my %headers = (
							 | 
						||
| 
								 | 
							
								    'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
							 | 
						||
| 
								 | 
							
								        . 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
							 | 
						||
| 
								 | 
							
								    'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
							 | 
						||
| 
								 | 
							
								        . 'image/pjpeg, image/png, */*',
							 | 
						||
| 
								 | 
							
								    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
							 | 
						||
| 
								 | 
							
								    'Accept-Language' => 'en-US',
							 | 
						||
| 
								 | 
							
								);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Enable Unicode mode
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								binmode STDOUT, ":encoding(UTF-8)";
							 | 
						||
| 
								 | 
							
								binmode STDERR, ":encoding(UTF-8)";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Options and arguments
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my %options;
							 | 
						||
| 
								 | 
							
								Options( \%options );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Any arguments are taken to be URLs
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								@new_urls = @ARGV;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Default help
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								pod2usage( -msg => "Version $VERSION\n", -exitval => 1 )
							 | 
						||
| 
								 | 
							
								    if ( $options{'help'} );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Detailed help
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1, -noperldoc => 0 )
							 | 
						||
| 
								 | 
							
								    if ( $options{'manpage'} );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Collect options
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								my $cfgfile
							 | 
						||
| 
								 | 
							
								    = ( defined( $options{config} ) ? $options{config} : $configfile );
							 | 
						||
| 
								 | 
							
								my $silent     = ( defined( $options{silent} )    ? $options{silent}    : 0 );
							 | 
						||
| 
								 | 
							
								my $loadfile   = $options{'load'};
							 | 
						||
| 
								 | 
							
								my $deletefile = $options{'delete'};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								my $scan   = ( defined( $options{scan} )   ? $options{scan}   : 0 );
							 | 
						||
| 
								 | 
							
								my $report = ( defined( $options{report} ) ? $options{report} : 0 );
							 | 
						||
| 
								 | 
							
								my $check  = ( defined( $options{check} )  ? $options{check}  : 0 );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								my $outfile  = $options{out};
							 | 
						||
| 
								 | 
							
								my $json     = $options{json};
							 | 
						||
| 
								 | 
							
								my $opml     = $options{opml};
							 | 
						||
| 
								 | 
							
								my $template = $options{template};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Check the configuration file
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Check the load file
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ($loadfile) {
							 | 
						||
| 
								 | 
							
								    die "File $loadfile does not exist\n"  unless -e $loadfile;
							 | 
						||
| 
								 | 
							
								    die "File $loadfile is not readable\n" unless -r $loadfile;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Check the delete file
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ($deletefile) {
							 | 
						||
| 
								 | 
							
								    die "File $deletefile does not exist\n"  unless -e $deletefile;
							 | 
						||
| 
								 | 
							
								    die "File $deletefile is not readable\n" unless -r $deletefile;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# We accept -json or -json=filename. In the former case we make a default
							 | 
						||
| 
								 | 
							
								# name, otherwise we use the one provided.
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ( defined($json) ) {
							 | 
						||
| 
								 | 
							
								    if ($json =~ /^$/) {
							 | 
						||
| 
								 | 
							
								        $json = "$PROG.json"
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# We accept -opml or -opml=filename. In the former case we make a default
							 | 
						||
| 
								 | 
							
								# name, otherwise we use the one provided.
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ( defined($opml) ) {
							 | 
						||
| 
								 | 
							
								    if ($opml =~ /^$/) {
							 | 
						||
| 
								 | 
							
								        $opml = "$PROG.opml"
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Similarly we accept -template or -template=filename. In the former case we
							 | 
						||
| 
								 | 
							
								# make a default name, otherwise we use the one provided.
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ( defined($template) ) {
							 | 
						||
| 
								 | 
							
								    if ($template =~ /^$/) {
							 | 
						||
| 
								 | 
							
								        $template = "$deftemplate";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    die "Error: Unable to find template $template\n" unless -r $template;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Load configuration data
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my $conf = new Config::General(
							 | 
						||
| 
								 | 
							
								    -ConfigFile      => $cfgfile,
							 | 
						||
| 
								 | 
							
								    -InterPolateVars => 1,
							 | 
						||
| 
								 | 
							
								    -ExtendedAccess  => 1
							 | 
						||
| 
								 | 
							
								);
							 | 
						||
| 
								 | 
							
								my %config = $conf->getall();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Connect to the database
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my $dbtype = $config{database}->{type};
							 | 
						||
| 
								 | 
							
								my $dbfile = $config{database}->{file};
							 | 
						||
| 
								 | 
							
								my $dbuser = $config{database}->{user};
							 | 
						||
| 
								 | 
							
								my $dbpwd  = $config{database}->{password};
							 | 
						||
| 
								 | 
							
								my $dbh
							 | 
						||
| 
								 | 
							
								    = DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
							 | 
						||
| 
								 | 
							
								    { AutoCommit => 1, sqlite_unicode => 1, } )
							 | 
						||
| 
								 | 
							
								    or die $DBI::errstr;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
							 | 
						||
| 
								 | 
							
								# foreign key constraints to function)
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								$dbh->do('PRAGMA foreign_keys = ON');
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Check we have something to do
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
							 | 
						||
| 
								 | 
							
								my $work = (
							 | 
						||
| 
								 | 
							
								           scalar(@new_urls) > 0
							 | 
						||
| 
								 | 
							
								        || defined($loadfile)
							 | 
						||
| 
								 | 
							
								        || defined($deletefile)
							 | 
						||
| 
								 | 
							
								        || ( $report
							 | 
						||
| 
								 | 
							
								        || defined($json)
							 | 
						||
| 
								 | 
							
								        || defined($opml)
							 | 
						||
| 
								 | 
							
								        || defined($template)
							 | 
						||
| 
								 | 
							
								        || $scan && $rows > 0 )
							 | 
						||
| 
								 | 
							
								);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								die "Nothing to do!\n" unless $work;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Set up logging keeping the default log layout except for the date
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								my $LOG = Log::Handler->new();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								$LOG->add(
							 | 
						||
| 
								 | 
							
								    file => {
							 | 
						||
| 
								 | 
							
								        timeformat => "%Y/%m/%d %H:%M:%S",
							 | 
						||
| 
								 | 
							
								        filename   => $logfile,
							 | 
						||
| 
								 | 
							
								        minlevel   => 0,
							 | 
						||
| 
								 | 
							
								        maxlevel   => 7,
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Open the output file (or STDOUT) - we may need the date to do it
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								my $outfh;
							 | 
						||
| 
								 | 
							
								if ($outfile) {
							 | 
						||
| 
								 | 
							
								    open( $outfh, ">:encoding(UTF-8)", $outfile )
							 | 
						||
| 
								 | 
							
								        or warn "Unable to open $outfile for writing: $!";
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								else {
							 | 
						||
| 
								 | 
							
								    open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
							 | 
						||
| 
								 | 
							
								        or warn "Unable to initialise for writing: $!";
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Set up a robot.txt rules parser
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								$rules = WWW::RobotRules->new($robot_name);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Slurp the load file if there is one
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ($loadfile) {
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Load the input file
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    open( my $in, '<:encoding(utf8)', $loadfile )
							 | 
						||
| 
								 | 
							
								        or die "$PROG : failed to open load file '$loadfile' : $!\n";
							 | 
						||
| 
								 | 
							
								    chomp( my @loaded = <$in> );
							 | 
						||
| 
								 | 
							
								    close($in)
							 | 
						||
| 
								 | 
							
								        or warn "$PROG : failed to close load file '$loadfile' : $!\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $LOG->info("Loaded URLs from $loadfile");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Add the loaded URLs to the array
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    push( @new_urls, @loaded );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Now, we either have URLs from the command line, or from the load file, so we
							 | 
						||
| 
								 | 
							
								# process these.
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if (@new_urls) {
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Remove duplicates
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    @new_urls = uniq(@new_urls);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $LOG->info( "Adding ", scalar(@new_urls), " URLs to the database" );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Load these URLs as appropriate, returning any more that we find by
							 | 
						||
| 
								 | 
							
								    # following HTML urls.
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    @new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Now process any URLs that came back. Since we are explicitly looking for
							 | 
						||
| 
								 | 
							
								    # feeds we can assume that's what we have so don't need to recurse again.
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # TODO
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Process the delete file if there is one
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ($deletefile) {
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Load the delete file
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    open( my $del, '<:encoding(utf8)', $deletefile )
							 | 
						||
| 
								 | 
							
								        or die "$PROG : failed to open load file '$deletefile' : $!\n";
							 | 
						||
| 
								 | 
							
								    chomp( @deletions = <$del> );
							 | 
						||
| 
								 | 
							
								    close($del)
							 | 
						||
| 
								 | 
							
								        or warn "$PROG : failed to close load file '$deletefile' : $!\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Remove duplicates
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    @deletions = uniq(@deletions);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $LOG->info( "Deleting ", scalar(@deletions), " URLs from the database" );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # There are URLs to delete. Process them on by one.
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    if (@deletions) {
							 | 
						||
| 
								 | 
							
								        $sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
							 | 
						||
| 
								 | 
							
								        foreach my $rec (@deletions) {
							 | 
						||
| 
								 | 
							
								            $rv = $sth1->execute($rec);
							 | 
						||
| 
								 | 
							
								            if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								                warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								            if ( $rv != 0 ) {
							 | 
						||
| 
								 | 
							
								                emit ( $silent, "Deleted $rec ($rv rows)\n" );
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								            else {
							 | 
						||
| 
								 | 
							
								                emit ( $silent, "Failed to delete $rec\n" );
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Perform a database scan if requested
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								if ($scan) {
							 | 
						||
| 
								 | 
							
								    $LOG->info( "Scan is not fully implemented yet" );
							 | 
						||
| 
								 | 
							
								    scanDB($dbh, \%keymap);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Report database contents if requested
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								if ($report) {
							 | 
						||
| 
								 | 
							
								    reportDB($dbh, $outfh);
							 | 
						||
| 
								 | 
							
								    $LOG->info( "Report generated" );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Do any image repairs we need, but temporarily
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								if ($json || $opml || $template) {
							 | 
						||
| 
								 | 
							
								    $feeds = collectData($dbh);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Clean up the image references (until we have a proper fix)
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    for (my $i = 0; $i < scalar(@$feeds); $i++) {
							 | 
						||
| 
								 | 
							
								        my $f = $feeds->[$i];
							 | 
						||
| 
								 | 
							
								        if (defined($f->{urls_image}) && $f->{urls_image} =~ /^ARRAY\([^)]+\)/) {
							 | 
						||
| 
								 | 
							
								            $f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
							 | 
						||
| 
								 | 
							
								            $LOG->info( "Fixed image for $f->{urls_url}" );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Output all feeds to a JSON file if requested
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								if ($json) {
							 | 
						||
| 
								 | 
							
								    my $js = JSON->new->utf8->canonical->pretty;
							 | 
						||
| 
								 | 
							
								    open( my $out, '>:encoding(UTF-8)', $json )
							 | 
						||
| 
								 | 
							
								        or die "Unable to open output file $json $!\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    print $out $js->encode($feeds), "\n";
							 | 
						||
| 
								 | 
							
								    close($out);
							 | 
						||
| 
								 | 
							
								    emit ( $silent, "JSON data written to $json\n" );
							 | 
						||
| 
								 | 
							
								    $LOG->info( "JSON data written to $json" );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Requesting the -opml option means to dump the entire feed table as OPML
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								if ($opml) {
							 | 
						||
| 
								 | 
							
								    require XML::OPML;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    open( my $out, '>:encoding(UTF-8)', $opml )
							 | 
						||
| 
								 | 
							
								        or die "Unable to open output file $opml $!\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Start building OPML. Mandatory attributes are 'type', 'text' and
							 | 
						||
| 
								 | 
							
								    # 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
							 | 
						||
| 
								 | 
							
								    # attribute is poorly defined; this module uses 'rss' as the type and
							 | 
						||
| 
								 | 
							
								    # 'RSS' as the value of the 'version' attribute. This is not linked to the
							 | 
						||
| 
								 | 
							
								    # type of the feed.
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $OPML = XML::OPML->new(version => '1.1');
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Create the 'head' element
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $DT = normaliseDT(DateTime->now());
							 | 
						||
| 
								 | 
							
								    $OPML->head(
							 | 
						||
| 
								 | 
							
								        title => 'Free Culture Podcasts',
							 | 
						||
| 
								 | 
							
								        dateCreated => $DT,
							 | 
						||
| 
								 | 
							
								        dateModified => $DT,
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Walk the feeds generating OPML
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    for (my $i = 0; $i < scalar(@$feeds); $i++) {
							 | 
						||
| 
								 | 
							
								        my $f = $feeds->[$i];
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        $OPML->add_outline(
							 | 
						||
| 
								 | 
							
								            title       => $f->{urls_title},
							 | 
						||
| 
								 | 
							
								            text        => $f->{urls_title},
							 | 
						||
| 
								 | 
							
								            description => $f->{urls_description},
							 | 
						||
| 
								 | 
							
								            xmlUrl      => $f->{urls_url},
							 | 
						||
| 
								 | 
							
								            htmlUrl     => $f->{urls_link},
							 | 
						||
| 
								 | 
							
								        );
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Output OPML to the nominated file
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    print $out $OPML->as_string;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    close($out);
							 | 
						||
| 
								 | 
							
								    emit ( $silent, "OPML data written to $opml\n" );
							 | 
						||
| 
								 | 
							
								    $LOG->info( "OPML data written to $opml" );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Fill and print the template if requested
							 | 
						||
| 
								 | 
							
								#-------------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								if ($template) {
							 | 
						||
| 
								 | 
							
								    my $tt = Template->new(
							 | 
						||
| 
								 | 
							
								        {   ABSOLUTE     => 1,
							 | 
						||
| 
								 | 
							
								            ENCODING     => 'utf8',
							 | 
						||
| 
								 | 
							
								            INCLUDE_PATH => $basedir,
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Make the structure the template needs
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $vars = {
							 | 
						||
| 
								 | 
							
								        feeds => $feeds,
							 | 
						||
| 
								 | 
							
								    };
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#   print Dumper($vars),"\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $document;
							 | 
						||
| 
								 | 
							
								    $tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
							 | 
						||
| 
								 | 
							
								        || die $tt->error(), "\n";
							 | 
						||
| 
								 | 
							
								    print $outfh $document;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    emit ( $silent, "Data processed with $template\n" );
							 | 
						||
| 
								 | 
							
								    $LOG->info( "Data processed with $template" );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#$dbh->disconnect;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								exit;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: loadUrls
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To load URLs read from the input file into the database
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $new_urls       arrayref containing URLs
							 | 
						||
| 
								 | 
							
								#               $rules          WWW::RobotRules object
							 | 
						||
| 
								 | 
							
								#               $keymap         hashref containing a map of key names to
							 | 
						||
| 
								 | 
							
								#                               database field names
							 | 
						||
| 
								 | 
							
								#      RETURNS: Currently nothing. The plan is to return any new URLs
							 | 
						||
| 
								 | 
							
								#               discovered by investigating non-feed URLs.
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub loadUrls {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $new_urls, $rules, $keymap ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my ( $stream, $feed );
							 | 
						||
| 
								 | 
							
								    my %uridata;
							 | 
						||
| 
								 | 
							
								    my $roboturl;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Work through the list of URLs
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    foreach my $rec (@new_urls) {
							 | 
						||
| 
								 | 
							
								        %uridata = ();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # By default save the collected data
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        $uridata{SAVE} = 1;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Check that we have a valid URL. We don't save them if they are
							 | 
						||
| 
								 | 
							
								        # invalid ($uridata{SAVE} is set to 0 in the routine).
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        my $uri = validateURI($rec,\%uridata);
							 | 
						||
| 
								 | 
							
								        next unless $uri;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Check whether we already have this URI in the database
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if (dbSearch(
							 | 
						||
| 
								 | 
							
								                $dbh, 'SELECT * FROM urls WHERE url = ?',
							 | 
						||
| 
								 | 
							
								                $uridata{URI}
							 | 
						||
| 
								 | 
							
								            )
							 | 
						||
| 
								 | 
							
								            )
							 | 
						||
| 
								 | 
							
								        {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "$uri is already in the database\n" );
							 | 
						||
| 
								 | 
							
								            $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Check the hostname resolves in DNS
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( checkDNS( $uri, \%uridata ) ) {
							 | 
						||
| 
								 | 
							
								            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
							 | 
						||
| 
								 | 
							
								            emit( $silent, "DNS: $uridata{DNS}\n" );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "$uri has no DNS entry\n" );
							 | 
						||
| 
								 | 
							
								            $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Check the server is available. Don't save if it's not (probably will
							 | 
						||
| 
								 | 
							
								        # not get the feed anyway).
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( serverUp( $uri, \%uridata ) ) {
							 | 
						||
| 
								 | 
							
								            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
							 | 
						||
| 
								 | 
							
								            $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Check the site's robots.txt. If there's a block then don't save.
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "Check of robots.txt rules succeeded\n" );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            if ( $uridata{ROBOTS} =~ /404/ ) {
							 | 
						||
| 
								 | 
							
								                emit( $silent, "Search for robots.txt file failed\n" );
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								            else {
							 | 
						||
| 
								 | 
							
								                emit( $silent, "Check of robots.txt rules failed\n" );
							 | 
						||
| 
								 | 
							
								                $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								                next;
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Look for the HTTP content type. Don't save if the request failed.
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "HTTP request OK\n" );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "HTTP request failed\n" );
							 | 
						||
| 
								 | 
							
								            $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Parse the feed
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( $uridata{TYPE} eq 'Feed' ) {
							 | 
						||
| 
								 | 
							
								            $LOG->info('Processing feed: ',$uridata{URI});
							 | 
						||
| 
								 | 
							
								            $stream = getFeed( $uridata{URI} );
							 | 
						||
| 
								 | 
							
								            if ($stream) {
							 | 
						||
| 
								 | 
							
								                $feed = parseFeed( $uridata{URI}, $stream );
							 | 
						||
| 
								 | 
							
								                unless ( $feed ) {
							 | 
						||
| 
								 | 
							
								                    $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								                    next;
							 | 
						||
| 
								 | 
							
								                }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                # Provide a means of examining the returned feed object. There
							 | 
						||
| 
								 | 
							
								                # are some weird behaviours in there (partly because of the
							 | 
						||
| 
								 | 
							
								                # weirdness of RSS and poor adherence to what standards there
							 | 
						||
| 
								 | 
							
								                # are).
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                print Dumper($feed), "\n" if ( $DEBUG > 2 );
							 | 
						||
| 
								 | 
							
								                storeFeed($feed,\%uridata);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                # Perform a check on the copyright. Routine sets
							 | 
						||
| 
								 | 
							
								                # $uridata{SAVE} = 0 if the copyright is not acceptable.
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                if ($check) {
							 | 
						||
| 
								 | 
							
								                    next unless checkCopyright(\%uridata);
							 | 
						||
| 
								 | 
							
								                }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								            else {
							 | 
						||
| 
								 | 
							
								                emit($silent, "Problem collecting feed");
							 | 
						||
| 
								 | 
							
								                $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								                next;
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        elsif ( $uridata{TYPE} eq 'HTML' ) {
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            # TODO Save the URL and find any feeds it contains
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            emit( $silent,
							 | 
						||
| 
								 | 
							
								                "Type $uridata{TYPE} not implemented yet; skipped\n" );
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent,
							 | 
						||
| 
								 | 
							
								                "Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
							 | 
						||
| 
								 | 
							
								            $uridata{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    continue {
							 | 
						||
| 
								 | 
							
								        # { This 'continue' block is executed for each iteration or 'next' }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Decide whether to save what we have collected
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( $uridata{SAVE} ) {
							 | 
						||
| 
								 | 
							
								            if ( addURI( $dbh, \%uridata, $keymap ) ) {
							 | 
						||
| 
								 | 
							
								                emit( $silent, "$uridata{URI} added to the database\n" );
							 | 
						||
| 
								 | 
							
								                $LOG->info('Feed ',$uridata{URI},' added to the database');
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                # Get the id the database allocated for the row we added
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                $uridata{URI_ID}
							 | 
						||
| 
								 | 
							
								                    = $dbh->last_insert_id( undef, undef, 'urls', undef );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                # Add any enclosures we found
							 | 
						||
| 
								 | 
							
								                #
							 | 
						||
| 
								 | 
							
								                if ( defined( $uridata{ENCLOSURE_COUNT} )
							 | 
						||
| 
								 | 
							
								                    && $uridata{ENCLOSURE_COUNT} > 0 )
							 | 
						||
| 
								 | 
							
								                {
							 | 
						||
| 
								 | 
							
								                    if ( addEnclosures( $dbh, \%uridata ) ) {
							 | 
						||
| 
								 | 
							
								                        emit( $silent, $uridata{ENCLOSURE_COUNT},
							 | 
						||
| 
								 | 
							
								                            " enclosures for $uridata{URI} added to the database\n"
							 | 
						||
| 
								 | 
							
								                        );
							 | 
						||
| 
								 | 
							
								                    }
							 | 
						||
| 
								 | 
							
								                }
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								            else {
							 | 
						||
| 
								 | 
							
								                emit( $silent,
							 | 
						||
| 
								 | 
							
								                    "$uridata{URI} was not added to the database\n" );
							 | 
						||
| 
								 | 
							
								                $LOG->info('Feed ',$uridata{URI},' not added to the database');
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Dump what we have if requested
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        print Dumper( \%uridata ), "\n" if ( $DEBUG > 1 );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        emit( $silent, '-' x 80, "\n" );
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Temporary
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    return;    # undef
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: scanDB
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To scan the URLs in the database and update the stored data
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $keymap         hashref containing a map of key names to
							 | 
						||
| 
								 | 
							
								#                               database field names
							 | 
						||
| 
								 | 
							
								#      RETURNS: Nothing
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub scanDB {
							 | 
						||
| 
								 | 
							
								    my ($dbh, $keymap) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my ( $sql1, $sth1, $rv1, $h1 );
							 | 
						||
| 
								 | 
							
								    my ( $aref, @urls, $DT, $stream, $feed );
							 | 
						||
| 
								 | 
							
								    my ( %uridata, $urichanges, $enc_changes );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Query to return all feed URLs
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    $sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' order by title};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $sth1 = $dbh->prepare($sql1);
							 | 
						||
| 
								 | 
							
								    $rv1  = $sth1->execute();
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Collect everything as an arrayref pointing to a bunch of arrayrefs
							 | 
						||
| 
								 | 
							
								    # containing the column details requested
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    $aref = $sth1->fetchall_arrayref;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Extract just the URL strings
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    @urls = map { $_->[1] } @{$aref};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Now process these URLs from the database one at a time
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    $sql1 = q{SELECT * FROM urls WHERE url = ?};
							 | 
						||
| 
								 | 
							
								    $sth1 = $dbh->prepare($sql1);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $count = 0;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    foreach my $url (@urls) {
							 | 
						||
| 
								 | 
							
								        %uridata = ();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        $count++;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Record the scan for this URL
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        $DT = normaliseDT( DateTime->now() );
							 | 
						||
| 
								 | 
							
								        $uridata{SCANNED_ON} = $DT;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Default to OK
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        $uridata{SCAN_OK} = 1;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        emit( $silent, "Scanning '$url'\n" );
							 | 
						||
| 
								 | 
							
								        $rv1 = $sth1->execute($url);
							 | 
						||
| 
								 | 
							
								        if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								            warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        $h1 = $sth1->fetchrow_hashref;
							 | 
						||
| 
								 | 
							
								        emit( $silent, $h1->{title}, "\n" );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # The URL should be valid already from the point at which it was added
							 | 
						||
| 
								 | 
							
								        # to the database, but if we do this we get a canonical version (it
							 | 
						||
| 
								 | 
							
								        # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
							 | 
						||
| 
								 | 
							
								        # it. It also sets $uridata{URI}, which is useful.)
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        my $uri = validateURI( $url, \%uridata );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Check the hostname resolves in DNS
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( checkDNS( $uri, \%uridata ) ) {
							 | 
						||
| 
								 | 
							
								            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
							 | 
						||
| 
								 | 
							
								            emit( $silent, "DNS: $uridata{DNS}\n" );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "$uri has no DNS entry\n" );
							 | 
						||
| 
								 | 
							
								            $uridata{SCAN_OK} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Check the server is available
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( serverUp( $uri, \%uridata ) ) {
							 | 
						||
| 
								 | 
							
								            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
							 | 
						||
| 
								 | 
							
								            $uridata{SCAN_OK} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Look for the HTTP content type.
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "HTTP request OK\n" );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "HTTP request failed\n" );
							 | 
						||
| 
								 | 
							
								            $uridata{SCAN_OK} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Note: not doing the robots.txt check since it was done at load time
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # We know we have a feed, so go get it
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        $stream = getFeed( $uridata{URI} );
							 | 
						||
| 
								 | 
							
								        if ($stream) {
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            # Parse the feed. The routine generates its own error messages
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            $feed = parseFeed( $uridata{URI}, $stream );
							 | 
						||
| 
								 | 
							
								            unless ( $feed ) {
							 | 
						||
| 
								 | 
							
								                $uridata{SCAN_OK} = 0;
							 | 
						||
| 
								 | 
							
								                next;
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            # Save the important feed components in the %uridata hash
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            print Dumper($feed), "\n" if ( $DEBUG > 2 );
							 | 
						||
| 
								 | 
							
								            storeFeed( $feed, \%uridata );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "Problem collecting feed" );
							 | 
						||
| 
								 | 
							
								            $uridata{SCAN_OK} = 0;
							 | 
						||
| 
								 | 
							
								            next;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    continue {
							 | 
						||
| 
								 | 
							
								        # { This 'continue' block is executed for each iteration or 'next' }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # If the scan went OK then perform a comparison between the new feed
							 | 
						||
| 
								 | 
							
								        # data and that which is stored
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( $uridata{SCAN_OK} ) {
							 | 
						||
| 
								 | 
							
								                $urichanges = updateURI( $dbh, \%uridata, $keymap );
							 | 
						||
| 
								 | 
							
								                $LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
							 | 
						||
| 
								 | 
							
								                    $urichanges )
							 | 
						||
| 
								 | 
							
								                    if $urichanges > 0;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								                # TODO Update enclosures
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "$uridata{URI} was not scanned successfully\n" );
							 | 
						||
| 
								 | 
							
								            $LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Finished scanning this feed
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        emit( $silent, '-' x 80, "\n" );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Temporarily stop after the first N feeds
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        last if $count == 2;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: reportDB
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To generate a printed report from the database
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $fh             output file handle
							 | 
						||
| 
								 | 
							
								#      RETURNS: Nothing
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: Perhaps we can get fancy with stored reports in the future
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub reportDB {
							 | 
						||
| 
								 | 
							
								    my ($dbh,  $fh) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Query to report the contents of the 'urls' table with the number of
							 | 
						||
| 
								 | 
							
								    # associated episodes
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $sql = q{
							 | 
						||
| 
								 | 
							
								        SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
							 | 
						||
| 
								 | 
							
								            FROM all_episodes ae
							 | 
						||
| 
								 | 
							
								            JOIN urls ON (ae.urls_id = urls.id)
							 | 
						||
| 
								 | 
							
								            GROUP by urls.id
							 | 
						||
| 
								 | 
							
								            HAVING ae.urls_urltype = 'Feed'
							 | 
						||
| 
								 | 
							
								    };
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $sth1 = $dbh->prepare($sql);
							 | 
						||
| 
								 | 
							
								    my $rv   = $sth1->execute();
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    while ( my $h1 = $sth1->fetchrow_hashref() ) {
							 | 
						||
| 
								 | 
							
								        foreach my $key ( sort( keys( %{$h1} ) ) ) {
							 | 
						||
| 
								 | 
							
								            printf $fh "%20s: %s\n", $key, coalesce( $h1->{$key}, 'undef' );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        print $fh "\n";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $sth1->finish;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: collectData
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Collects data from the database for generating a report
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#      RETURNS: An array of hashrefs from the query
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
							 | 
						||
| 
								 | 
							
								#               table and the 'all_episodes' view, showing the date of the
							 | 
						||
| 
								 | 
							
								#               latest episode. The result is an array of rows, each
							 | 
						||
| 
								 | 
							
								#               represented as a hash, all sorted by the feed title.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub collectData {
							 | 
						||
| 
								 | 
							
								    my ($dbh) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Query to report the contents of the 'urls' table with the details of the
							 | 
						||
| 
								 | 
							
								    # latest episode
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $sql = q{
							 | 
						||
| 
								 | 
							
								        SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
							 | 
						||
| 
								 | 
							
								            FROM all_episodes ae
							 | 
						||
| 
								 | 
							
								            JOIN urls ON (ae.urls_id = urls.id)
							 | 
						||
| 
								 | 
							
								            GROUP by urls.id
							 | 
						||
| 
								 | 
							
								            HAVING ae.urls_urltype = 'Feed'
							 | 
						||
| 
								 | 
							
								            ORDER BY ae.urls_title
							 | 
						||
| 
								 | 
							
								    };
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $sth1 = $dbh->prepare($sql);
							 | 
						||
| 
								 | 
							
								    my $rv   = $sth1->execute();
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Return everything as an arrayref of hashrefs
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $tbl_ary_ref = $sth1->fetchall_arrayref({});
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $sth1->finish;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return $tbl_ary_ref;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: countRows
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To count the rows in a table
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $sql            SQL expression to use
							 | 
						||
| 
								 | 
							
								#      RETURNS: Number of rows found (note that zero is returned as 0E0)
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub countRows {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $sql ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $sth1 = $dbh->prepare($sql);
							 | 
						||
| 
								 | 
							
								    my $rv   = $sth1->execute();
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    my $h1 = $sth1->fetch;
							 | 
						||
| 
								 | 
							
								    $sth1->finish;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return @$h1[0];
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: dbSearch
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To perform a simple search in the database
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $sql            SQL expression to use (expected to be
							 | 
						||
| 
								 | 
							
								#                               a SELECT)
							 | 
						||
| 
								 | 
							
								#               @args           arguments for the 'execute'
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if the row exists, otherwise false (0).
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
							 | 
						||
| 
								 | 
							
								#               SQL without incurring the overhead of repeated 'prepare'
							 | 
						||
| 
								 | 
							
								#               calls. Only the first row is fetched (we expect there to be
							 | 
						||
| 
								 | 
							
								#               only one) and the success or failure is determined by its
							 | 
						||
| 
								 | 
							
								#               existence.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub dbSearch {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $sql, @args ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $sth1 = $dbh->prepare_cached($sql);
							 | 
						||
| 
								 | 
							
								    my $rv   = $sth1->execute(@args);
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    my $h1 = $sth1->fetchrow_hashref();
							 | 
						||
| 
								 | 
							
								    $sth1->finish;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return defined($h1);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: execSQL
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To perform a non-SELECT query
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $sql            SQL expression to use
							 | 
						||
| 
								 | 
							
								#               @args           arguments for the 'execute'
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if the query succeeded, otherwise false (0).
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
							 | 
						||
| 
								 | 
							
								#               SQL without incurring the overhead of repeated 'prepare'
							 | 
						||
| 
								 | 
							
								#               calls.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub execSQL {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $sql, @args ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $sth1 = $dbh->prepare_cached($sql);
							 | 
						||
| 
								 | 
							
								    my $rv   = $sth1->execute(@args);
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    $sth1->finish;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return $rv;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: validateURI
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Checks a URL for validity
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $rec            the URL handed to the script
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for this URI
							 | 
						||
| 
								 | 
							
								#      RETURNS: A URI object if valid otherwise undef
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: The URL string is validated with the URI module. A canonical
							 | 
						||
| 
								 | 
							
								#               string version is stored in the hash referenced by $uridata
							 | 
						||
| 
								 | 
							
								#               hash if valid otherwise the URL is marked as invalid.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub validateURI {
							 | 
						||
| 
								 | 
							
								    my ( $rec, $uridata ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Check that we have a valid URL. We don't save them if they are
							 | 
						||
| 
								 | 
							
								    # invalid.
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $uri = URI->new( $rec, 'http' );
							 | 
						||
| 
								 | 
							
								    if ( $uri->scheme ) {
							 | 
						||
| 
								 | 
							
								        emit( $silent, "URI $uri is valid\n" );
							 | 
						||
| 
								 | 
							
								        $uridata->{URI} = $uri->canonical->as_string;
							 | 
						||
| 
								 | 
							
								        return $uri;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        emit( $silent, "URI $uri is not valid\n" );
							 | 
						||
| 
								 | 
							
								        $uridata->{SAVE} = 0;
							 | 
						||
| 
								 | 
							
								        return;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: checkDNS
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Looks up a host DNS entry
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $uri            URI object
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for this URI
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if the DNS query was successful, otherwise false (0)
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: The host name is extracted from the URI (and stored). The
							 | 
						||
| 
								 | 
							
								#               hostname is searched for in the DNS and if successful, an
							 | 
						||
| 
								 | 
							
								#               array of addresses from the 'A' records is built. This is
							 | 
						||
| 
								 | 
							
								#               stored in the hash referenced by $uridata.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub checkDNS {
							 | 
						||
| 
								 | 
							
								    my ( $uri, $uridata ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my @adds;
							 | 
						||
| 
								 | 
							
								    my $hostname = $uri->host;
							 | 
						||
| 
								 | 
							
								    $uridata->{HOST} = $hostname;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $res   = Net::DNS::Resolver->new;
							 | 
						||
| 
								 | 
							
								    my $query = $res->search($hostname);
							 | 
						||
| 
								 | 
							
								    if ($query) {
							 | 
						||
| 
								 | 
							
								        foreach my $rr ( $query->answer ) {
							 | 
						||
| 
								 | 
							
								            next unless $rr->type eq "A";
							 | 
						||
| 
								 | 
							
								            push( @adds, $rr->address );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        $uridata->{DNS} = \@adds;
							 | 
						||
| 
								 | 
							
								        return 1;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        warn "Query failed: ", $res->errorstring, "\n";
							 | 
						||
| 
								 | 
							
								        return 0;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: serverUp
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Checks that a given host is responding
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $uri            URI object
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for this URI
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if the host responds to a TCP connection, false (0)
							 | 
						||
| 
								 | 
							
								#               otherwise
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Given an URL parses out the hostname and the port (defaulting
							 | 
						||
| 
								 | 
							
								#               to the appropriate default for the scheme, such as 80 for
							 | 
						||
| 
								 | 
							
								#               http). Attempts to connect to this host and port. If the
							 | 
						||
| 
								 | 
							
								#               connect fails then details are written to the data structure
							 | 
						||
| 
								 | 
							
								#               pointed to by $uridata.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub serverUp {
							 | 
						||
| 
								 | 
							
								    my ( $uri, $uridata ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my ( $hostname, $port, $socket );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $hostname        = $uri->host();
							 | 
						||
| 
								 | 
							
								    $port            = $uri->port();
							 | 
						||
| 
								 | 
							
								    $uridata->{PORT} = $port;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $socket = IO::Socket::INET->new(
							 | 
						||
| 
								 | 
							
								        Proto    => "tcp",
							 | 
						||
| 
								 | 
							
								        PeerAddr => $hostname,
							 | 
						||
| 
								 | 
							
								        PeerPort => $port,
							 | 
						||
| 
								 | 
							
								        Reuse    => 1,
							 | 
						||
| 
								 | 
							
								        Timeout  => 10
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if ($socket) {
							 | 
						||
| 
								 | 
							
								        $socket->close;
							 | 
						||
| 
								 | 
							
								        $uridata->{HOSTUP} = 1;
							 | 
						||
| 
								 | 
							
								        return 1;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        $uridata->{HOSTUP} = 0;
							 | 
						||
| 
								 | 
							
								        return 0;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: robotRulesOK
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To check the intended URL against the site's robots.txt rules
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $uri            URI object
							 | 
						||
| 
								 | 
							
								#               $rules          WWW::RobotRules object
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for this URI
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if the GET of the robots.txt file succeeded and the
							 | 
						||
| 
								 | 
							
								#               rules allow the URI object to be fetched, false (0) otherwise.
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub robotRulesOK {
							 | 
						||
| 
								 | 
							
								    my ( $uri, $rules, $uridata ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Build the URL of robots.txt
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';
							 | 
						||
| 
								 | 
							
								    my $robots_txt;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Use LWP::UserAgent to get the feed and handle errors
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $ua = LWP::UserAgent->new;
							 | 
						||
| 
								 | 
							
								    $ua->timeout(10);
							 | 
						||
| 
								 | 
							
								    $ua->agent("$PROG/$VERSION");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $response = $ua->get($roboturl);
							 | 
						||
| 
								 | 
							
								    if ( $response->is_success ) {
							 | 
						||
| 
								 | 
							
								        $uridata->{ROBOTS} = 'Found';
							 | 
						||
| 
								 | 
							
								        $robots_txt = $response->decoded_content;
							 | 
						||
| 
								 | 
							
								        $rules->parse( $roboturl, $robots_txt );
							 | 
						||
| 
								 | 
							
								        return $rules->allowed("$uri");
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        $uridata->{ROBOTS} = $response->status_line;
							 | 
						||
| 
								 | 
							
								        warn "Failed to get $roboturl\n";
							 | 
						||
| 
								 | 
							
								        warn $response->status_line . "\n";
							 | 
						||
| 
								 | 
							
								        return; # undef
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: checkContentType
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Check the content_type of the url
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $uri            URI object
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for this URI
							 | 
						||
| 
								 | 
							
								#               $headers        hashref containing query headers
							 | 
						||
| 
								 | 
							
								#               $log            Log::Handler object
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if all was well, otherwise false (0)
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
							 | 
						||
| 
								 | 
							
								#               for the URI object using the LWP::UserAgent head method. Then
							 | 
						||
| 
								 | 
							
								#               we examine the 'content-type' header looking for the string
							 | 
						||
| 
								 | 
							
								#               'xml' or 'html' in it. The former denotes a feed, and the
							 | 
						||
| 
								 | 
							
								#               latter a normal HTML page.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub checkContentType {
							 | 
						||
| 
								 | 
							
								    my ( $uri, $uridata, $headers, $log ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my @feeds;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $uridata->{HTTP_STATUS} = 'Unknown';
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $browser = LWP::UserAgent->new or return 0;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $response = $browser->head( $uri->as_string, %{$headers} )
							 | 
						||
| 
								 | 
							
								        or return 0;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $uridata->{HTTP_STATUS} = $response->status_line;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if ( $response->is_success ) {
							 | 
						||
| 
								 | 
							
								        $uridata->{HTTP_CONTENT_TYPE} = $response->header('content-type');
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Decode the content-type we received
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( $uridata->{HTTP_CONTENT_TYPE} =~ m|xml|i ) {
							 | 
						||
| 
								 | 
							
								            $uridata->{TYPE} = 'Feed';
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        elsif ( $uridata->{HTTP_CONTENT_TYPE} =~ m|html|i ) {
							 | 
						||
| 
								 | 
							
								            $uridata->{TYPE} = 'HTML';
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            $uridata->{TYPE} = 'Unknown';
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Sometimes perfectly valid feeds misleadingly return text/html so we
							 | 
						||
| 
								 | 
							
								        # try to spot such cases here and adjust the internal type accordingly
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( $uridata->{TYPE} eq 'HTML' ) {
							 | 
						||
| 
								 | 
							
								            @feeds = Feed::Find->find( $uri->as_string );
							 | 
						||
| 
								 | 
							
								            if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
							 | 
						||
| 
								 | 
							
								                emit( $silent, "Feed found with wrong content-type\n" );
							 | 
						||
| 
								 | 
							
								                $uridata->{TYPE} = 'Feed';
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								            else {
							 | 
						||
| 
								 | 
							
								                emit( $silent, "Found ", scalar(@feeds),
							 | 
						||
| 
								 | 
							
								                    " feeds within this HTML page\n" );
							 | 
						||
| 
								 | 
							
								                print Dumper( \@feeds ), "\n" if $DEBUG > 0;
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        $log->info( "URL content classified as: ", $uridata->{TYPE} );
							 | 
						||
| 
								 | 
							
								        emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
							 | 
						||
| 
								 | 
							
								        return 1;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        $uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
							 | 
						||
| 
								 | 
							
								        return 0;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: getFeed
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Download the contents of a feed URL
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $feed_url       URL of the feed to download
							 | 
						||
| 
								 | 
							
								#      RETURNS: String representation of the feed contents or undef if the
							 | 
						||
| 
								 | 
							
								#               download failed.
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
							 | 
						||
| 
								 | 
							
								#               need not be). If successful the contents are decoded and
							 | 
						||
| 
								 | 
							
								#               returned, otherwise undef is returned.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO:
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub getFeed {
							 | 
						||
| 
								 | 
							
								    my ($feed_url) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Use LWP::UserAgent to get the feed and handle errors
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    my $ua = LWP::UserAgent->new;
							 | 
						||
| 
								 | 
							
								    $ua->timeout(10);
							 | 
						||
| 
								 | 
							
								    $ua->agent("$PROG/$VERSION");
							 | 
						||
| 
								 | 
							
								    my $response = $ua->get($feed_url);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $feed_content;
							 | 
						||
| 
								 | 
							
								    if ( $response->is_success ) {
							 | 
						||
| 
								 | 
							
								        $feed_content = $response->decoded_content;
							 | 
						||
| 
								 | 
							
								        return $feed_content;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        warn "Failed to get $feed_url\n";
							 | 
						||
| 
								 | 
							
								        warn $response->status_line, "\n";
							 | 
						||
| 
								 | 
							
								        return; # undef
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: parseFeed
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Parse a podcast feed that has already been downloaded
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $feed_url       URL of the feed previously downloaded
							 | 
						||
| 
								 | 
							
								#               $feed_content   String containing the content of the feed, for
							 | 
						||
| 
								 | 
							
								#                               parsing
							 | 
						||
| 
								 | 
							
								#      RETURNS: An XML::Feed object containing the parsed feed or undef if the
							 | 
						||
| 
								 | 
							
								#               parse failed
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub parseFeed {
							 | 
						||
| 
								 | 
							
								    my ( $feed_url, $feed_content ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $feed = XML::Feed->parse( \$feed_content );
							 | 
						||
| 
								 | 
							
								    unless ($feed) {
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Something went wrong. Abort this feed
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
							 | 
						||
| 
								 | 
							
								        return; # undef
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return $feed;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: storeFeed
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Stores feed attributes in a hash
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $feed           XML::Feed object returned from parsing the
							 | 
						||
| 
								 | 
							
								#                               feed
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for this URI
							 | 
						||
| 
								 | 
							
								#      RETURNS: Nothing
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub storeFeed {
							 | 
						||
| 
								 | 
							
								    my ($feed, $uridata) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    ( $uridata->{TITLE}  = $feed->title ) =~ s/(^\s+|\s+$)//g;
							 | 
						||
| 
								 | 
							
								    ( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $uridata->{LINK}        = $feed->link;
							 | 
						||
| 
								 | 
							
								    $uridata->{DESCRIPTION} = clean_string( $feed->tagline );
							 | 
						||
| 
								 | 
							
								    $uridata->{AUTHOR}      = $feed->author;
							 | 
						||
| 
								 | 
							
								    $uridata->{MODIFIED}    = normaliseDT( $feed->modified );
							 | 
						||
| 
								 | 
							
								    $uridata->{IMAGE}       = flattenArray( $feed->image );
							 | 
						||
| 
								 | 
							
								    $uridata->{COPYRIGHT}   = $feed->copyright;
							 | 
						||
| 
								 | 
							
								    $uridata->{GENERATOR}   = $feed->generator;
							 | 
						||
| 
								 | 
							
								    $uridata->{LANGUAGE}    = $feed->language;
							 | 
						||
| 
								 | 
							
								    #print coalesce($feed->webMaster,'No webMaster'),"\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $uridata->{ENCLOSURES} = extractEnclosures($feed);
							 | 
						||
| 
								 | 
							
								    $uridata->{ENCLOSURE_COUNT}
							 | 
						||
| 
								 | 
							
								        = scalar( @{ $uridata->{ENCLOSURES} } );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: checkCopyright
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Ask the user to check the copyright of a feed
							 | 
						||
| 
								 | 
							
								#   PARAMETERS:
							 | 
						||
| 
								 | 
							
								#      RETURNS:
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub checkCopyright {
							 | 
						||
| 
								 | 
							
								    my ($uridata) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $decision;
							 | 
						||
| 
								 | 
							
								    $LOG->info('Checking copyright of feed');
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Prompt the user, failing gracefully if there's
							 | 
						||
| 
								 | 
							
								    # a problem. If the user types 'Y' or 'y' we accept the
							 | 
						||
| 
								 | 
							
								    # feed, otherwise we do not (thus a blank return = 'no').
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    try {
							 | 
						||
| 
								 | 
							
								        printf STDERR
							 | 
						||
| 
								 | 
							
								            "Feed '%s' has the copyright string:\n%s\n",
							 | 
						||
| 
								 | 
							
								            $uridata->{TITLE},
							 | 
						||
| 
								 | 
							
								            coalesce( $uridata->{COPYRIGHT}, '' );
							 | 
						||
| 
								 | 
							
								        $decision = prompt(
							 | 
						||
| 
								 | 
							
								            -in     => *STDIN,
							 | 
						||
| 
								 | 
							
								            -prompt => 'OK to add this feed?',
							 | 
						||
| 
								 | 
							
								            -style  => 'bold red underlined',
							 | 
						||
| 
								 | 
							
								            -yes
							 | 
						||
| 
								 | 
							
								        );
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    catch {
							 | 
						||
| 
								 | 
							
								        warn "Problem processing copyright decision: $_";
							 | 
						||
| 
								 | 
							
								        $decision = 0;
							 | 
						||
| 
								 | 
							
								    };
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Take action on the decision (or default)
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    $uridata->{SAVE} = $decision;
							 | 
						||
| 
								 | 
							
								    if ($decision) {
							 | 
						||
| 
								 | 
							
								        emit( $silent, "Feed added\n" );
							 | 
						||
| 
								 | 
							
								        $LOG->info('Copyright OK');
							 | 
						||
| 
								 | 
							
								        return 1;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        emit( $silent, "Feed not added\n" );
							 | 
						||
| 
								 | 
							
								        $LOG->info('Copyright not OK');
							 | 
						||
| 
								 | 
							
								        return 0;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: parseRSS
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Attempt to parse a feed as RSS
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $feed_url       URL of the feed previously downloaded
							 | 
						||
| 
								 | 
							
								#               $feed_content   String containing the content of the feed, for
							 | 
						||
| 
								 | 
							
								#                               parsing
							 | 
						||
| 
								 | 
							
								#      RETURNS: An XML::RSS object containing the parsed feed or undef if the
							 | 
						||
| 
								 | 
							
								#               parse failed
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: ** Incomplete **
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub parseRSS {
							 | 
						||
| 
								 | 
							
								    my ( $feed_url, $feed_content ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $rss = XML::RSS->parse(\$feed_content);
							 | 
						||
| 
								 | 
							
								    unless ($rss) {
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: addURI
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Adds the data for a URI to the database
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for the current URI
							 | 
						||
| 
								 | 
							
								#               $keymap         hashref containing a map of key names to
							 | 
						||
| 
								 | 
							
								#                               database field names
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if the insert succeeded, false (0) otherwise
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: The hash keys are defined as an array to make it easy to slice
							 | 
						||
| 
								 | 
							
								#               the hash and the SQL is defined internally using the size of
							 | 
						||
| 
								 | 
							
								#               the key array as a guide to the number of '?' placeholders.
							 | 
						||
| 
								 | 
							
								#               These are passed to execSQL to do the work.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub addURI {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $uridata, $keymap ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my @keys = (
							 | 
						||
| 
								 | 
							
								        'URI',               'DNS',
							 | 
						||
| 
								 | 
							
								        'HOSTUP',            'HTTP_STATUS',
							 | 
						||
| 
								 | 
							
								        'HTTP_CONTENT_TYPE', 'TYPE',
							 | 
						||
| 
								 | 
							
								        'FORMAT',            'TITLE',
							 | 
						||
| 
								 | 
							
								        'DESCRIPTION',       'AUTHOR',
							 | 
						||
| 
								 | 
							
								        'MODIFIED',          'LINK',
							 | 
						||
| 
								 | 
							
								        'IMAGE',             'COPYRIGHT',
							 | 
						||
| 
								 | 
							
								        'GENERATOR',         'LANGUAGE',
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $sql
							 | 
						||
| 
								 | 
							
								        = 'INSERT INTO urls ('
							 | 
						||
| 
								 | 
							
								        . join( ",", @{$keymap}{@keys} ) . ') '
							 | 
						||
| 
								 | 
							
								        . 'VALUES('
							 | 
						||
| 
								 | 
							
								        . join( ',', ('?') x scalar(@keys) ) . ')';
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    print "addURI query: $sql\n" if $DEBUG > 0;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return execSQL( $dbh, $sql, @{$uridata}{@keys} );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: extractEnclosures
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Builds an array of hashes containing enclosure data from an
							 | 
						||
| 
								 | 
							
								#               XML::Feed object
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $feed           XML::Feed object
							 | 
						||
| 
								 | 
							
								#      RETURNS: A reference to the array of anonymous hashes built from the
							 | 
						||
| 
								 | 
							
								#               feed.
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: The XML::Feed object is expected to contain an array of
							 | 
						||
| 
								 | 
							
								#               entries. These are converted to hashes, references to which
							 | 
						||
| 
								 | 
							
								#               are stored in an array. The two DateTime components are
							 | 
						||
| 
								 | 
							
								#               converted to ISO8601 dates. If there is an enclosure then its
							 | 
						||
| 
								 | 
							
								#               elements are saved.  Note that there could be multiple
							 | 
						||
| 
								 | 
							
								#               enclosures, but XML::Feed does not cater for them unless
							 | 
						||
| 
								 | 
							
								#               explicitly requested. We do not deal with such a case here.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub extractEnclosures {
							 | 
						||
| 
								 | 
							
								    my ($feed) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my @encs;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    foreach my $entry ( $feed->entries ) {
							 | 
						||
| 
								 | 
							
								        my %ent;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        $ent{title}    = $entry->title;
							 | 
						||
| 
								 | 
							
								        $ent{base}     = $entry->base;
							 | 
						||
| 
								 | 
							
								        $ent{link}     = $entry->link;
							 | 
						||
| 
								 | 
							
								        $ent{category} = join( ", ", $entry->category );
							 | 
						||
| 
								 | 
							
								        $ent{author}   = $entry->author;
							 | 
						||
| 
								 | 
							
								        $ent{id}       = $entry->id;
							 | 
						||
| 
								 | 
							
								        $ent{issued}   = normaliseDT( $entry->issued );
							 | 
						||
| 
								 | 
							
								        $ent{modified} = normaliseDT( $entry->modified );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Work around a bug in XML::Feed where the source method is only
							 | 
						||
| 
								 | 
							
								        # available for Atom feeds. TODO report this.
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        if ( $entry->isa('XML::Feed::Entry::Format::Atom') ) {
							 | 
						||
| 
								 | 
							
								            $ent{source} = $entry->source;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        my ($enclosure) = $entry->enclosure;
							 | 
						||
| 
								 | 
							
								        if ( defined($enclosure) ) {
							 | 
						||
| 
								 | 
							
								            $ent{url}    = $enclosure->url;
							 | 
						||
| 
								 | 
							
								            $ent{type}   = $enclosure->type;
							 | 
						||
| 
								 | 
							
								            $ent{length} = $enclosure->length;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        push( @encs, \%ent );
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return \@encs;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: addEnclosures
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Adds episodes extracted from a feed into the database
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for the current URI
							 | 
						||
| 
								 | 
							
								#                               including an arrayref of hashrefs of episode
							 | 
						||
| 
								 | 
							
								#                               data
							 | 
						||
| 
								 | 
							
								#      RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: The SQL is defined internally and the hash keys are defined as
							 | 
						||
| 
								 | 
							
								#               an array to make it easy to slice the hash. The enclosures (or
							 | 
						||
| 
								 | 
							
								#               more correctly, feed items) are present in the hash as an
							 | 
						||
| 
								 | 
							
								#               array of anonymous hashes. These are processed one at a time
							 | 
						||
| 
								 | 
							
								#               and inserted into the database. A count of the number of
							 | 
						||
| 
								 | 
							
								#               successful inserts is kept. This is compared with the number
							 | 
						||
| 
								 | 
							
								#               of enclosures to determine the boolean value to return.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub addEnclosures {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $uridata ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $sql = q{INSERT INTO episodes
							 | 
						||
| 
								 | 
							
								        (urls_id, link, enclosure, title, author, category, source, ep_id,
							 | 
						||
| 
								 | 
							
								        issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my @keys = (
							 | 
						||
| 
								 | 
							
								        'link', 'url',    'title',    'author', 'category', 'source',
							 | 
						||
| 
								 | 
							
								        'id',   'issued', 'modified', 'length', 'type'
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $successes = 0;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    foreach my $enc ( @{ $uridata->{ENCLOSURES} } ) {
							 | 
						||
| 
								 | 
							
								        if ( execSQL( $dbh, $sql, $uridata->{URI_ID}, @{$enc}{@keys} ) ) {
							 | 
						||
| 
								 | 
							
								            $successes++;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit( $silent, "Failed to add episode $enc->{url}\n" );
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return ( $successes == $uridata->{ENCLOSURE_COUNT} );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: updateURI
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Compare the data in a hash with that in the database
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for the current URI
							 | 
						||
| 
								 | 
							
								#                               including an arrayref of hashrefs of episode
							 | 
						||
| 
								 | 
							
								#                               data
							 | 
						||
| 
								 | 
							
								#               $keymap         hashref containing a map of key names to
							 | 
						||
| 
								 | 
							
								#                               database field names
							 | 
						||
| 
								 | 
							
								#      RETURNS: The number of changes made
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub updateURI {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $uridata, $keymap ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my ( $sql1, $sth1, $rv1, $h1 );
							 | 
						||
| 
								 | 
							
								    my ( %fieldvals, %where );
							 | 
						||
| 
								 | 
							
								    my ( $diffs, $updates ) = ( 0, 0 );
							 | 
						||
| 
								 | 
							
								    my @keys = (
							 | 
						||
| 
								 | 
							
								        'URI',               'DNS',
							 | 
						||
| 
								 | 
							
								        'HOSTUP',            'HTTP_STATUS',
							 | 
						||
| 
								 | 
							
								        'HTTP_CONTENT_TYPE', 'TYPE',
							 | 
						||
| 
								 | 
							
								        'FORMAT',            'TITLE',
							 | 
						||
| 
								 | 
							
								        'DESCRIPTION',       'AUTHOR',
							 | 
						||
| 
								 | 
							
								        'MODIFIED',          'LINK',
							 | 
						||
| 
								 | 
							
								        'IMAGE',             'COPYRIGHT',
							 | 
						||
| 
								 | 
							
								        'GENERATOR',         'LANGUAGE',
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Get the row from the urls table
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    $sql1 = q{SELECT * FROM urls WHERE url = ?};
							 | 
						||
| 
								 | 
							
								    $sth1 = $dbh->prepare($sql1);
							 | 
						||
| 
								 | 
							
								    $rv1  = $sth1->execute( $uridata->{URI} );
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    $h1 = $sth1->fetchrow_hashref;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    for my $key (@keys) {
							 | 
						||
| 
								 | 
							
								        unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
							 | 
						||
| 
								 | 
							
								            $diffs++;
							 | 
						||
| 
								 | 
							
								            $fieldvals{$key} = $uridata->{$key};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            # Temporary report
							 | 
						||
| 
								 | 
							
								            #
							 | 
						||
| 
								 | 
							
								            print "Difference: ($key)\n";
							 | 
						||
| 
								 | 
							
								            print "  Feed:     ", coalesce($uridata->{$key},''), "\n";
							 | 
						||
| 
								 | 
							
								            print "  Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n";
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if ($diffs > 0) {
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Prepare to use SQL::Abstract
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        my $sql = SQL::Abstract->new();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Build the update statement
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        $where{id} = $h1->{id};
							 | 
						||
| 
								 | 
							
								        my ( $stmt, @bindvals )
							 | 
						||
| 
								 | 
							
								            = $sql->update( 'urls', \%fieldvals, \%where );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        # Temporary
							 | 
						||
| 
								 | 
							
								        print "Statement: $stmt\n";
							 | 
						||
| 
								 | 
							
								        print "Bind values: ", join(",",@bindvals),"\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        # Perform the updates
							 | 
						||
| 
								 | 
							
								        #
							 | 
						||
| 
								 | 
							
								        $sth1 = $dbh->prepare($stmt);
							 | 
						||
| 
								 | 
							
								        $sth1->execute(@bindvals);
							 | 
						||
| 
								 | 
							
								        if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								            warn "Processing $h1->{url}\n", $dbh->errstr;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        else {
							 | 
						||
| 
								 | 
							
								            emit($silent, "Updated $h1->{url}\n");
							 | 
						||
| 
								 | 
							
								            $updates++;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return $updates;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: updateEnclosures
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Update the enclosures stored with a feed URL
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dbh            database handle
							 | 
						||
| 
								 | 
							
								#               $uridata        hashref containing data for the current URI
							 | 
						||
| 
								 | 
							
								#                               including an arrayref of hashrefs of episode
							 | 
						||
| 
								 | 
							
								#                               data
							 | 
						||
| 
								 | 
							
								#      RETURNS: The number of changes made
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: 
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub updateEnclosures {
							 | 
						||
| 
								 | 
							
								    my ( $dbh, $uridata ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my ( $sql1, $sth1, $rv1, $h1 );
							 | 
						||
| 
								 | 
							
								    my ( %fieldvals, %where );
							 | 
						||
| 
								 | 
							
								    my ( $diffs, $updates ) = ( 0, 0 );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    # Get the enclosures associated with this feed
							 | 
						||
| 
								 | 
							
								    #
							 | 
						||
| 
								 | 
							
								    $sql1 = q{
							 | 
						||
| 
								 | 
							
								        SELECT * FROM episodes
							 | 
						||
| 
								 | 
							
								        WHERE urls_id = (SELECT id FROM urls WHERE title = ?)
							 | 
						||
| 
								 | 
							
								    };
							 | 
						||
| 
								 | 
							
								    $sth1 = $dbh->prepare($sql1);
							 | 
						||
| 
								 | 
							
								    $rv1  = $sth1->execute( $uridata->{URI} );
							 | 
						||
| 
								 | 
							
								    if ( $dbh->err ) {
							 | 
						||
| 
								 | 
							
								        warn $dbh->errstr;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    $h1 = $sth1->fetchrow_hashref;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: equal
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Compare two strings even if undefined
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $s1             The first string
							 | 
						||
| 
								 | 
							
								#               $s2             The second string
							 | 
						||
| 
								 | 
							
								#      RETURNS: True if both strings are undefined, false if one isn't
							 | 
						||
| 
								 | 
							
								#               defined, otherwise the result of comparing them.
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Works on the principle that two undefined strings are equal,
							 | 
						||
| 
								 | 
							
								#               a defined and an undefined string are not, and otherwise they
							 | 
						||
| 
								 | 
							
								#               are equal if they are equal!
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO:
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub equal {
							 | 
						||
| 
								 | 
							
								    my ( $s1, $s2 ) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return 1 if ( !defined($s1) && !defined($s2) );
							 | 
						||
| 
								 | 
							
								    return 0 if ( !defined($s1) || !defined($s2) );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return ( $s1 eq $s2 );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: clean_string
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Clean a string of non-printables, newlines, multiple spaces
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $str            The string to process
							 | 
						||
| 
								 | 
							
								#      RETURNS: The processed string
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: Removes leading and trailing spaces. Removes all non-printable
							 | 
						||
| 
								 | 
							
								#               characters. Removes all CR/LF sequences. Removes multiple
							 | 
						||
| 
								 | 
							
								#               spaces.
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO:
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub clean_string {
							 | 
						||
| 
								 | 
							
								    my ($str) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    $str =~ s/(^\s+|\s+$)//g;
							 | 
						||
| 
								 | 
							
								    $str =~ tr/[[:graph:]]//c;
							 | 
						||
| 
								 | 
							
								    $str =~ tr/\x0A\x0D/ /;
							 | 
						||
| 
								 | 
							
								    $str =~ tr/ / /s;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return $str;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: normaliseDT
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Normalise an ISO8601 date for comparison, etc.
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $dt             a DateTime object
							 | 
						||
| 
								 | 
							
								#      RETURNS: The DateTime object formatted as an ISO8601 string
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub normaliseDT {
							 | 
						||
| 
								 | 
							
								    my ($dt) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $p = DateTime::Format::SQLite->new();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return (
							 | 
						||
| 
								 | 
							
								        defined($dt)
							 | 
						||
| 
								 | 
							
								        ? $p->format_datetime(
							 | 
						||
| 
								 | 
							
								            DateTime::Format::ISO8601->parse_datetime($dt)
							 | 
						||
| 
								 | 
							
								            )
							 | 
						||
| 
								 | 
							
								        : undef
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: flattenArray
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Turns an arrayref into a simple list in a string
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $item - the item that may be an arrayref
							 | 
						||
| 
								 | 
							
								#      RETURNS: The plain item if it's not an array otherwise the flattened
							 | 
						||
| 
								 | 
							
								#               version
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub flattenArray {
							 | 
						||
| 
								 | 
							
								    my ($item) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my $result;
							 | 
						||
| 
								 | 
							
								    if ( ref( $item ) eq 'ARRAY' ) {
							 | 
						||
| 
								 | 
							
								        $result = join(", ",@{$item});
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        $result = $item;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return $item;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: coalesce
							 | 
						||
| 
								 | 
							
								#      PURPOSE: To find the defined argument and return it
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: Arbitrary number of arguments
							 | 
						||
| 
								 | 
							
								#      RETURNS: The first defined argument
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub coalesce {
							 | 
						||
| 
								 | 
							
								    foreach (@_) {
							 | 
						||
| 
								 | 
							
								        return $_ if defined($_);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    return; # undef
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: emit
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Print text on STDERR unless silent mode has been selected
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: - Boolean indicating whether to be silent or not
							 | 
						||
| 
								 | 
							
								#               - list of arguments to 'print'
							 | 
						||
| 
								 | 
							
								#      RETURNS: Nothing
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION: This is a wrapper around 'print' to determine whether to send
							 | 
						||
| 
								 | 
							
								#               a message to STDERR depending on a boolean. We need this to be
							 | 
						||
| 
								 | 
							
								#               able to make the script silent when the -silent option is
							 | 
						||
| 
								 | 
							
								#               selected
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub emit {
							 | 
						||
| 
								 | 
							
								    unless (shift) {
							 | 
						||
| 
								 | 
							
								        print STDERR @_;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#===  FUNCTION  ================================================================
							 | 
						||
| 
								 | 
							
								#         NAME: Options
							 | 
						||
| 
								 | 
							
								#      PURPOSE: Processes command-line options
							 | 
						||
| 
								 | 
							
								#   PARAMETERS: $optref     Hash reference to hold the options
							 | 
						||
| 
								 | 
							
								#      RETURNS: Undef
							 | 
						||
| 
								 | 
							
								#  DESCRIPTION:
							 | 
						||
| 
								 | 
							
								#       THROWS: No exceptions
							 | 
						||
| 
								 | 
							
								#     COMMENTS: None
							 | 
						||
| 
								 | 
							
								#     SEE ALSO: N/A
							 | 
						||
| 
								 | 
							
								#===============================================================================
							 | 
						||
| 
								 | 
							
								sub Options {
							 | 
						||
| 
								 | 
							
								    my ($optref) = @_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    my @options = (
							 | 
						||
| 
								 | 
							
								        "help",   "manpage",  "debug=i", "silent!",
							 | 
						||
| 
								 | 
							
								        "load=s", "delete=s", "scan!",   "report!",
							 | 
						||
| 
								 | 
							
								        "check!", "json:s",   "opml:s",  "config=s",
							 | 
						||
| 
								 | 
							
								        "out=s",  "template:s",
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if ( !GetOptions( $optref, @options ) ) {
							 | 
						||
| 
								 | 
							
								        pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								__END__
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
							 | 
						||
| 
								 | 
							
								#  Application Documentation
							 | 
						||
| 
								 | 
							
								#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
							 | 
						||
| 
								 | 
							
								#{{{
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 NAME
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								feedWatcher - watch a collection of podcast feeds
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 VERSION
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This documentation refers to I<feedWatcher> version 0.0.14
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 USAGE
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								 feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
							 | 
						||
| 
								 | 
							
								    [-[no]check] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
							 | 
						||
| 
								 | 
							
								    [-[no]silent] [-config=FILE] [-debug=N] [URL ...]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 ARGUMENTS
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Arguments are optional and may consist of an arbitrarily long list of URLs to
							 | 
						||
| 
								 | 
							
								be processed and possibly added to the database by the script.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								These URLs are prepended to any which may be provided through the
							 | 
						||
| 
								 | 
							
								B<-load=FILE> option described below. The script makes sure the URL collection
							 | 
						||
| 
								 | 
							
								contains no duplicates.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 OPTIONS
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=over 8
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-help>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Prints a brief help message describing the usage of the program, and then exits.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								The full documentation can be displayed with the command:
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    perldoc -oman feedWatcher
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Alternatively, use the option B<-manpage>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-manpage>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Prints the entire documentation for the script. Note that this mode uses a
							 | 
						||
| 
								 | 
							
								simpler type of formatter than I<perldoc>.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-load=FILE>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Defines a file from which new URLs are to be added to the database. These URLs
							 | 
						||
| 
								 | 
							
								are checked in various ways before adding to the database. If arguments are
							 | 
						||
| 
								 | 
							
								provided when the script is invoked these URLs are appended to the argument list.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-delete=FILE>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Defines a file from which a list of URLs is to be read which are to be
							 | 
						||
| 
								 | 
							
								deleted from the database.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Note that it is possible (though inadvisable) to both add and delete an URL in
							 | 
						||
| 
								 | 
							
								the same run of the script. The URL will first be added (from the load file or
							 | 
						||
| 
								 | 
							
								argument list) then deleted. This is a pointless exercise which wastes
							 | 
						||
| 
								 | 
							
								bandwidth, so don't do it!
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-[no]scan>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option (B<-scan>) causes the URLs stored in the database to be scanned
							 | 
						||
| 
								 | 
							
								and updated. The negated form, which is also the default behaviour of the
							 | 
						||
| 
								 | 
							
								script, (B<-noscan>) omits the scan.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								NOTE: This function is not implemented yet.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-out=FILE>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option defines an output file to receive any output. If the option is
							 | 
						||
| 
								 | 
							
								omitted the data is written to STDOUT, allowing it to be redirected if
							 | 
						||
| 
								 | 
							
								required.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-[no]check>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option (B<-check>) causes each feed which is being to be checked against
							 | 
						||
| 
								 | 
							
								the script user to check that it's OK to add it. The script reports the
							 | 
						||
| 
								 | 
							
								I<copyright> field and requests a I<y> or I<n> response.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-[no]report>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option (B<-report>) causes a report of the contents of the database to be
							 | 
						||
| 
								 | 
							
								generated.  The negated form, which is also the default behaviour of the
							 | 
						||
| 
								 | 
							
								script, (B<-noreport>) omits the report.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								NOTE: The report is currently very simple.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-json[=FILE]>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option, which may be omitted, defines the location where the feed and
							 | 
						||
| 
								 | 
							
								episode details are to be written. If omitted no JSON data is written.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								If the option is given as B<-json=FILE> the data is written to the nominated
							 | 
						||
| 
								 | 
							
								file.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
							 | 
						||
| 
								 | 
							
								used.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-opml[=FILE]>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option, which may be omitted, defines the location where the feed details
							 | 
						||
| 
								 | 
							
								are to be written. If omitted no OPML data is written.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								If the option is given as B<-opml=FILE> the data is written to the nominated
							 | 
						||
| 
								 | 
							
								file.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
							 | 
						||
| 
								 | 
							
								used.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-template=FILE>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option defines the template used to generate a form of the feed data. The
							 | 
						||
| 
								 | 
							
								template is written using the B<Template> toolkit language.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								If the file name is omitted then the script uses the file B<feedWatcher.tpl>
							 | 
						||
| 
								 | 
							
								in the same directory as the script. If this file does not exist then the
							 | 
						||
| 
								 | 
							
								script will exit with an error message.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								For convenience B<feedWatcher.tpl> can be made a soft link which points to the
							 | 
						||
| 
								 | 
							
								file which is the current default. This allows the development of versions
							 | 
						||
| 
								 | 
							
								without changing the usual way this script is run.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-config=FILE>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option allows an alternative configuration file to be used. This file
							 | 
						||
| 
								 | 
							
								defines the location of the database, its name and the username and
							 | 
						||
| 
								 | 
							
								password to be used to access it. This feature permits a test database to be
							 | 
						||
| 
								 | 
							
								used, or for two more sets of feeds to be processed.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								See the CONFIGURATION AND ENVIRONMENT section below for the file format.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								If the option is omitted the default file is used: B<feedWatcher.cfg>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=item B<-debug=N>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This option selects the debug level, resulting in a lot of output.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    0   (the default) No debug output
							 | 
						||
| 
								 | 
							
								    1   Dumps the list of feeds found in an HTML download.
							 | 
						||
| 
								 | 
							
								        Also shows the SQL query which will result in the insertion of a new
							 | 
						||
| 
								 | 
							
								        row into the database table I<urls>.
							 | 
						||
| 
								 | 
							
								    2   Dumps the collected data which is destined to be written to the database
							 | 
						||
| 
								 | 
							
								    3   Dumps the contents of feeds during analysis
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=back
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 DESCRIPTION
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								A full description of the application and its features.
							 | 
						||
| 
								 | 
							
								May include numerous subsections (i.e. =head2, =head3, etc.)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 DIAGNOSTICS
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								A list of every error and warning message that the application can generate
							 | 
						||
| 
								 | 
							
								(even the ones that will "never happen"), with a full explanation of each
							 | 
						||
| 
								 | 
							
								problem, one or more likely causes, and any suggested remedies. If the
							 | 
						||
| 
								 | 
							
								application generates exit status codes (e.g. under Unix) then list the exit
							 | 
						||
| 
								 | 
							
								status associated with each error.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 CONFIGURATION AND ENVIRONMENT
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								A full explanation of any configuration system(s) used by the application,
							 | 
						||
| 
								 | 
							
								including the names and locations of any configuration files, and the
							 | 
						||
| 
								 | 
							
								meaning of any environment variables or properties that can be set. These
							 | 
						||
| 
								 | 
							
								descriptions must also include details of any configuration language used
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 DEPENDENCIES
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    Config::General
							 | 
						||
| 
								 | 
							
								    DBI
							 | 
						||
| 
								 | 
							
								    Data::Dumper
							 | 
						||
| 
								 | 
							
								    DateTime::Format::ISO8601
							 | 
						||
| 
								 | 
							
								    DateTime::Format::SQLite
							 | 
						||
| 
								 | 
							
								    Feed::Find
							 | 
						||
| 
								 | 
							
								    Getopt::Long
							 | 
						||
| 
								 | 
							
								    HTML::Entities
							 | 
						||
| 
								 | 
							
								    IO::Socket
							 | 
						||
| 
								 | 
							
								    JSON
							 | 
						||
| 
								 | 
							
								    LWP::UserAgent
							 | 
						||
| 
								 | 
							
								    List::MoreUtils
							 | 
						||
| 
								 | 
							
								    Log::Handler
							 | 
						||
| 
								 | 
							
								    Net::DNS
							 | 
						||
| 
								 | 
							
								    Pod::Usage
							 | 
						||
| 
								 | 
							
								    Template
							 | 
						||
| 
								 | 
							
								    Template::Filters
							 | 
						||
| 
								 | 
							
								    URI
							 | 
						||
| 
								 | 
							
								    WWW::RobotRules
							 | 
						||
| 
								 | 
							
								    XML::Feed
							 | 
						||
| 
								 | 
							
								    XML::RSS::Parser
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 INCOMPATIBILITIES
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								A list of any modules that this module cannot be used in conjunction with.
							 | 
						||
| 
								 | 
							
								This may be due to name conflicts in the interface, or competition for
							 | 
						||
| 
								 | 
							
								system or program resources, or due to internal limitations of Perl
							 | 
						||
| 
								 | 
							
								(for example, many modules that use source code filters are mutually
							 | 
						||
| 
								 | 
							
								incompatible).
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 BUGS AND LIMITATIONS
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								There are no known bugs in this module.
							 | 
						||
| 
								 | 
							
								Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
							 | 
						||
| 
								 | 
							
								Patches are welcome.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 AUTHOR
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Dave Morriss (Dave.Morriss@gmail.com) 2020
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=head1 LICENCE AND COPYRIGHT
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Copyright (c) Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This program is free software. You can redistribute it and/or modify it under
							 | 
						||
| 
								 | 
							
								the same terms as perl itself.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								=cut
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#}}}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# [zo to open fold, zc to close]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
							 |