forked from HPR/hpr-tools
		
	
		
			
				
	
	
		
			2196 lines
		
	
	
		
			66 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			2196 lines
		
	
	
		
			66 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env perl
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: feedWatcher
 | |
| #
 | |
| #        USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
 | |
| #               [-[no]report] [-[no]check] [-out=FILE] [-json[=FILE]]
 | |
| #               [-opml[=FILE]] [-template[=FILE]] [-[no]silent] [-debug=N]
 | |
| #               [URL ...]
 | |
| #
 | |
| #  DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
 | |
| #               podcasts. Ken's script was inspired by Dann Washko's site at
 | |
| #               http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
 | |
| #               August 2012 where it was used to generate handouts.
 | |
| #
 | |
| #               The script has not been developed since 2014, but is now in
 | |
| #               use in 2020 helping to prepare podcast information for
 | |
| #               a FOSDEM visit under the heading of "Free Culture Podcasts".
 | |
| #               See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
 | |
| #               (created with Pandoc) in the same directory as this script for
 | |
| #               details of what has been done to develop the original design.
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.0.14
 | |
| #      CREATED: 2013-12-25 12:40:33
 | |
| #     REVISION: 2020-02-18 22:45:28
 | |
| #
 | |
| #-------------------------------------------------------------------------------
 | |
| # Released under the terms of the GNU Affero General Public License (AGPLv3)
 | |
| # http://www.gnu.org/licenses/agpl-3.0.html
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| use 5.030;
 | |
| use strict;
 | |
| use warnings;
 | |
| use utf8;
 | |
| 
 | |
| use Getopt::Long;
 | |
| use Pod::Usage;
 | |
| use Config::General;
 | |
| use List::MoreUtils qw{uniq};
 | |
| use Log::Handler;
 | |
| 
 | |
| use Try::Tiny;
 | |
| 
 | |
| use URI;
 | |
| use Net::DNS;
 | |
| use IO::Socket;
 | |
| use LWP::UserAgent;
 | |
| use WWW::RobotRules;
 | |
| use XML::RSS::Parser;
 | |
| use XML::Feed;
 | |
| use Feed::Find;
 | |
| 
 | |
| use Template;
 | |
| use Template::Filters;
 | |
| Template::Filters->use_html_entities;    # Use HTML::Entities in the template
 | |
| 
 | |
| use HTML::Entities;
 | |
| 
 | |
| use IO::Prompter;
 | |
| 
 | |
| use JSON;
 | |
| 
 | |
| use DBI;
 | |
| use DateTime::Format::SQLite;
 | |
| use DateTime::Format::ISO8601;
 | |
| use SQL::Abstract;
 | |
| 
 | |
| use Data::Dumper;
 | |
| 
 | |
#
# Version number (manually incremented)
#
our $VERSION = '0.0.14';

#
# Script name (basename of $0, path stripped)
#
( my $PROG = $0 ) =~ s|.*/||mx;

#
# Declarations
#
# @new_urls   - URLs to add (from @ARGV and/or the -load file)
# @deletions  - URLs to remove (from the -delete file)
# $rules      - WWW::RobotRules object, set up after option processing
# $robot_name - agent name used when checking robots.txt
# $sth1/$h1/$rv - shared DBI statement handle / row hashref / result value
#
my ( @new_urls, @deletions );
my ( $rules,    $robot_name ) = ( undef, "$PROG/$VERSION" );
my ( $sth1, $h1, $rv );

# Arrayref of feed hashrefs filled by collectData() for -json/-opml/-template
my $feeds;

#
# File and directory paths
#
my $basedir     = "$ENV{HOME}/HPR/feed_watcher";
my $tempdir     = "$basedir/tmp";
my $configfile  = "$basedir/$PROG.cfg";
my $logfile     = "$basedir/${PROG}.log";
my $deftemplate = "$basedir/${PROG}.tpl";

#
# Hash key map to database field names (chevron comments are to stop Perltidy
# messing with the layout). Keys that are commented out are collected into
# %uridata during processing but deliberately not written to the 'urls' table.
#
                                                #<<<
my %keymap = (
    AUTHOR            => 'author',
    COPYRIGHT         => 'copyright',
    DESCRIPTION       => 'description',
    DNS               => 'dns',
#   ENCLOSURES        => undef,
#   ENCLOSURE_COUNT   => undef,
    FORMAT            => 'feedformat',
    GENERATOR         => 'generator',
#   HOST              => undef,
    HOSTUP            => 'host_up',
    HTTP_CONTENT_TYPE => 'content_type',
    HTTP_STATUS       => 'http_status',
    IMAGE             => 'image',
    LANGUAGE          => 'language',
    LINK              => 'link',
    MODIFIED          => 'modified',
#   PORT              => undef,
#   ROBOTS            => undef,
#   SAVE              => undef,
    TITLE             => 'title',
    TYPE              => 'urltype',
    URI               => 'url',
#   URI_ID            => undef,
);
                                                #>>>

#
# Headers for LWP::UserAgent (a browser-like identity; some feed servers
# refuse requests from obvious robots)
#
my %headers = (
    'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
        . 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
    'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
        . 'image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);

#
# Enable Unicode mode on the standard streams
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
 | |
| 
 | |
#
# Options and arguments (Options() is defined later in this script and
# fills %options via Getopt::Long)
#
my %options;
Options( \%options );

#
# Any arguments are taken to be URLs
#
@new_urls = @ARGV;

#
# Default help: brief usage, then exit
#
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 )
    if ( $options{'help'} );

#
# Detailed help: the full POD manual page
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1, -noperldoc => 0 )
    if ( $options{'manpage'} );

#
# Collect options, applying defaults where an option was not given
#
my $DEBUG = ( $options{'debug'} ? $options{'debug'} : 0 );

my $cfgfile
    = ( defined( $options{config} ) ? $options{config} : $configfile );
my $silent     = ( defined( $options{silent} )    ? $options{silent}    : 0 );
my $loadfile   = $options{'load'};
my $deletefile = $options{'delete'};

my $scan   = ( defined( $options{scan} )   ? $options{scan}   : 0 );
my $report = ( defined( $options{report} ) ? $options{report} : 0 );
my $check  = ( defined( $options{check} )  ? $options{check}  : 0 );

my $outfile  = $options{out};
my $json     = $options{json};
my $opml     = $options{opml};
my $template = $options{template};

#
# Check the configuration file exists
#
die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );

#
# Check the load file
#
if ($loadfile) {
    die "File $loadfile does not exist\n"  unless -e $loadfile;
    die "File $loadfile is not readable\n" unless -r $loadfile;
}

#
# Check the delete file
#
if ($deletefile) {
    die "File $deletefile does not exist\n"  unless -e $deletefile;
    die "File $deletefile is not readable\n" unless -r $deletefile;
}

#
# We accept -json or -json=filename. In the former case (empty string from
# Getopt::Long's optional-value syntax) we make a default name, otherwise we
# use the one provided.
#
if ( defined($json) ) {
    if ($json =~ /^$/) {
        $json = "$PROG.json"
    }
}

#
# We accept -opml or -opml=filename. In the former case we make a default
# name, otherwise we use the one provided.
#
if ( defined($opml) ) {
    if ($opml =~ /^$/) {
        $opml = "$PROG.opml"
    }
}

#
# Similarly we accept -template or -template=filename. In the former case we
# use the default template path, otherwise the one provided. Either way the
# template file must be readable.
#
if ( defined($template) ) {
    if ($template =~ /^$/) {
        $template = "$deftemplate";
    }

    die "Error: Unable to find template $template\n" unless -r $template;
}

#
# Load configuration data. (Direct method call here; the original used the
# fragile 'new Config::General(...)' indirect object syntax, which Perl can
# misparse if a sub called 'new' is ever in scope.)
#
my $conf = Config::General->new(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#
# Connect to the database using the settings from the <database> section of
# the configuration file
#
my $dbtype = $config{database}->{type};
my $dbfile = $config{database}->{file};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};
my $dbh
    = DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
    { AutoCommit => 1, sqlite_unicode => 1, } )
    or die $DBI::errstr;

#
# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
# foreign key constraints to function)
#
$dbh->do('PRAGMA foreign_keys = ON');

#
# Check we have something to do.
#
# NOTE(review): '&&' binds tighter than '||', so only the -scan case is
# gated on "$rows > 0"; -report/-json/-opml/-template count as work even
# when the 'urls' table is empty. Confirm whether the intent was
# "(report || json || opml || template || scan) && rows > 0".
#
my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
my $work = (
           scalar(@new_urls) > 0
        || defined($loadfile)
        || defined($deletefile)
        || ( $report
        || defined($json)
        || defined($opml)
        || defined($template)
        || $scan && $rows > 0 )
);

die "Nothing to do!\n" unless $work;
 | |
| 
 | |
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $LOG = Log::Handler->new();

# Log all levels (0 = emergency .. 7 = debug) to the per-script log file
$LOG->add(
    file => {
        timeformat => "%Y/%m/%d %H:%M:%S",
        filename   => $logfile,
        minlevel   => 0,
        maxlevel   => 7,
    }
);

#-------------------------------------------------------------------------------
# Open the output file (or STDOUT) - we may need the date to do it
#-------------------------------------------------------------------------------
my $outfh;
if ($outfile) {
    open( $outfh, ">:encoding(UTF-8)", $outfile )
        or warn "Unable to open $outfile for writing: $!";
}
else {
    # '>&' duplicates STDOUT so $outfh can be used uniformly below, with
    # its own UTF-8 encoding layer
    open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
        or warn "Unable to initialise for writing: $!";
}


#
# Set up a robots.txt rules parser identifying ourselves as "$PROG/$VERSION"
#
$rules = WWW::RobotRules->new($robot_name);

#
# Slurp the load file if there is one
#
if ($loadfile) {
    #
    # Load the input file, one URL per line.
    # NOTE(review): ':encoding(utf8)' is Perl's lax utf8 layer, which does
    # not reject all malformed sequences; ':encoding(UTF-8)' would be the
    # strict form — confirm whether lax decoding is intentional here.
    #
    open( my $in, '<:encoding(utf8)', $loadfile )
        or die "$PROG : failed to open load file '$loadfile' : $!\n";
    chomp( my @loaded = <$in> );
    close($in)
        or warn "$PROG : failed to close load file '$loadfile' : $!\n";

    $LOG->info("Loaded URLs from $loadfile");

    #
    # Add the loaded URLs to the array (after any command-line URLs)
    #
    push( @new_urls, @loaded );
}
 | |
| 
 | |
#
# Now, we either have URLs from the command line, or from the load file, so we
# process these.
#
if (@new_urls) {
    #
    # Remove duplicates (uniq preserves first-seen order)
    #
    @new_urls = uniq(@new_urls);

    $LOG->info( "Adding ", scalar(@new_urls), " URLs to the database" );

    #
    # Load these URLs as appropriate, returning any more that we find by
    # following HTML urls. (loadUrls currently returns nothing; see the sub.)
    #
    @new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );

    #
    # Now process any URLs that came back. Since we are explicitly looking for
    # feeds we can assume that's what we have so don't need to recurse again.
    #
    # TODO
}

#
# Process the delete file if there is one
#
if ($deletefile) {
    #
    # Load the delete file, one URL per line
    #
    open( my $del, '<:encoding(utf8)', $deletefile )
        or die "$PROG : failed to open load file '$deletefile' : $!\n";
    chomp( @deletions = <$del> );
    close($del)
        or warn "$PROG : failed to close load file '$deletefile' : $!\n";

    #
    # Remove duplicates
    #
    @deletions = uniq(@deletions);

    $LOG->info( "Deleting ", scalar(@deletions), " URLs from the database" );

    #
    # There are URLs to delete. Process them one by one.
    #
    if (@deletions) {
        $sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
        foreach my $rec (@deletions) {
            # execute() returns the row count, or '0E0' (numerically zero,
            # but true) when no rows matched, so the numeric comparison
            # below correctly distinguishes the two cases
            $rv = $sth1->execute($rec);
            if ( $dbh->err ) {
                warn $dbh->errstr;
            }
            if ( $rv != 0 ) {
                emit ( $silent, "Deleted $rec ($rv rows)\n" );
            }
            else {
                emit ( $silent, "Failed to delete $rec\n" );
            }
        }

    }

}

#-------------------------------------------------------------------------------
# Perform a database scan if requested
#-------------------------------------------------------------------------------
if ($scan) {
    $LOG->info( "Scan is not fully implemented yet" );
    scanDB($dbh, \%keymap);
}
 | |
| 
 | |
#-------------------------------------------------------------------------------
# Report database contents if requested (reportDB is defined later in this
# script; output goes to $outfh, i.e. -out=FILE or STDOUT)
#-------------------------------------------------------------------------------
if ($report) {
    reportDB($dbh, $outfh);
    $LOG->info( "Report generated" );
}

#
# Do any image repairs we need, but temporarily
#
if ($json || $opml || $template) {
    #
    # Collect all feed data from the database as an arrayref of hashrefs
    # (keys such as urls_url, urls_title, urls_image, ...)
    #
    $feeds = collectData($dbh);

    #
    # Clean up the image references (until we have a proper fix): some rows
    # contain a stringified arrayref prefix like "ARRAY(0x...)" which is
    # stripped off before the data is output
    #
    for (my $i = 0; $i < scalar(@$feeds); $i++) {
        my $f = $feeds->[$i];
        if (defined($f->{urls_image}) && $f->{urls_image} =~ /^ARRAY\([^)]+\)/) {
            $f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
            $LOG->info( "Fixed image for $f->{urls_url}" );
        }
    }

}

#-------------------------------------------------------------------------------
# Output all feeds to a JSON file if requested
#-------------------------------------------------------------------------------
if ($json) {
    #
    # Note: the encoder's 'utf8' mode is deliberately NOT enabled here. The
    # output handle already carries an ':encoding(UTF-8)' layer, and encoding
    # in both places would double-encode any non-ASCII characters; encode()
    # therefore returns a character string and the handle does the encoding.
    #
    my $js = JSON->new->canonical->pretty;
    open( my $out, '>:encoding(UTF-8)', $json )
        or die "Unable to open output file $json $!\n";

    print $out $js->encode($feeds), "\n";
    close($out);
    emit ( $silent, "JSON data written to $json\n" );
    $LOG->info( "JSON data written to $json" );
}

#-------------------------------------------------------------------------------
# Requesting the -opml option means to dump the entire feed table as OPML
#-------------------------------------------------------------------------------
if ($opml) {
    # Loaded on demand since OPML output is an infrequently-used option
    require XML::OPML;

    open( my $out, '>:encoding(UTF-8)', $opml )
        or die "Unable to open output file $opml $!\n";

    #
    # Start building OPML. Mandatory attributes are 'type', 'text' and
    # 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
    # attribute is poorly defined; this module uses 'rss' as the type and
    # 'RSS' as the value of the 'version' attribute. This is not linked to the
    # type of the feed.
    #
    my $OPML = XML::OPML->new(version => '1.1');

    #
    # Create the 'head' element, timestamped with the (normalised) current
    # date and time
    #
    my $DT = normaliseDT(DateTime->now());
    $OPML->head(
        title => 'Free Culture Podcasts',
        dateCreated => $DT,
        dateModified => $DT,
    );

    #
    # Walk the feeds generating one OPML outline element per feed
    #
    for (my $i = 0; $i < scalar(@$feeds); $i++) {
        my $f = $feeds->[$i];

        $OPML->add_outline(
            title       => $f->{urls_title},
            text        => $f->{urls_title},
            description => $f->{urls_description},
            xmlUrl      => $f->{urls_url},
            htmlUrl     => $f->{urls_link},
        );
    }

    #
    # Output OPML to the nominated file
    #
    print $out $OPML->as_string;

    close($out);
    emit ( $silent, "OPML data written to $opml\n" );
    $LOG->info( "OPML data written to $opml" );
}


#-------------------------------------------------------------------------------
# Fill and print the template if requested
#-------------------------------------------------------------------------------
if ($template) {
    my $tt = Template->new(
        {   ABSOLUTE     => 1,
            ENCODING     => 'utf8',
            INCLUDE_PATH => $basedir,
        }
    );

    #
    # Make the structure the template needs: the full list of feed hashrefs
    #
    my $vars = {
        feeds => $feeds,
    };

#   print Dumper($vars),"\n";

    #
    # Expand the template into a string, then write it to $outfh
    # (-out=FILE or STDOUT)
    #
    my $document;
    $tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
        || die $tt->error(), "\n";
    print $outfh $document;

    emit ( $silent, "Data processed with $template\n" );
    $LOG->info( "Data processed with $template" );
}

#$dbh->disconnect;

exit;
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: loadUrls
#      PURPOSE: To load URLs read from the input file into the database
#   PARAMETERS: $dbh            database handle
#               $new_urls       arrayref containing URLs
#               $rules          WWW::RobotRules object
#               $keymap         hashref containing a map of key names to
#                               database field names
#      RETURNS: Currently nothing. The plan is to return any new URLs
#               discovered by investigating non-feed URLs.
#  DESCRIPTION: Each URL is validated, checked for presence in the database,
#               resolved in DNS, probed for server availability, checked
#               against the site's robots.txt and examined for its HTTP
#               content type. Feed URLs are then downloaded and parsed.
#               Any failing step sets $uridata{SAVE} = 0 and skips to the
#               'continue' block, which writes the collected data (and any
#               enclosures) to the database only when SAVE is still true.
#       THROWS: No exceptions
#     COMMENTS: Fixed to iterate over the $new_urls arrayref argument; the
#               previous code iterated the file-level @new_urls array, which
#               only worked because the sole caller passed a reference to
#               that same array.
#     SEE ALSO: N/A
#===============================================================================
sub loadUrls {
    my ( $dbh, $new_urls, $rules, $keymap ) = @_;

    my ( $stream, $feed );
    my %uridata;

    #
    # Work through the list of URLs passed by reference
    #
    foreach my $rec ( @{$new_urls} ) {
        %uridata = ();

        #
        # By default save the collected data
        #
        $uridata{SAVE} = 1;

        #
        # Check that we have a valid URL. We don't save them if they are
        # invalid ($uridata{SAVE} is set to 0 in the routine).
        #
        my $uri = validateURI($rec,\%uridata);
        next unless $uri;

        #
        # Check whether we already have this URI in the database
        #
        if (dbSearch(
                $dbh, 'SELECT * FROM urls WHERE url = ?',
                $uridata{URI}
            )
            )
        {
            emit( $silent, "$uri is already in the database\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the hostname resolves in DNS (checkDNS fills $uridata{DNS}
        # with an arrayref of addresses, flattened to a string here)
        #
        if ( checkDNS( $uri, \%uridata ) ) {
            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
            emit( $silent, "DNS: $uridata{DNS}\n" );
        }
        else {
            emit( $silent, "$uri has no DNS entry\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the server is available. Don't save if it's not (probably will
        # not get the feed anyway).
        #
        if ( serverUp( $uri, \%uridata ) ) {
            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
        }
        else {
            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the site's robots.txt. If there's a block then don't save.
        # A missing robots.txt (404) is treated as permission to proceed.
        #
        if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
            emit( $silent, "Check of robots.txt rules succeeded\n" );
        }
        else {
            if ( $uridata{ROBOTS} =~ /404/ ) {
                emit( $silent, "Search for robots.txt file failed\n" );
            }
            else {
                emit( $silent, "Check of robots.txt rules failed\n" );
                $uridata{SAVE} = 0;
                next;
            }
        }

        #
        # Look for the HTTP content type. Don't save if the request failed.
        #
        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
            emit( $silent, "HTTP request OK\n" );
        }
        else {
            emit( $silent, "HTTP request failed\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Fetch and parse the feed (checkContentType set $uridata{TYPE})
        #
        if ( $uridata{TYPE} eq 'Feed' ) {
            $LOG->info('Processing feed: ',$uridata{URI});
            $stream = getFeed( $uridata{URI} );
            if ($stream) {
                $feed = parseFeed( $uridata{URI}, $stream );
                unless ( $feed ) {
                    $uridata{SAVE} = 0;
                    next;
                }

                #
                # Provide a means of examining the returned feed object. There
                # are some weird behaviours in there (partly because of the
                # weirdness of RSS and poor adherence to what standards there
                # are).
                #
                print Dumper($feed), "\n" if ( $DEBUG > 2 );
                storeFeed($feed,\%uridata);

                #
                # Perform a check on the copyright. Routine sets
                # $uridata{SAVE} = 0 if the copyright is not acceptable.
                #
                if ($check) {
                    next unless checkCopyright(\%uridata);
                }

            }
            else {
                emit($silent, "Problem collecting feed");
                $uridata{SAVE} = 0;
                next;
            }
        }
        elsif ( $uridata{TYPE} eq 'HTML' ) {
            #
            # TODO Save the URL and find any feeds it contains
            #
            emit( $silent,
                "Type $uridata{TYPE} not implemented yet; skipped\n" );
            next;
        }
        else {
            emit( $silent,
                "Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
            $uridata{SAVE} = 0;
            next;
        }

    }
    continue {
        # { This 'continue' block is executed for each iteration or 'next' }

        #
        # Decide whether to save what we have collected
        #
        if ( $uridata{SAVE} ) {
            if ( addURI( $dbh, \%uridata, $keymap ) ) {
                emit( $silent, "$uridata{URI} added to the database\n" );
                $LOG->info('Feed ',$uridata{URI},' added to the database');

                #
                # Get the id the database allocated for the row we added
                # (needed as the foreign key for the enclosure rows)
                #
                $uridata{URI_ID}
                    = $dbh->last_insert_id( undef, undef, 'urls', undef );

                #
                # Add any enclosures we found
                #
                if ( defined( $uridata{ENCLOSURE_COUNT} )
                    && $uridata{ENCLOSURE_COUNT} > 0 )
                {
                    if ( addEnclosures( $dbh, \%uridata ) ) {
                        emit( $silent, $uridata{ENCLOSURE_COUNT},
                            " enclosures for $uridata{URI} added to the database\n"
                        );
                    }
                }
            }
            else {
                emit( $silent,
                    "$uridata{URI} was not added to the database\n" );
                $LOG->info('Feed ',$uridata{URI},' not added to the database');
            }
        }

        #
        # Dump what we have if requested
        #
        print Dumper( \%uridata ), "\n" if ( $DEBUG > 1 );

        emit( $silent, '-' x 80, "\n" );
    }

    #
    # Temporary: no discovered URLs are returned yet
    #
    return;    # undef

}
 | |
| 
 | |
#===  FUNCTION  ================================================================
#         NAME: scanDB
#      PURPOSE: To scan the URLs in the database and update the stored data
#   PARAMETERS: $dbh            database handle
#               $keymap         hashref containing a map of key names to
#                               database field names
#      RETURNS: Nothing
#  DESCRIPTION: Collects every URL of type 'Feed' from the 'urls' table, then
#               re-runs the load-time checks on each (DNS, server up, HTTP
#               content type), re-fetches and re-parses the feed, and — in
#               the 'continue' block — compares the fresh data with what is
#               stored, applying any differences via updateURI(). A failure
#               in any step sets $uridata{SCAN_OK} = 0 and skips the update.
#       THROWS: No exceptions
#     COMMENTS: Still incomplete: enclosure updates are a TODO, and the loop
#               is temporarily limited to the first two feeds (see the 'last'
#               at the end of the 'continue' block).
#     SEE ALSO: N/A
#===============================================================================
sub scanDB {
    my ($dbh, $keymap) = @_;

    # These lexicals intentionally shadow the file-level $sth1/$h1 handles
    my ( $sql1, $sth1, $rv1, $h1 );
    my ( $aref, @urls, $DT, $stream, $feed );
    my ( %uridata, $urichanges, $enc_changes );

    #
    # Query to return all feed URLs
    #
    $sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' order by title};

    $sth1 = $dbh->prepare($sql1);
    $rv1  = $sth1->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Collect everything as an arrayref pointing to a bunch of arrayrefs
    # containing the column details requested
    #
    $aref = $sth1->fetchall_arrayref;

    #
    # Extract just the URL strings (column 1 of each row)
    #
    @urls = map { $_->[1] } @{$aref};

    #
    # Now process these URLs from the database one at a time
    #
    $sql1 = q{SELECT * FROM urls WHERE url = ?};
    $sth1 = $dbh->prepare($sql1);

    my $count = 0;

    foreach my $url (@urls) {
        %uridata = ();

        $count++;

        #
        # Record the scan for this URL
        #
        $DT = normaliseDT( DateTime->now() );
        $uridata{SCANNED_ON} = $DT;

        #
        # Default to OK; any failing check below clears this flag
        #
        $uridata{SCAN_OK} = 1;

        emit( $silent, "Scanning '$url'\n" );
        $rv1 = $sth1->execute($url);
        if ( $dbh->err ) {
            warn $dbh->errstr;
        }
        $h1 = $sth1->fetchrow_hashref;
        emit( $silent, $h1->{title}, "\n" );

        #
        # The URL should be valid already from the point at which it was added
        # to the database, but if we do this we get a canonical version (it
        # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
        # it. It also sets $uridata{URI}, which is useful.)
        #
        my $uri = validateURI( $url, \%uridata );

        #
        # Check the hostname resolves in DNS
        #
        if ( checkDNS( $uri, \%uridata ) ) {
            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
            emit( $silent, "DNS: $uridata{DNS}\n" );
        }
        else {
            emit( $silent, "$uri has no DNS entry\n" );
            $uridata{SCAN_OK} = 0;
            next;
        }

        #
        # Check the server is available
        #
        if ( serverUp( $uri, \%uridata ) ) {
            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
        }
        else {
            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
            $uridata{SCAN_OK} = 0;
            next;
        }

        #
        # Look for the HTTP content type.
        #
        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
            emit( $silent, "HTTP request OK\n" );
        }
        else {
            emit( $silent, "HTTP request failed\n" );
            $uridata{SCAN_OK} = 0;
            next;
        }

        #
        # Note: not doing the robots.txt check since it was done at load time
        #

        #
        # We know we have a feed, so go get it
        #
        $stream = getFeed( $uridata{URI} );
        if ($stream) {
            #
            # Parse the feed. The routine generates its own error messages
            #
            $feed = parseFeed( $uridata{URI}, $stream );
            unless ( $feed ) {
                $uridata{SCAN_OK} = 0;
                next;
            }

            #
            # Save the important feed components in the %uridata hash
            #
            print Dumper($feed), "\n" if ( $DEBUG > 2 );
            storeFeed( $feed, \%uridata );
        }
        else {
            emit( $silent, "Problem collecting feed" );
            $uridata{SCAN_OK} = 0;
            next;
        }

    }
    continue {
        # { This 'continue' block is executed for each iteration or 'next' }

        #
        # If the scan went OK then perform a comparison between the new feed
        # data and that which is stored, updating the row where they differ
        #
        if ( $uridata{SCAN_OK} ) {
                $urichanges = updateURI( $dbh, \%uridata, $keymap );
                $LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
                    $urichanges )
                    if $urichanges > 0;

                # TODO Update enclosures
        }
        else {
            emit( $silent, "$uridata{URI} was not scanned successfully\n" );
            $LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
        }

        #
        # Finished scanning this feed
        #
        emit( $silent, '-' x 80, "\n" );

        #
        # Temporarily stop after the first N feeds (development limit;
        # remove when scanning is fully implemented)
        #
        last if $count == 2;
    }

    return;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: reportDB
 | |
| #      PURPOSE: To generate a printed report from the database
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $fh             output file handle
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: Perhaps we can get fancy with stored reports in the future
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub reportDB {
    my ( $dbh, $fh ) = @_;

    #
    # Report the contents of the 'urls' table together with the date of
    # the latest episode per feed (from the 'all_episodes' view)
    #
    my $sql = q{
        SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
            FROM all_episodes ae
            JOIN urls ON (ae.urls_id = urls.id)
            GROUP by urls.id
            HAVING ae.urls_urltype = 'Feed'
    };

    my $sth = $dbh->prepare($sql);
    $sth->execute();
    warn $dbh->errstr if $dbh->err;

    #
    # Print each row as "key: value" lines, one blank line between rows
    #
    while ( my $row = $sth->fetchrow_hashref() ) {
        for my $key ( sort keys %{$row} ) {
            printf $fh "%20s: %s\n", $key, coalesce( $row->{$key}, 'undef' );
        }
        print $fh "\n";
    }

    $sth->finish;

    return;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: collectData
 | |
| #      PURPOSE: Collects data from the database for generating a report
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #      RETURNS: An array of hashrefs from the query
 | |
| #  DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
 | |
| #               table and the 'all_episodes' view, showing the date of the
 | |
| #               latest episode. The result is an array of rows, each
 | |
| #               represented as a hash, all sorted by the feed title.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub collectData {
    my ($dbh) = @_;

    #
    # Query returning every feed row in 'urls' with the details of the
    # latest episode, ordered by feed title
    #
    my $sql = q{
        SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
            FROM all_episodes ae
            JOIN urls ON (ae.urls_id = urls.id)
            GROUP by urls.id
            HAVING ae.urls_urltype = 'Feed'
            ORDER BY ae.urls_title
    };

    my $sth = $dbh->prepare($sql);
    $sth->execute();
    warn $dbh->errstr if $dbh->err;

    #
    # Collect the whole result set as an arrayref of hashrefs
    #
    my $rows = $sth->fetchall_arrayref( {} );

    $sth->finish;

    return $rows;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: countRows
 | |
| #      PURPOSE: To count the rows in a table
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $sql            SQL expression to use
 | |
| #      RETURNS: Number of rows found (note that zero is returned as 0E0)
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub countRows {
    my ( $dbh, $sql ) = @_;

    my $sth1 = $dbh->prepare($sql);
    my $rv   = $sth1->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    my $h1 = $sth1->fetch;
    $sth1->finish;

    #
    # Guard against a failed fetch (for example after an execute error).
    # The original dereferenced $h1 unconditionally, which dies when
    # 'fetch' returns undef.
    #
    return 0 unless defined $h1;

    # First column of the first row (the count for a SELECT count(*))
    return $h1->[0];
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: dbSearch
 | |
| #      PURPOSE: To perform a simple search in the database
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $sql            SQL expression to use (expected to be
 | |
| #                               a SELECT)
 | |
| #               @args           arguments for the 'execute'
 | |
| #      RETURNS: True (1) if the row exists, otherwise false (0).
 | |
| #  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
 | |
| #               SQL without incurring the overhead of repeated 'prepare'
 | |
| #               calls. Only the first row is fetched (we expect there to be
 | |
| #               only one) and the success or failure is determined by its
 | |
| #               existence.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub dbSearch {
    my ( $dbh, $sql, @args ) = @_;

    #
    # prepare_cached avoids re-preparing when the same SQL is used
    # repeatedly; only the first row matters (we expect at most one)
    #
    my $sth = $dbh->prepare_cached($sql);
    $sth->execute(@args);
    warn $dbh->errstr if $dbh->err;

    my $row = $sth->fetchrow_hashref();
    $sth->finish;

    # True when a matching row exists
    return defined($row);
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: execSQL
 | |
| #      PURPOSE: To perform a non-SELECT query
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $sql            SQL expression to use
 | |
| #               @args           arguments for the 'execute'
 | |
| #      RETURNS: True (1) if the query succeeded, otherwise false (0).
 | |
| #  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
 | |
| #               SQL without incurring the overhead of repeated 'prepare'
 | |
| #               calls.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub execSQL {
    my ( $dbh, $sql, @args ) = @_;

    #
    # prepare_cached avoids re-preparing when the same SQL is used
    # repeatedly (e.g. bulk inserts)
    #
    my $sth    = $dbh->prepare_cached($sql);
    my $result = $sth->execute(@args);
    warn $dbh->errstr if $dbh->err;

    $sth->finish;

    # DBI's execute returns a true value on success ('0E0' for zero rows)
    return $result;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: validateURI
 | |
| #      PURPOSE: Checks a URL for validity
 | |
| #   PARAMETERS: $rec            the URL handed to the script
 | |
| #               $uridata        hashref containing data for this URI
 | |
| #      RETURNS: A URI object if valid otherwise undef
 | |
| #  DESCRIPTION: The URL string is validated with the URI module. A canonical
 | |
| #               string version is stored in the hash referenced by $uridata
 | |
| #               hash if valid otherwise the URL is marked as invalid.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub validateURI {
    my ( $rec, $uridata ) = @_;

    #
    # Parse the string with the URI module, defaulting the scheme to
    # 'http'. A URL without a recognisable scheme is treated as invalid
    # and flagged so it is not saved.
    #
    my $uri = URI->new( $rec, 'http' );

    unless ( $uri->scheme ) {
        emit( $silent, "URI $uri is not valid\n" );
        $uridata->{SAVE} = 0;
        return;
    }

    emit( $silent, "URI $uri is valid\n" );

    # Store the canonical string form for later use
    $uridata->{URI} = $uri->canonical->as_string;
    return $uri;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: checkDNS
 | |
| #      PURPOSE: Looks up a host DNS entry
 | |
| #   PARAMETERS: $uri            URI object
 | |
| #               $uridata        hashref containing data for this URI
 | |
| #      RETURNS: True (1) if the DNS query was successful, otherwise false (0)
 | |
| #  DESCRIPTION: The host name is extracted from the URI (and stored). The
 | |
| #               hostname is searched for in the DNS and if successful, an
 | |
| #               array of addresses from the 'A' records is built. This is
 | |
| #               stored in the hash referenced by $uridata.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub checkDNS {
    my ( $uri, $uridata ) = @_;

    my $hostname = $uri->host;
    $uridata->{HOST} = $hostname;

    my $resolver = Net::DNS::Resolver->new;
    my $reply    = $resolver->search($hostname);

    unless ($reply) {
        warn "Query failed: ", $resolver->errorstring, "\n";
        return 0;
    }

    #
    # Keep only the addresses from the 'A' records in the answer section
    #
    my @addresses
        = map { $_->address } grep { $_->type eq 'A' } $reply->answer;

    $uridata->{DNS} = \@addresses;
    return 1;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: serverUp
 | |
| #      PURPOSE: Checks that a given host is responding
 | |
| #   PARAMETERS: $uri            URI object
 | |
| #               $uridata        hashref containing data for this URI
 | |
| #      RETURNS: True (1) if the host responds to a TCP connection, false (0)
 | |
| #               otherwise
 | |
| #  DESCRIPTION: Given an URL parses out the hostname and the port (defaulting
 | |
| #               to the appropriate default for the scheme, such as 80 for
 | |
| #               http). Attempts to connect to this host and port. If the
 | |
| #               connect fails then details are written to the data structure
 | |
| #               pointed to by $uridata.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub serverUp {
    my ( $uri, $uridata ) = @_;

    my $hostname = $uri->host();
    my $port     = $uri->port();
    $uridata->{PORT} = $port;

    #
    # Attempt a plain TCP connection to the host and port with a
    # 10-second timeout; success means the server is reachable
    #
    my $socket = IO::Socket::INET->new(
        Proto    => "tcp",
        PeerAddr => $hostname,
        PeerPort => $port,
        Reuse    => 1,
        Timeout  => 10
    );

    unless ($socket) {
        $uridata->{HOSTUP} = 0;
        return 0;
    }

    $socket->close;
    $uridata->{HOSTUP} = 1;
    return 1;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: robotRulesOK
 | |
| #      PURPOSE: To check the intended URL against the site's robots.txt rules
 | |
| #   PARAMETERS: $uri            URI object
 | |
| #               $rules          WWW::RobotRules object
 | |
| #               $uridata        hashref containing data for this URI
 | |
| #      RETURNS: True (1) if the GET of the robots.txt file succeeded and the
 | |
| #               rules allow the URI object to be fetched, false (0) otherwise.
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub robotRulesOK {
    my ( $uri, $rules, $uridata ) = @_;

    #
    # robots.txt lives at the root of the site
    #
    my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';

    #
    # Fetch it with LWP::UserAgent, identifying ourselves with the
    # script's name and version
    #
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent("$PROG/$VERSION");

    my $response = $ua->get($roboturl);

    unless ( $response->is_success ) {
        # Record the HTTP status so the failure is visible later
        $uridata->{ROBOTS} = $response->status_line;
        warn "Failed to get $roboturl\n";
        warn $response->status_line . "\n";
        return;    # undef
    }

    $uridata->{ROBOTS} = 'Found';

    # Feed the rules into the WWW::RobotRules object and ask whether
    # the original URI may be fetched
    $rules->parse( $roboturl, $response->decoded_content );
    return $rules->allowed("$uri");
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: checkContentType
 | |
| #      PURPOSE: Check the content_type of the url
 | |
| #   PARAMETERS: $uri            URI object
 | |
| #               $uridata        hashref containing data for this URI
 | |
| #               $headers        hashref containing query headers
 | |
| #               $log            Log::Handler object
 | |
| #      RETURNS: True (1) if all was well, otherwise false (0)
 | |
| #  DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
 | |
| #               for the URI object using the LWP::UserAgent head method. Then
 | |
| #               we examine the 'content-type' header looking for the string
 | |
| #               'xml' or 'html' in it. The former denotes a feed, and the
 | |
| #               latter a normal HTML page.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub checkContentType {
    my ( $uri, $uridata, $headers, $log ) = @_;

    $uridata->{HTTP_STATUS} = 'Unknown';

    my $browser = LWP::UserAgent->new or return 0;

    #
    # A HEAD request is enough to get the content-type header
    #
    my $response = $browser->head( $uri->as_string, %{$headers} )
        or return 0;

    $uridata->{HTTP_STATUS} = $response->status_line;

    unless ( $response->is_success ) {
        $uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
        return 0;
    }

    my $ctype = $response->header('content-type');
    $uridata->{HTTP_CONTENT_TYPE} = $ctype;

    #
    # Classify the content-type: 'xml' anywhere in it denotes a feed,
    # 'html' a normal web page, anything else is unknown
    #
    $uridata->{TYPE}
        = ( $ctype =~ m|xml|i )  ? 'Feed'
        : ( $ctype =~ m|html|i ) ? 'HTML'
        :                          'Unknown';

    #
    # Sometimes perfectly valid feeds misleadingly return text/html. If
    # the only feed discoverable in the page is the URL itself, treat
    # the URL as a feed after all.
    #
    if ( $uridata->{TYPE} eq 'HTML' ) {
        my @feeds = Feed::Find->find( $uri->as_string );
        if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
            emit( $silent, "Feed found with wrong content-type\n" );
            $uridata->{TYPE} = 'Feed';
        }
        else {
            emit( $silent, "Found ", scalar(@feeds),
                " feeds within this HTML page\n" );
            print Dumper( \@feeds ), "\n" if $DEBUG > 0;
        }
    }

    $log->info( "URL content classified as: ", $uridata->{TYPE} );
    emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
    return 1;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: getFeed
 | |
| #      PURPOSE: Download the contents of a feed URL
 | |
| #   PARAMETERS: $feed_url       URL of the feed to download
 | |
| #      RETURNS: String representation of the feed contents or undef if the
 | |
| #               download failed.
 | |
| #  DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
 | |
| #               need not be). If successful the contents are decoded and
 | |
| #               returned, otherwise undef is returned.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO:
 | |
| #===============================================================================
 | |
sub getFeed {
    my ($feed_url) = @_;

    #
    # Fetch the URL with LWP::UserAgent, identifying ourselves with the
    # script's name and version and allowing 10 seconds per request
    #
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent("$PROG/$VERSION");

    my $response = $ua->get($feed_url);

    unless ( $response->is_success ) {
        warn "Failed to get $feed_url\n";
        warn $response->status_line, "\n";
        return;    # undef
    }

    # Decoded (character) content of the feed
    return $response->decoded_content;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: parseFeed
 | |
| #      PURPOSE: Parse a podcast feed that has already been downloaded
 | |
| #   PARAMETERS: $feed_url       URL of the feed previously downloaded
 | |
| #               $feed_content   String containing the content of the feed, for
 | |
| #                               parsing
 | |
| #      RETURNS: An XML::Feed object containing the parsed feed or undef if the
 | |
| #               parse failed
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub parseFeed {
    my ( $feed_url, $feed_content ) = @_;

    #
    # XML::Feed works out the feed format (RSS/Atom) itself; a failed
    # parse returns undef and the reason is in the class errstr
    #
    my $feed = XML::Feed->parse( \$feed_content );

    unless ($feed) {
        # Something went wrong; report it and abandon this feed
        warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
        return;    # undef
    }

    return $feed;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: storeFeed
 | |
| #      PURPOSE: Stores feed attributes in a hash
 | |
| #   PARAMETERS: $feed           XML::Feed object returned from parsing the
 | |
| #                               feed
 | |
| #               $uridata        hashref containing data for this URI
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub storeFeed {
    my ( $feed, $uridata ) = @_;

    #
    # Trim leading/trailing whitespace from the title and format.
    # Default to '' when the feed omits either element, otherwise the
    # substitution operates on undef and raises warnings.
    #
    ( $uridata->{TITLE}  = $feed->title  // '' ) =~ s/(^\s+|\s+$)//g;
    ( $uridata->{FORMAT} = $feed->format // '' ) =~ s/(^\s+|\s+$)//g;

    $uridata->{LINK}        = $feed->link;
    $uridata->{DESCRIPTION} = clean_string( $feed->tagline );
    $uridata->{AUTHOR}      = $feed->author;
    $uridata->{MODIFIED}    = normaliseDT( $feed->modified );
    $uridata->{IMAGE}       = flattenArray( $feed->image );
    $uridata->{COPYRIGHT}   = $feed->copyright;
    $uridata->{GENERATOR}   = $feed->generator;
    $uridata->{LANGUAGE}    = $feed->language;
    #print coalesce($feed->webMaster,'No webMaster'),"\n";

    #
    # Collect the feed items (enclosures) and record how many there are
    #
    $uridata->{ENCLOSURES} = extractEnclosures($feed);
    $uridata->{ENCLOSURE_COUNT}
        = scalar( @{ $uridata->{ENCLOSURES} } );

    return;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: checkCopyright
 | |
| #      PURPOSE: Ask the user to check the copyright of a feed
 | |
| #   PARAMETERS:
 | |
| #      RETURNS:
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub checkCopyright {
    my ($uridata) = @_;

    # $decision holds the user's yes/no answer; it also becomes the
    # feed's SAVE flag below
    my $decision;
    $LOG->info('Checking copyright of feed');

    #
    # Prompt the user, failing gracefully if there's
    # a problem. If the user types 'Y' or 'y' we accept the
    # feed, otherwise we do not (thus a blank return = 'no').
    #
    # NOTE(review): the Try::Tiny block guards against prompt() dying
    # (e.g. no terminal); in that case the feed is rejected by default.
    #
    try {
        printf STDERR
            "Feed '%s' has the copyright string:\n%s\n",
            $uridata->{TITLE},
            coalesce( $uridata->{COPYRIGHT}, '' );
        $decision = prompt(
            -in     => *STDIN,
            -prompt => 'OK to add this feed?',
            -style  => 'bold red underlined',
            -yes
        );
    }
    catch {
        # Any failure during prompting means "do not add the feed"
        warn "Problem processing copyright decision: $_";
        $decision = 0;
    };

    #
    # Take action on the decision (or default)
    #
    $uridata->{SAVE} = $decision;
    if ($decision) {
        emit( $silent, "Feed added\n" );
        $LOG->info('Copyright OK');
        return 1;
    }
    else {
        emit( $silent, "Feed not added\n" );
        $LOG->info('Copyright not OK');
        return 0;
    }
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: parseRSS
 | |
| #      PURPOSE: Attempt to parse a feed as RSS
 | |
| #   PARAMETERS: $feed_url       URL of the feed previously downloaded
 | |
| #               $feed_content   String containing the content of the feed, for
 | |
| #                               parsing
 | |
| #      RETURNS: An XML::RSS object containing the parsed feed or undef if the
 | |
| #               parse failed
 | |
| #  DESCRIPTION: ** Incomplete **
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub parseRSS {
    my ( $feed_url, $feed_content ) = @_;

    #
    # Complete the previously unfinished implementation. XML::RSS's
    # 'parse' is an instance method taking the content string (not
    # a class method taking a reference), and it dies on malformed
    # input, so wrap the call in an eval.
    #
    my $rss = eval { XML::RSS->new->parse($feed_content) };

    unless ($rss) {
        # Report the failure and abandon this feed
        warn "Failed to parse $feed_url as RSS: $@\n";
        return;    # undef
    }

    return $rss;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: addURI
 | |
| #      PURPOSE: Adds the data for a URI to the database
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $uridata        hashref containing data for the current URI
 | |
| #               $keymap         hashref containing a map of key names to
 | |
| #                               database field names
 | |
| #      RETURNS: True (1) if the insert succeeded, false (0) otherwise
 | |
| #  DESCRIPTION: The hash keys are defined as an array to make it easy to slice
 | |
| #               the hash and the SQL is defined internally using the size of
 | |
| #               the key array as a guide to the number of '?' placeholders.
 | |
| #               These are passed to execSQL to do the work.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub addURI {
    my ( $dbh, $uridata, $keymap ) = @_;

    #
    # Hash keys in database column order; the same list drives the
    # column names (via the key map), the placeholder count and the
    # value slice passed to execSQL
    #
    my @keys = qw(
        URI DNS HOSTUP HTTP_STATUS
        HTTP_CONTENT_TYPE TYPE FORMAT TITLE
        DESCRIPTION AUTHOR MODIFIED LINK
        IMAGE COPYRIGHT GENERATOR LANGUAGE
    );

    my $columns      = join( ",", @{$keymap}{@keys} );
    my $placeholders = join( ',', ('?') x scalar(@keys) );
    my $sql          = "INSERT INTO urls ($columns) VALUES($placeholders)";

    print "addURI query: $sql\n" if $DEBUG > 0;

    return execSQL( $dbh, $sql, @{$uridata}{@keys} );
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: extractEnclosures
 | |
| #      PURPOSE: Builds an array of hashes containing enclosure data from an
 | |
| #               XML::Feed object
 | |
| #   PARAMETERS: $feed           XML::Feed object
 | |
| #      RETURNS: A reference to the array of anonymous hashes built from the
 | |
| #               feed.
 | |
| #  DESCRIPTION: The XML::Feed object is expected to contain an array of
 | |
| #               entries. These are converted to hashes, references to which
 | |
| #               are stored in an array. The two DateTime components are
 | |
| #               converted to ISO8601 dates. If there is an enclosure then its
 | |
| #               elements are saved.  Note that there could be multiple
 | |
| #               enclosures, but XML::Feed does not cater for them unless
 | |
| #               explicitly requested. We do not deal with such a case here.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub extractEnclosures {
    my ($feed) = @_;

    my @episodes;

    for my $entry ( $feed->entries ) {
        #
        # Core entry attributes; the two DateTime values are converted
        # to ISO8601 strings
        #
        my %item = (
            title    => $entry->title,
            base     => $entry->base,
            link     => $entry->link,
            category => join( ", ", $entry->category ),
            author   => $entry->author,
            id       => $entry->id,
            issued   => normaliseDT( $entry->issued ),
            modified => normaliseDT( $entry->modified ),
        );

        #
        # Work around a bug in XML::Feed where the source method is only
        # available for Atom feeds. TODO report this.
        #
        $item{source} = $entry->source
            if $entry->isa('XML::Feed::Entry::Format::Atom');

        #
        # Only the first enclosure is used; XML::Feed does not return
        # multiple enclosures unless explicitly requested
        #
        my ($enclosure) = $entry->enclosure;
        if ( defined($enclosure) ) {
            $item{url}    = $enclosure->url;
            $item{type}   = $enclosure->type;
            $item{length} = $enclosure->length;
        }

        push( @episodes, \%item );
    }

    return \@episodes;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: addEnclosures
 | |
| #      PURPOSE: Adds episodes extracted from a feed into the database
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $uridata        hashref containing data for the current URI
 | |
| #                               including an arrayref of hashrefs of episode
 | |
| #                               data
 | |
| #      RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
 | |
| #  DESCRIPTION: The SQL is defined internally and the hash keys are defined as
 | |
| #               an array to make it easy to slice the hash. The enclosures (or
 | |
| #               more correctly, feed items) are present in the hash as an
 | |
| #               array of anonymous hashes. These are processed one at a time
 | |
| #               and inserted into the database. A count of the number of
 | |
| #               successful inserts is kept. This is compared with the number
 | |
| #               of enclosures to determine the boolean value to return.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub addEnclosures {
    my ( $dbh, $uridata ) = @_;

    my $sql = q{INSERT INTO episodes
        (urls_id, link, enclosure, title, author, category, source, ep_id,
        issued, modified, byte_length, mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};

    #
    # Hash keys in the same order as the columns above (after urls_id)
    #
    my @keys = qw(
        link url title author category source
        id issued modified length type
    );

    my $inserted = 0;

    for my $episode ( @{ $uridata->{ENCLOSURES} } ) {
        if ( execSQL( $dbh, $sql, $uridata->{URI_ID}, @{$episode}{@keys} ) ) {
            $inserted++;
        }
        else {
            emit( $silent, "Failed to add episode $episode->{url}\n" );
        }
    }

    # True only when every episode was inserted successfully
    return ( $inserted == $uridata->{ENCLOSURE_COUNT} );
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: updateURI
 | |
| #      PURPOSE: Compare the data in a hash with that in the database
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $uridata        hashref containing data for the current URI
 | |
| #                               including an arrayref of hashrefs of episode
 | |
| #                               data
 | |
| #               $keymap         hashref containing a map of key names to
 | |
| #                               database field names
 | |
| #      RETURNS: The number of changes made
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub updateURI {
    my ( $dbh, $uridata, $keymap ) = @_;

    my ( $sql1, $sth1, $rv1, $h1 );
    my ( %fieldvals, %where );
    my ( $diffs, $updates ) = ( 0, 0 );

    #
    # The keys of %$uridata that are compared with (and may be written back
    # to) the 'urls' table. Database column names come from %$keymap.
    #
    my @keys = (
        'URI',               'DNS',
        'HOSTUP',            'HTTP_STATUS',
        'HTTP_CONTENT_TYPE', 'TYPE',
        'FORMAT',            'TITLE',
        'DESCRIPTION',       'AUTHOR',
        'MODIFIED',          'LINK',
        'IMAGE',             'COPYRIGHT',
        'GENERATOR',         'LANGUAGE',
    );

    #
    # Get the row from the urls table
    #
    $sql1 = q{SELECT * FROM urls WHERE url = ?};
    $sth1 = $dbh->prepare($sql1);
    $rv1  = $sth1->execute( $uridata->{URI} );
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    $h1 = $sth1->fetchrow_hashref;

    #
    # Robustness: if the URL is not in the database there is nothing to
    # compare against, so report it and bail out.
    #
    unless ( defined($h1) ) {
        warn "No 'urls' row found for $uridata->{URI}\n";
        return 0;
    }

    #
    # Compare each feed value with the corresponding database column,
    # collecting the differing values for the UPDATE below.
    #
    for my $key (@keys) {
        unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
            $diffs++;

            #
            # BUG FIX: the UPDATE must be keyed by the *database column
            # name* ($keymap->{$key}), not by the feed-hash key name,
            # otherwise SQL::Abstract generates an UPDATE against
            # non-existent columns.
            #
            $fieldvals{ $keymap->{$key} } = $uridata->{$key};

            #
            # Temporary report
            #
            print "Difference: ($key)\n";
            print "  Feed:     ", coalesce($uridata->{$key},''), "\n";
            print "  Database: ", coalesce($h1->{ $keymap->{$key} },''), "\n";
        }
    }

    if ($diffs > 0) {
        #
        # Prepare to use SQL::Abstract
        #
        my $sql = SQL::Abstract->new();

        #
        # Build the update statement, constrained to this row's id
        #
        $where{id} = $h1->{id};
        my ( $stmt, @bindvals )
            = $sql->update( 'urls', \%fieldvals, \%where );

        # Temporary
        print "Statement: $stmt\n";
        print "Bind values: ", join(",",@bindvals),"\n";

        #
        # Perform the updates
        #
        $sth1 = $dbh->prepare($stmt);
        $sth1->execute(@bindvals);
        if ( $dbh->err ) {
            warn "Processing $h1->{url}\n", $dbh->errstr;
        }
        else {
            emit($silent, "Updated $h1->{url}\n");
            $updates++;
        }

    }

    return $updates;

}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: updateEnclosures
 | |
| #      PURPOSE: Update the enclosures stored with a feed URL
 | |
| #   PARAMETERS: $dbh            database handle
 | |
| #               $uridata        hashref containing data for the current URI
 | |
| #                               including an arrayref of hashrefs of episode
 | |
| #                               data
 | |
| #      RETURNS: The number of changes made
 | |
| #  DESCRIPTION: 
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub updateEnclosures {
    my ( $dbh, $uridata ) = @_;

    my ( $sql1, $sth1, $rv1, $h1 );
    my ( %fieldvals, %where );
    my ( $diffs, $updates ) = ( 0, 0 );

    #
    # Get the enclosures associated with this feed.
    #
    # BUG FIX: the sub-select is executed with $uridata->{URI} (the feed
    # URL), so it must match on the 'url' column, not 'title' (compare
    # updateURI, which uses the same bind value against 'url').
    #
    $sql1 = q{
        SELECT * FROM episodes
        WHERE urls_id = (SELECT id FROM urls WHERE url = ?)
    };
    $sth1 = $dbh->prepare($sql1);
    $rv1  = $sth1->execute( $uridata->{URI} );
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    $h1 = $sth1->fetchrow_hashref;

    #
    # TODO(review): the comparison/update logic is not implemented yet;
    # nothing is changed in the database so far. Return the (zero) change
    # count so the documented contract ("the number of changes made") holds.
    #
    return $updates;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: equal
 | |
| #      PURPOSE: Compare two strings even if undefined
 | |
| #   PARAMETERS: $s1             The first string
 | |
| #               $s2             The second string
 | |
| #      RETURNS: True if both strings are undefined, false if one isn't
 | |
| #               defined, otherwise the result of comparing them.
 | |
| #  DESCRIPTION: Works on the principle that two undefined strings are equal,
 | |
| #               a defined and an undefined string are not, and otherwise they
 | |
| #               are equal if they are equal!
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO:
 | |
| #===============================================================================
 | |
sub equal {
    my ( $left, $right ) = @_;

    my $left_defined  = defined $left;
    my $right_defined = defined $right;

    # Two undefined strings count as equal; one defined and one not do not
    return 1 if !$left_defined && !$right_defined;
    return 0 unless $left_defined && $right_defined;

    # Both defined: an ordinary string comparison decides
    return $left eq $right;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: clean_string
 | |
| #      PURPOSE: Clean a string of non-printables, newlines, multiple spaces
 | |
| #   PARAMETERS: $str            The string to process
 | |
| #      RETURNS: The processed string
 | |
| #  DESCRIPTION: Removes leading and trailing spaces. Removes all non-printable
 | |
| #               characters. Removes all CR/LF sequences. Removes multiple
 | |
| #               spaces.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO:
 | |
| #===============================================================================
 | |
sub clean_string {
    my ($str) = @_;

    # Trim leading and trailing whitespace
    $str =~ s/(^\s+|\s+$)//g;

    # Turn embedded CR/LF characters into spaces (done before stripping
    # non-printables so line breaks become word separators, per the
    # documented intent)
    $str =~ tr/\x0A\x0D/ /;

    # BUG FIX: tr/// does not interpret POSIX character classes - the
    # original 'tr/[[:graph:]]//c' treated '[', ':', 'g', etc. as literal
    # characters and deleted nothing. Use a substitution to remove all
    # non-printable characters while keeping ordinary spaces.
    $str =~ s/[^[:graph:] ]//g;

    # Squash runs of spaces down to a single space
    $str =~ tr/ / /s;

    return $str;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: normaliseDT
 | |
| #      PURPOSE: Normalise an ISO8601 date for comparison, etc.
 | |
| #   PARAMETERS: $dt             a DateTime object
 | |
| #      RETURNS: The DateTime object formatted as an ISO8601 string
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub normaliseDT {
    my ($dt) = @_;

    # Undefined input passes straight through as undef
    return undef unless defined($dt);

    # Parse the incoming ISO8601 string, then re-format it in SQLite's
    # canonical datetime form so values can be compared textually
    my $formatter = DateTime::Format::SQLite->new();
    my $parsed    = DateTime::Format::ISO8601->parse_datetime($dt);

    return $formatter->format_datetime($parsed);
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: flattenArray
 | |
| #      PURPOSE: Turns an arrayref into a simple list in a string
 | |
| #   PARAMETERS: $item - the item that may be an arrayref
 | |
| #      RETURNS: The plain item if it's not an array otherwise the flattened
 | |
| #               version
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub flattenArray {
    my ($item) = @_;

    # BUG FIX: the original computed the flattened string into $result but
    # then returned $item, so arrayrefs were never actually flattened.
    if ( ref($item) eq 'ARRAY' ) {
        return join( ", ", @{$item} );
    }

    # Not an arrayref: hand the item back unchanged
    return $item;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: coalesce
 | |
| #      PURPOSE: To find the defined argument and return it
 | |
| #   PARAMETERS: Arbitrary number of arguments
 | |
| #      RETURNS: The first defined argument
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub coalesce {
    # Walk the argument list and hand back the first defined value
    for my $candidate (@_) {
        return $candidate if defined $candidate;
    }

    # Nothing was defined: bare return gives undef / empty list by context
    return;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: emit
 | |
| #      PURPOSE: Print text on STDERR unless silent mode has been selected
 | |
| #   PARAMETERS: - Boolean indicating whether to be silent or not
 | |
| #               - list of arguments to 'print'
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: This is a wrapper around 'print' to determine whether to send
 | |
| #               a message to STDERR depending on a boolean. We need this to be
 | |
| #               able to make the script silent when the -silent option is
 | |
| #               selected
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub emit {
    # First argument is the silent flag: true means suppress all output
    my $be_silent = shift;

    print STDERR @_ unless $be_silent;
}
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Options
 | |
| #      PURPOSE: Processes command-line options
 | |
| #   PARAMETERS: $optref     Hash reference to hold the options
 | |
| #      RETURNS: Undef
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
sub Options {
    my ($optref) = @_;

    # Getopt::Long option specification: '!' allows -no prefixes, '=s'/'=i'
    # require a value, ':s' makes the value optional
    my @optspec = (
        "help",   "manpage",  "debug=i", "silent!",
        "load=s", "delete=s", "scan!",   "report!",
        "check!", "json:s",   "opml:s",  "config=s",
        "out=s",  "template:s",
    );

    # On a parse failure show brief usage (with the version) and exit
    GetOptions( $optref, @optspec )
        or pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );

    return;
}
 | |
| 
 | |
| __END__
 | |
| 
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #  Application Documentation
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #{{{
 | |
| 
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| feedWatcher - watch a collection of podcast feeds
 | |
| 
 | |
| =head1 VERSION
 | |
| 
 | |
| This documentation refers to I<feedWatcher> version 0.0.14
 | |
| 
 | |
| 
 | |
| =head1 USAGE
 | |
| 
 | |
|  feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
 | |
|     [-[no]check] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
 | |
|     [-[no]silent] [-config=FILE] [-debug=N] [URL ...]
 | |
| 
 | |
| =head1 ARGUMENTS
 | |
| 
 | |
| Arguments are optional and may consist of an arbitrarily long list of URLs to
 | |
| be processed and possibly added to the database by the script.
 | |
| 
 | |
| These URLs are prepended to any which may be provided through the
 | |
| B<-load=FILE> option described below. The script makes sure the URL collection
 | |
| contains no duplicates.
 | |
| 
 | |
| =head1 OPTIONS
 | |
| 
 | |
| =over 8
 | |
| 
 | |
| =item B<-help>
 | |
| 
 | |
| Prints a brief help message describing the usage of the program, and then exits.
 | |
| 
 | |
| The full documentation can be displayed with the command:
 | |
| 
 | |
|     perldoc -oman feedWatcher
 | |
| 
 | |
| Alternatively, use the option B<-manpage>
 | |
| 
 | |
| =item B<-manpage>
 | |
| 
 | |
| Prints the entire documentation for the script. Note that this mode uses a
 | |
| simpler type of formatter than I<perldoc>.
 | |
| 
 | |
| =item B<-load=FILE>
 | |
| 
 | |
| Defines a file from which new URLs are to be added to the database. These URLs
 | |
| are checked in various ways before adding to the database. If arguments are
 | |
| provided when the script is invoked these URLs are appended to the argument list.
 | |
| 
 | |
| =item B<-delete=FILE>
 | |
| 
 | |
| Defines a file from which a list of URLs is to be read which are to be
 | |
| deleted from the database.
 | |
| 
 | |
| Note that it is possible (though inadvisable) to both add and delete an URL in
 | |
| the same run of the script. The URL will first be added (from the load file or
 | |
| argument list) then deleted. This is a pointless exercise which wastes
 | |
| bandwidth, so don't do it!
 | |
| 
 | |
| =item B<-[no]scan>
 | |
| 
 | |
| This option (B<-scan>) causes the URLs stored in the database to be scanned
 | |
| and updated. The negated form, which is also the default behaviour of the
 | |
| script, (B<-noscan>) omits the scan.
 | |
| 
 | |
| NOTE: This function is not implemented yet.
 | |
| 
 | |
| =item B<-out=FILE>
 | |
| 
 | |
| This option defines an output file to receive any output. If the option is
 | |
| omitted the data is written to STDOUT, allowing it to be redirected if
 | |
| required.
 | |
| 
 | |
| =item B<-[no]check>
 | |
| 
 | |
This option (B<-check>) causes each feed which is being added to be checked
with the script user, to confirm that it's OK to add it. The script reports the
I<copyright> field and requests a I<y> or I<n> response.
 | |
| 
 | |
| =item B<-[no]report>
 | |
| 
 | |
| This option (B<-report>) causes a report of the contents of the database to be
 | |
| generated.  The negated form, which is also the default behaviour of the
 | |
| script, (B<-noreport>) omits the report.
 | |
| 
 | |
| NOTE: The report is currently very simple.
 | |
| 
 | |
| =item B<-json[=FILE]>
 | |
| 
 | |
| This option, which may be omitted, defines the location where the feed and
 | |
| episode details are to be written. If omitted no JSON data is written.
 | |
| 
 | |
| If the option is given as B<-json=FILE> the data is written to the nominated
 | |
| file.
 | |
| 
 | |
| If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is
 | |
| used.
 | |
| 
 | |
| =item B<-opml[=FILE]>
 | |
| 
 | |
| This option, which may be omitted, defines the location where the feed details
 | |
| are to be written. If omitted no OPML data is written.
 | |
| 
 | |
| If the option is given as B<-opml=FILE> the data is written to the nominated
 | |
| file.
 | |
| 
 | |
| If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is
 | |
| used.
 | |
| 
 | |
| =item B<-template=FILE>
 | |
| 
 | |
| This option defines the template used to generate a form of the feed data. The
 | |
| template is written using the B<Template> toolkit language.
 | |
| 
 | |
| If the file name is omitted then the script uses the file B<feedWatcher.tpl>
 | |
| in the same directory as the script. If this file does not exist then the
 | |
| script will exit with an error message.
 | |
| 
 | |
| For convenience B<feedWatcher.tpl> can be made a soft link which points to the
 | |
| file which is the current default. This allows the development of versions
 | |
| without changing the usual way this script is run.
 | |
| 
 | |
| =item B<-config=FILE>
 | |
| 
 | |
| This option allows an alternative configuration file to be used. This file
 | |
| defines the location of the database, its name and the username and
 | |
password to be used to access it. This feature permits a test database to be
used, or two or more sets of feeds to be processed.
 | |
| 
 | |
| See the CONFIGURATION AND ENVIRONMENT section below for the file format.
 | |
| 
 | |
| If the option is omitted the default file is used: B<feedWatcher.cfg>
 | |
| 
 | |
| =item B<-debug=N>
 | |
| 
 | |
| This option selects the debug level, resulting in a lot of output.
 | |
| 
 | |
|     0   (the default) No debug output
 | |
|     1   Dumps the list of feeds found in an HTML download.
 | |
|         Also shows the SQL query which will result in the insertion of a new
 | |
|         row into the database table I<urls>.
 | |
|     2   Dumps the collected data which is destined to be written to the database
 | |
|     3   Dumps the contents of feeds during analysis
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| A full description of the application and its features.
 | |
| May include numerous subsections (i.e. =head2, =head3, etc.)
 | |
| 
 | |
| 
 | |
| =head1 DIAGNOSTICS
 | |
| 
 | |
| A list of every error and warning message that the application can generate
 | |
| (even the ones that will "never happen"), with a full explanation of each
 | |
| problem, one or more likely causes, and any suggested remedies. If the
 | |
| application generates exit status codes (e.g. under Unix) then list the exit
 | |
| status associated with each error.
 | |
| 
 | |
| 
 | |
| =head1 CONFIGURATION AND ENVIRONMENT
 | |
| 
 | |
| A full explanation of any configuration system(s) used by the application,
 | |
| including the names and locations of any configuration files, and the
 | |
| meaning of any environment variables or properties that can be set. These
 | |
| descriptions must also include details of any configuration language used
 | |
| 
 | |
| 
 | |
| =head1 DEPENDENCIES
 | |
| 
 | |
|     Config::General
 | |
|     DBI
 | |
|     Data::Dumper
 | |
|     DateTime::Format::ISO8601
 | |
|     DateTime::Format::SQLite
 | |
|     Feed::Find
 | |
|     Getopt::Long
 | |
|     HTML::Entities
 | |
|     IO::Socket
 | |
|     JSON
 | |
|     LWP::UserAgent
 | |
|     List::MoreUtils
 | |
|     Log::Handler
 | |
|     Net::DNS
 | |
|     Pod::Usage
 | |
|     Template
 | |
|     Template::Filters
 | |
|     URI
 | |
|     WWW::RobotRules
 | |
|     XML::Feed
 | |
|     XML::RSS::Parser
 | |
| 
 | |
| =head1 INCOMPATIBILITIES
 | |
| 
 | |
| A list of any modules that this module cannot be used in conjunction with.
 | |
| This may be due to name conflicts in the interface, or competition for
 | |
| system or program resources, or due to internal limitations of Perl
 | |
| (for example, many modules that use source code filters are mutually
 | |
| incompatible).
 | |
| 
 | |
| 
 | |
| =head1 BUGS AND LIMITATIONS
 | |
| 
 | |
| There are no known bugs in this module.
 | |
| Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
 | |
| Patches are welcome.
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| Dave Morriss (Dave.Morriss@gmail.com) 2020
 | |
| 
 | |
| =head1 LICENCE AND COPYRIGHT
 | |
| 
 | |
| Copyright (c) Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
 | |
| 
 | |
| This program is free software. You can redistribute it and/or modify it under
 | |
| the same terms as perl itself.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| #}}}
 | |
| 
 | |
| # [zo to open fold, zc to close]
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 |