#!/usr/bin/env perl
#===============================================================================
#
#         FILE: feedWatcher
#
#        USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
#               [-report[=title]] [-check[=mode]] [-out=FILE]
#               [-rejects[=FILE]] [-json[=FILE]] [-opml[=FILE]]
#               [-template[=FILE]] [-[no]silent] [-debug=N]
#               [URL ...]
#
#  DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
#               podcasts. Ken's script was inspired by Dann Washko's site at
#               http://www.thelinuxlink.net/ and prepared for OggCamp 12 in
#               August 2012 where it was used to generate handouts.
#
#               The script has not been developed since 2014, but is now in
#               use in 2020 helping to prepare podcast information for
#               a FOSDEM visit under the heading of "Free Culture Podcasts".
#               See the files 'Journal.adoc' (AsciiDoctor) and 'Journal.html'
#               (created with Pandoc) in the same directory as this script for
#               details of what has been done to develop the original design.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.1.1
#      CREATED: 2013-12-25 12:40:33
#     REVISION: 2023-01-09 15:28:13
#
#-------------------------------------------------------------------------------
# Released under the terms of the GNU Affero General Public License (AGPLv3)
# http://www.gnu.org/licenses/agpl-3.0.html
#-------------------------------------------------------------------------------
#
#===============================================================================

use 5.030;
use strict;
use warnings;
use utf8;
use feature qw{ postderef say signatures state };
no warnings qw{ experimental::postderef experimental::signatures };

use FindBin::libs;

use XML::RSS;
use Getopt::Long;
use Pod::Usage;
use Config::General;
use List::MoreUtils qw{uniq};
use Log::Handler;
use Try::Tiny;

use URI;
use Net::DNS;
use IO::Socket;
use LWP::UserAgent;
use WWW::RobotRules;
#use XML::RSS::Parser;
use XML::Feed;
use XML::FeedPP;    # Fall back?
use Feed::Find;

use Template;
use Template::Filters;
Template::Filters->use_html_entities;    # Use HTML::Entities in the template
use HTML::Entities;

use IO::Prompter;
use JSON;
use DBI;
use DateTime::Format::SQLite;
use DateTime::Format::ISO8601;
use SQL::Abstract;

use Data::Dumper;

#
# Version number (manually incremented)
#
our $VERSION = '0.1.1';

#
# Script name (path stripped from $0)
#
( my $PROG = $0 ) =~ s|.*/||mx;

#
# Declarations
#
my ( @new_urls, @deletions );
my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );
my ($search_target);
my ( $sth1, $h1, $rv );
my ($rejectcount);
my $feeds;

#
# File and directory paths
#
my $basedir     = "$ENV{HOME}/HPR/feed_watcher";
my $tempdir     = "$basedir/tmp";
my $configfile  = "$basedir/$PROG.cfg";
my $logfile     = "$basedir/${PROG}.log";
my $deftemplate = "$basedir/${PROG}.tpl";

#
# Hash key map to database field names (chevron comments are to stop Perltidy
# messing with the layout). Keys with no database column are commented out.
#
#<<<
my %keymap = (
    AUTHOR            => 'author',
    COPYRIGHT         => 'copyright',
    DESCRIPTION       => 'description',
    DNS               => 'dns',
    # ENCLOSURES      => undef,
    # ENCLOSURE_COUNT => undef,
    FORMAT            => 'feedformat',
    GENERATOR         => 'generator',
    # HOST            => undef,
    HOSTUP            => 'host_up',
    HTTP_CONTENT_TYPE => 'content_type',
    HTTP_STATUS       => 'http_status',
    IMAGE             => 'image',
    LANGUAGE          => 'language',
    LINK              => 'link',
    MODIFIED          => 'modified',
    # PORT            => undef,
    # ROBOTS          => undef,
    # SAVE            => undef,
    TITLE             => 'title',
    TYPE              => 'urltype',
    URI               => 'url',
    # URI_ID          => undef,
);
#>>>

#
# Headers for LWP::UserAgent
#
my %headers = (
    'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux i686; '
        . 'rv:15.0) Gecko/20100101 Firefox/15.0.1',
    'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, '
        . 'image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);

#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";

#
# Options and arguments
#
my %options;
Options( \%options );

#
# Any arguments are taken to be URLs
#
@new_urls = @ARGV;

#
# Default help
#
pod2usage( -msg => "Version $VERSION\n", -exitval => 1, -verbose => 0 )
    if ( $options{'help'} );

#
# Detailed help
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
    if ( $options{'manpage'} );

#
# Collect options
#
my $DEBUG   = ( $options{'debug'} ? $options{'debug'} : 0 );
my $cfgfile = ( defined( $options{config} ) ? $options{config} : $configfile );
my $silent  = ( defined( $options{silent} ) ? $options{silent} : 0 );

my $loadfile   = $options{'load'};
my $deletefile = $options{'delete'};
my $scan       = ( defined( $options{scan} ) ? $options{scan} : 0 );
my $check      = $options{check};
my $outfile    = $options{out};
my $rejectfile = $options{rejects};
my $report     = $options{report};
my $json       = $options{json};
my $opml       = $options{opml};
my $template   = $options{template};

#
# Check the configuration file
#
die "Unable to find configuration file $cfgfile\n" unless ( -e $cfgfile );

#
# Check the load file
#
if ($loadfile) {
    die "File $loadfile does not exist\n"  unless -e $loadfile;
    die "File $loadfile is not readable\n" unless -r $loadfile;
}

#
# Check the delete file
#
if ($deletefile) {
    die "File $deletefile does not exist\n"  unless -e $deletefile;
    die "File $deletefile is not readable\n" unless -r $deletefile;
}

#
# The checking mode defaults to 'auto' if the option has no value, or may be
# 'manual' or 'none'. If the option is not used at all it defaults to 'none'.
#
if ( defined($check) ) {
    $check =~ s/(^\s+|\s+$)//g;
    if ($check =~ /^$/) {
        $check = "auto";
    }
    else {
        $check = lc($check);
        # NOTE(review): the message below reads "Values are , auto and
        # manual" - it appears to be missing the word 'none'; confirm and
        # repair the string
        die "Invalid option '-check=$check'\n"
            . "Values are , auto and manual\n"
            unless ($check =~ /^(auto|manual|none)$/)
    }
}
else {
    $check = 'none';
}

#
# We accept -report, meaning report everything or -report='title' to report
# just the feed with the given title.
#
if ( defined($report) ) {
    if ($report =~ /^$/) {
        $search_target = undef;
    }
    else {
        $search_target = $report;
    }
}

#
# We accept -json or -json=filename. In the former case we make a default
# name, otherwise we use the one provided.
#
if ( defined($json) ) {
    if ($json =~ /^$/) {
        $json = "$PROG.json";
    }
}

#
# We accept -opml or -opml=filename. In the former case we make a default
# name, otherwise we use the one provided.
#
if ( defined($opml) ) {
    if ($opml =~ /^$/) {
        $opml = "$PROG.opml"
    }
}

#
# Similarly we accept -template or -template=filename. In the former case we
# make a default name, otherwise we use the one provided.
#
if ( defined($template) ) {
    if ($template =~ /^$/) {
        $template = "$deftemplate";
    }
    die "Error: Unable to find template $template\n" unless -r $template;
}

#
# Load configuration data
# NOTE(review): 'new Config::General' is indirect object syntax; the
# conventional form is Config::General->new(...)
#
my $conf = new Config::General(
    -ConfigFile      => $cfgfile,
    -InterPolateVars => 1,
    -ExtendedAccess  => 1
);
my %config = $conf->getall();

#
# Connect to the database (SQLite expected, per sqlite_unicode below)
#
my $dbtype = $config{database}->{type};
my $dbfile = $config{database}->{file};
my $dbuser = $config{database}->{user};
my $dbpwd  = $config{database}->{password};

my $dbh = DBI->connect( "dbi:$dbtype:dbname=$dbfile", $dbuser, $dbpwd,
    { AutoCommit => 1, sqlite_unicode => 1, } )
    or die $DBI::errstr;

#
# Enable SQLite's foreign keys (necessary to enable any ON DELETE CASCADE
# foreign key constraints to function)
#
$dbh->do('PRAGMA foreign_keys = ON');

#
# Check we have something to do
# NOTE(review): because '&&' binds tighter than '||', '$rows > 0' only guards
# '$scan', and the inner parenthesised group is redundant with the outer '||'
# chain - confirm this precedence is intentional
#
my $rows = countRows( $dbh, 'SELECT count(*) FROM urls' );
my $work = (
    scalar(@new_urls) > 0
        || defined($loadfile)
        || defined($deletefile)
        || (   defined($report)
            || defined($json)
            || defined($opml)
            || defined($template)
            || $scan && $rows > 0 )
);
die "Nothing to do!\n" unless $work;
#-------------------------------------------------------------------------------
# Set up logging keeping the default log layout except for the date
#-------------------------------------------------------------------------------
my $LOG = Log::Handler->new();

$LOG->add(
    file => {
        timeformat => "%Y/%m/%d %H:%M:%S",
        filename   => $logfile,
        minlevel   => 0,
        maxlevel   => 7,
    }
);

#-------------------------------------------------------------------------------
# Open the output file (or STDOUT) - we may need the date to do it
#-------------------------------------------------------------------------------
my $outfh;
if ($outfile) {
    open( $outfh, ">:encoding(UTF-8)", $outfile )
        or warn "Unable to open $outfile for writing: $!";
}
else {
    # Duplicate STDOUT so reports go to the terminal with UTF-8 encoding
    open( $outfh, ">&:encoding(UTF-8)", \*STDOUT )
        or warn "Unable to initialise for writing: $!";
}

#-------------------------------------------------------------------------------
# Open the rejects file if requested, otherwise we don't write reject data
#-------------------------------------------------------------------------------
my $rejectfh;
if ($rejectfile) {
    # A bare '-rejects' option gets a default filename
    if ($rejectfile =~ /^$/) {
        $rejectfile = "${PROG}_rejected_URLs.txt";
    }
    open( $rejectfh, ">:encoding(UTF-8)", $rejectfile )
        or warn "Unable to open $rejectfile for writing: $!";
    $rejectcount = 0;
}

#
# Set up a robot.txt rules parser
#
$rules = WWW::RobotRules->new($robot_name);

#-------------------------------------------------------------------------------
# Slurp the load file into @new_urls if the file is provided
#-------------------------------------------------------------------------------
if ($loadfile) {
    #
    # Load the input file
    #
    open( my $in, '<:encoding(utf8)', $loadfile )
        or die "$PROG : failed to open load file '$loadfile' : $!\n";
    chomp( my @loaded = <$in> );
    close($in)
        or warn "$PROG : failed to close load file '$loadfile' : $!\n";

    $LOG->info("Loaded URLs from $loadfile");

    #
    # Add the loaded URLs to the array
    #
    push( @new_urls, @loaded );
}

#
# Now, we either have URLs from the command line, or from the load file (or
# both), so we process these.
#
# It's a loop because 'loadUrls' might find some more URLs by scanning HTML
# URLs if given them. If it does we replace @new_urls with the found URLs and
# go again. When there's nothing returned the loop stops.
# ----
# NOTE: This seems dirty, but all the 'while' is testing is whether the array
# contains anything or not. It's not iterating over it or anything, which would
# be messy!
#
while (@new_urls) {
    #
    # Remove duplicates, finish if it deletes them all!
    #
    @new_urls = uniq(@new_urls);
    last unless @new_urls;

    #
    # Remove any commented out lines, finish if it deletes them all!
    #
    @new_urls = grep {!/^\s*#/} @new_urls;
    last unless @new_urls;

    $LOG->info( "Received ", scalar(@new_urls),
        " URLs to add to the database" );

    #
    # Load these URLs as appropriate, returning any more that we find by
    # following HTML urls. We overwrite the original list and start all over
    # again.
    #
    @new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );
}

#
# Process the delete file if there is one
#
if ($deletefile) {
    #
    # Load the delete file
    #
    open( my $del, '<:encoding(utf8)', $deletefile )
        or die "$PROG : failed to open load file '$deletefile' : $!\n";
    chomp( @deletions = <$del> );
    close($del)
        or warn "$PROG : failed to close load file '$deletefile' : $!\n";

    #
    # Remove duplicates
    #
    @deletions = uniq(@deletions);

    $LOG->info( "Deleting ", scalar(@deletions),
        " URLs from the database" );

    #
    # There are URLs to delete. Process them on by one.
    # (Rows in dependent tables go too via ON DELETE CASCADE - see the
    # PRAGMA foreign_keys setting above.)
    #
    if (@deletions) {
        $sth1 = $dbh->prepare(q{DELETE from urls WHERE url = ?});
        foreach my $rec (@deletions) {
            $rv = $sth1->execute($rec);
            if ( $dbh->err ) {
                warn $dbh->errstr;
            }
            # NOTE: execute returns '0E0' (true, numerically 0) when no
            # rows matched, hence the numeric test
            if ( $rv != 0 ) {
                emit ( $silent, "Deleted $rec ($rv rows)\n" );
            }
            else {
                emit ( $silent, "Failed to delete $rec\n" );
            }
        }
    }
}

#-------------------------------------------------------------------------------
# Perform a database scan if requested
# TODO: Needs to be developed; does nothing at the moment.
#-------------------------------------------------------------------------------
if ($scan) {
    $LOG->info( "Scan is not fully implemented yet" );
    # Testing. Processes the first two feeds
    scanDB($dbh, \%keymap);
}

#-------------------------------------------------------------------------------
# Report all or selected database contents if requested
#-------------------------------------------------------------------------------
if ( defined($report) ) {
    if ( defined($search_target) ) {
        #
        # Reporting a specific title
        #
        my @matches = searchTitle( $dbh, $search_target );
        if (@matches) {
            #
            # Too many matches!
            # NOTE(review): '$#matches > 1' means exactly two matches fall
            # through to the single-match branch and only $matches[0] is
            # reported; '$#matches > 0' looks like the intended test
            #
            if ( $#matches > 1 ) {
                say "Multiple matches:\n";
                say "- ", join( "\n- ", @matches );
                say "\nTry again with a different title";
            }
            else {
                #
                # One match, report it
                #
                reportFeed( findFeed( $dbh, $matches[0] ), $outfh );
            }
        }
        else {
            say "Feed not found with target '$search_target'";
        }
    }
    else {
        #
        # Reporting everything
        #
        reportDB( $dbh, $outfh );
        $LOG->info("Report generated");
    }
}

#
# Do any image repairs we need, but temporarily
#
if ($json || $opml || $template) {
    $feeds = collectData($dbh);

    #
    # Clean up the image references (until we have a proper fix): strip a
    # leading stringified arrayref like 'ARRAY(0x...)' from the image field
    #
    for (my $i = 0; $i < scalar(@$feeds); $i++) {
        my $f = $feeds->[$i];
        if (defined($f->{urls_image})
            && $f->{urls_image} =~ /^ARRAY\([^)]+\)/)
        {
            $f->{urls_image} =~ s/^(ARRAY\([^)]+\))//;
            $LOG->info( "Fixed image for $f->{urls_url}" );
        }
    }
}

#-------------------------------------------------------------------------------
# Output all feeds to a JSON file if requested
#-------------------------------------------------------------------------------
if ($json) {
    my $js = JSON->new->utf8->canonical->pretty;

    open( my $out, '>:encoding(UTF-8)', $json )
        or die "Unable to open output file $json $!\n";
    print $out $js->encode($feeds), "\n";
    close($out);

    emit ( $silent, "JSON data written to $json\n" );
    $LOG->info( "JSON data written to $json" );
}

#-------------------------------------------------------------------------------
# Requesting the -opml option means to dump the entire feed table as OPML
#-------------------------------------------------------------------------------
if ($opml) {
    require XML::OPML;

    open( my $out, '>:encoding(UTF-8)', $opml )
        or die "Unable to open output file $opml $!\n";

    #
    # Start building OPML. Mandatory attributes are 'type', 'text' and
    # 'xmlURL'. The 'title' attribute is the same as 'text'. The 'type'
    # attribute is poorly defined; this module uses 'rss' as the type and
    # 'RSS' as the value of the 'version' attribute. This is not linked to the
    # type of the feed.
    #
    my $OPML = XML::OPML->new(version => '1.1');

    #
    # Create the 'head' element
    #
    my $DT = normaliseDT(DateTime->now());
    $OPML->head(
        title        => 'Free Culture Podcasts',
        dateCreated  => $DT,
        dateModified => $DT,
    );

    #
    # Walk the feeds generating OPML
    #
    for (my $i = 0; $i < scalar(@$feeds); $i++) {
        my $f = $feeds->[$i];
        $OPML->add_outline(
            title       => $f->{urls_title},
            text        => $f->{urls_title},
            description => $f->{urls_description},
            xmlUrl      => $f->{urls_url},
            htmlUrl     => $f->{urls_link},
        );
    }

    #
    # Output OPML to the nominated file
    #
    print $out $OPML->as_string;
    close($out);

    emit ( $silent, "OPML data written to $opml\n" );
    $LOG->info( "OPML data written to $opml" );
}

#-------------------------------------------------------------------------------
# Fill and print the template if requested
#-------------------------------------------------------------------------------
if ($template) {
    my $tt = Template->new(
        {   ABSOLUTE     => 1,
            ENCODING     => 'utf8',
            INCLUDE_PATH => $basedir,
        }
    );

    #
    # Make the structure the template needs
    #
    my $vars = { feeds => $feeds, };

    # print Dumper($vars),"\n";

    my $document;
    $tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
        || die $tt->error(), "\n";
    print $outfh $document;

    emit ( $silent, "Data processed with $template\n" );
    $LOG->info( "Data processed with $template" );
}

#$dbh->disconnect;

if ($rejectfile) {
    emit( $silent,
        "Number of rejected URLs written to $rejectfile is $rejectcount\n" );
}

exit;

#=== FUNCTION ================================================================
#         NAME: loadUrls
#      PURPOSE: To load URLs read from the input file into the database
#   PARAMETERS: $dbh        database handle
#               $new_urls   arrayref containing URLs
#               $rules      WWW::RobotRules object
#               $keymap     hashref containing a map of key names to
#                           database field names
#      RETURNS: Any new URLs discovered by investigating non-feed URLs.
#  DESCRIPTION: Each URL is validated, checked against the database, checked
#               in DNS, checked for server availability, checked against the
#               site's robots.txt and then fetched. Feed URLs are parsed and
#               stored; HTML URLs may yield further feed URLs which are
#               returned to the caller for another pass.
#       THROWS: No exceptions
#     COMMENTS: Uses the file-scoped $silent, $check, $rejectfile, $rejectfh,
#               $rejectcount, %headers, $DEBUG and $LOG
#     SEE ALSO: N/A
#===============================================================================
sub loadUrls {
    my ( $dbh, $new_urls, $rules, $keymap ) = @_;

    my ( $stream, $feed );
    my ( %uridata, $roboturl, @found_urls );

    #
    # Work through the list of URLs
    #
    foreach my $rec (@$new_urls) {
        %uridata = ();

        #
        # By default save the collected data
        #
        $uridata{SAVE} = 1;

        #
        # Check that we have a valid URL. We don't save them if they are
        # invalid ($uridata{SAVE} is set to 0 in the routine).
        #
        my $uri = validateURI($rec,\%uridata);
        next unless $uri;

        #
        # Check whether we already have this URI in the database
        #
        if (dbSearch( $dbh, 'SELECT * FROM urls WHERE url = ?',
                $uridata{URI} ) )
        {
            emit( $silent, "$uri is already in the database\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the hostname resolves in DNS
        #
        if ( checkDNS( $uri, \%uridata ) ) {
            # Flatten the list of addresses into a displayable string
            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
            emit( $silent, "DNS: $uridata{DNS}\n" );
        }
        else {
            emit( $silent, "$uri has no DNS entry\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the server is available. Don't save if it's not (probably will
        # not get the feed anyway).
        #
        if ( serverUp( $uri, \%uridata ) ) {
            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
        }
        else {
            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Check the site's robots.txt. If there's a block then don't save.
        # A missing robots.txt (404) is treated as 'no restrictions'.
        #
        if ( robotRulesOK( $uri, $rules, \%uridata ) ) {
            emit( $silent, "Check of robots.txt rules succeeded\n" );
        }
        else {
            if ( $uridata{ROBOTS} =~ /404/ ) {
                emit( $silent, "Search for robots.txt file failed\n" );
            }
            else {
                emit( $silent, "Check of robots.txt rules failed\n" );
                $uridata{SAVE} = 0;
                next;
            }
        }

        #
        # Look for the HTTP content type. Don't save if the request failed.
        # Any feed links found in an HTML page are collected in @found_urls.
        #
        if ( checkContentType( $uri, \%uridata, \%headers, \@found_urls,
                $LOG ) )
        {
            emit( $silent, "HTTP request to check type OK\n" );
        }
        else {
            emit( $silent, "HTTP request failed\n" );
            $uridata{SAVE} = 0;
            next;
        }

        #
        # Parse the feed
        #
        if ( $uridata{TYPE} eq 'Feed' ) {
            $LOG->info('Processing feed: ',$uridata{URI});
            $stream = getFeed( $uridata{URI} );
            if ($stream) {
                $feed = parseFeed( $uridata{URI}, $stream );
                unless ( $feed ) {
                    $uridata{SAVE} = 0;
                    emit( $silent, "Feed did not parse $uridata{URI}\n" );
                    $LOG->warning('Feed did not parse: ',$uridata{URI});
                    next;
                }

                #
                # Provide a means of examining the returned feed object. There
                # are some weird behaviours in there (partly because of the
                # weirdness of RSS and poor adherence to what standards there
                # are).
                #
                print Dumper($feed), "\n" if ( $DEBUG > 2 );

                storeFeed($feed,\%uridata);

                #
                # Perform a check on the copyright. The routine sets
                # $uridata{SAVE} = 0 if the copyright is not acceptable.
                #
                if ( $check ne 'none' ) {
                    unless (checkCopyright( $check, \%uridata )) {
                        #
                        # Rejected, write URL to a file if requested
                        #
                        if ($rejectfile) {
                            printf $rejectfh "%s\n", $uridata{URI};
                            $rejectcount++;
                        }
                        next;
                    }
                }
            }
            else {
                emit($silent, "Problem collecting feed");
                $uridata{SAVE} = 0;
                next;
            }
        }
        elsif ( $uridata{TYPE} eq 'HTML' ) {
            #
            # TODO Save the URL and find any feeds it contains
            #
            if (@found_urls) {
                emit( $silent,
                    "Type $uridata{TYPE} contained "
                        . scalar(@found_urls)
                        . " feeds; queued to be checked for inclusion\n" );
            }
            else {
                emit( $silent,
                    "Type $uridata{TYPE} contained no feeds; skipped\n" );
            }
            next;
        }
        else {
            emit( $silent,
                "Not a known type '$uridata{HTTP_CONTENT_TYPE}'; skipped\n" );
            $uridata{SAVE} = 0;
            next;
        }
    }
    continue {
        # { This 'continue' block is executed for each iteration or 'next' }
        #
        # Decide whether to save what we have collected
        #
        if ( $uridata{SAVE} ) {
            if ( addURI( $dbh, \%uridata, $keymap ) ) {
                emit( $silent, "$uridata{URI} added to the database\n" );
                $LOG->info("$uridata{TYPE} ",$uridata{URI},
                    ' added to the database');

                #
                # Get the id the database allocated for the row we added
                #
                $uridata{URI_ID}
                    = $dbh->last_insert_id( undef, undef, 'urls', undef );

                #
                # Add any enclosures we found
                #
                if ( defined( $uridata{ENCLOSURE_COUNT} )
                    && $uridata{ENCLOSURE_COUNT} > 0 )
                {
                    if ( addEnclosures( $dbh, \%uridata ) ) {
                        emit( $silent, $uridata{ENCLOSURE_COUNT},
                            " enclosures for $uridata{URI} added to the database\n" );
                    }
                }
            }
            else {
                emit( $silent,
                    "$uridata{URI} was not added to the database\n" );
                $LOG->info("$uridata{TYPE} ",$uridata{URI},
                    ' not added to the database');
            }
        }

        #
        # Dump what we have if requested
        #
        print Dumper( \%uridata ), "\n" if ( $DEBUG > 1 );

        emit( $silent, '-' x 80, "\n" );
    }

    #
    # Return any further urls we found
    #
    return @found_urls;
}

#=== FUNCTION ================================================================
#         NAME: searchTitle
#      PURPOSE: Search the database for a feed with a given title
#   PARAMETERS: $dbh        database handle
#               $target     search target
#      RETURNS: A list of titles
#  DESCRIPTION: Counts titles matching the target as a REGEXP, then collects
#               and returns the matching titles in title order.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub searchTitle {
    my ($dbh, $target) = @_;

    my ( $sql1, $sql2, $sth, $rv, $h );
    my ( $count, @result );

    $sql1 = q{ SELECT count(*) AS count FROM urls WHERE urltype = 'Feed' AND title REGEXP ? };
    $sql2 = q{ SELECT title FROM urls WHERE urltype = 'Feed' AND title REGEXP ? ORDER BY title };

    #
    # Count the number of matches
    # NOTE(review): both 'execute' calls below use the file-scoped
    # $search_target rather than the $target parameter - works today because
    # the only caller passes $search_target, but looks unintended
    #
    $sth = $dbh->prepare($sql1);
    $rv = $sth->execute($search_target);
    if ( $dbh->err ) {
        warn $dbh->errstr;
        return;
    }

    #
    # Proceed if we have 1 or more matches
    #
    if ( $h = $sth->fetchrow_hashref ) {
        $count = $h->{count};
        $sth->finish;
        if ( $count >= 1 ) {
            $sth = $dbh->prepare($sql2);
            $rv = $sth->execute($search_target);
            if ( $dbh->err ) {
                warn $dbh->errstr;
                return;
            }
            while ( $h = $sth->fetchrow_hashref ) {
                push(@result, $h->{title});
            }
        }
        return @result;
    }
    else {
        return;
    }
}

#=== FUNCTION ================================================================
#         NAME: scanDB
#      PURPOSE: To scan the URLs in the database and update the stored data
#   PARAMETERS: $dbh        database handle
#               $keymap     hashref containing a map of key names to
#                           database field names
#      RETURNS: Nothing
#  DESCRIPTION: Collects all feed URLs from the database and re-scans each
#               one via scanFeed(), then compares the freshly-collected data
#               with what is stored (updateURI). Currently limited to the
#               first two feeds while under development.
#       THROWS: No exceptions
#     COMMENTS: The large commented-out section below is the old inline scan
#               logic, superseded by scanFeed(); kept for reference
#     SEE ALSO: scanFeed
#===============================================================================
sub scanDB {
    my ($dbh, $keymap) = @_;

    my ( $sql1, $sth1, $rv1, $h1 );
    my ( $aref, @urls, $DT, $stream, $feed );
    my ( %uridata, $urichanges, $enc_changes );

    #
    # Query to return all feed URLs
    #
    $sql1 = q{SELECT id, url FROM urls WHERE urltype = 'Feed' ORDER BY title};

    $sth1 = $dbh->prepare($sql1);
    $rv1 = $sth1->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Collect everything as an arrayref pointing to a bunch of arrayrefs
    # containing the column details requested
    #
    $aref = $sth1->fetchall_arrayref;

    #
    # Extract just the URL strings
    #
    @urls = map { $_->[1] } @{$aref};

    #
    # Now process these URLs from the database one at a time
    #
    # $sql1 = q{SELECT * FROM urls WHERE url = ?};
    # $sth1 = $dbh->prepare($sql1);
    my $count = 0;
    foreach my $url (@urls) {
        %uridata = ();
        $count++;

        scanFeed($dbh,$url,\%uridata);

#        #
#        # Record the scan for this URL
#        #
#        $DT = normaliseDT( DateTime->now() );
#        $uridata{SCANNED_ON} = $DT;
#
#        #
#        # Default to OK
#        #
#        $uridata{SCAN_OK} = 1;
#
#        emit( $silent, "Scanning '$url'\n" );
#
#        $rv1 = $sth1->execute($url);
#        if ( $dbh->err ) {
#            warn $dbh->errstr;
#        }
#        $h1 = $sth1->fetchrow_hashref;
#        emit( $silent, $h1->{title}, "\n" );
#
#        #
#        # The URL should be valid already from the point at which it was added
#        # to the database, but if we do this we get a canonical version (it
#        # sets $uridata{SAVE} which makes no sense here, but we'll just ignore
#        # it. It also sets $uridata{URI}, which is useful.)
#        #
#        my $uri = validateURI( $url, \%uridata );
#
#        #
#        # Check the hostname resolves in DNS
#        #
#        if ( checkDNS( $uri, \%uridata ) ) {
#            $uridata{DNS} = join( ", ", @{ $uridata{DNS} } );
#            emit( $silent, "DNS: $uridata{DNS}\n" );
#        }
#        else {
#            emit( $silent, "$uri has no DNS entry\n" );
#            $uridata{SCAN_OK} = 0;
#            next;
#        }
#
#        #
#        # Check the server is available
#        #
#        if ( serverUp( $uri, \%uridata ) ) {
#            emit( $silent, sprintf( "Host: %s is up\n", $uridata{HOST} ) );
#        }
#        else {
#            emit( $silent, sprintf( "Host: %s is down\n", $uridata{HOST} ) );
#            $uridata{SCAN_OK} = 0;
#            next;
#        }
#
#        #
#        # Look for the HTTP content type.
#        #
#        if ( checkContentType( $uri, \%uridata, \%headers, $LOG ) ) {
#            emit( $silent, "HTTP request OK\n" );
#        }
#        else {
#            emit( $silent, "HTTP request failed\n" );
#            $uridata{SCAN_OK} = 0;
#            next;
#        }
#
#        #
#        # Note: not doing the robots.txt check since it was done at load time
#        #
#
#        #
#        # We know we have a feed, so go get it
#        #
#        $stream = getFeed( $uridata{URI} );
#        if ($stream) {
#            #
#            # Parse the feed. The routine generates its own error messages
#            #
#            $feed = parseFeed( $uridata{URI}, $stream );
#            unless ( $feed ) {
#                $uridata{SCAN_OK} = 0;
#                next;
#            }
#
#            #
#            # Save the important feed components in the %uridata hash
#            #
#            print Dumper($feed), "\n" if ( $DEBUG > 2 );
#            storeFeed( $feed, \%uridata );
#        }
#        else {
#            emit( $silent, "Problem collecting feed" );
#            $uridata{SCAN_OK} = 0;
#            next;
#        }
#
    } continue {
        # { This 'continue' block is executed for each iteration or 'next' }
        #
        # If the scan went OK then perform a comparison between the new feed
        # data and that which is stored
        #
        if ( $uridata{SCAN_OK} ) {
            $urichanges = updateURI( $dbh, \%uridata, $keymap );
            $LOG->info( 'Feed \'', $uridata{URI}, '\' URL changes = ',
                $urichanges )
                if $urichanges > 0;

            # TODO Update enclosures

        }
        else {
            emit( $silent, "$uridata{URI} was not scanned successfully\n" );
            $LOG->info( 'Feed ', $uridata{URI}, ' not scanned successfully' );
        }

        #
        # Finished scanning this feed
        #
        emit( $silent, '-' x 80, "\n" );

        #
        # NOTE: Temporarily stop after the first N feeds
        #
        last if $count == 2;
    }

    return;
}

#=== FUNCTION ================================================================
#         NAME: scanFeed
#      PURPOSE: Performs a scan on a single feed
#   PARAMETERS: $dbh        database handle
#               $url        feed URL to look up
#               $uridata    hashref to the hash of data collected from the
#                           feed
#      RETURNS: True (1) if all the steps worked, otherwise false (0)
#  DESCRIPTION: Looks up the stored row for the URL, then repeats the DNS,
#               server and content-type checks and re-fetches and re-parses
#               the feed, storing the results through $uridata. Sets
#               $uridata->{SCAN_OK} (and SCANNED_ON) as it goes.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: scanDB
#===============================================================================
sub scanFeed {
    my ( $dbh, $url, $uridata ) = @_;

    my ( $sql, $sth, $rv, $h );
    my ( $DT, $stream, $feed );
    my ( $urichanges, $enc_changes );

    $sql = q{SELECT * FROM urls WHERE url = ?};
    $sth = $dbh->prepare($sql);

    #
    # Record the scan for this URL
    #
    $DT = normaliseDT( DateTime->now() );
    $uridata->{SCANNED_ON} = $DT;

    #
    # Default to OK
    #
    $uridata->{SCAN_OK} = 1;

    emit( $silent, "Scanning '$url'\n" );

    $rv = $sth->execute($url);
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    $h = $sth->fetchrow_hashref;
    emit( $silent, $h->{title}, "\n" );

    #
    # The URL should be valid already from the point at which it was added
    # to the database, but if we do this we get a canonical version (it
    # sets $uridata->{SAVE} which makes no sense here, but we'll just ignore
    # it. It also sets $uridata->{URI}, which is useful.)
    #
    my $uri = validateURI( $url, $uridata );

    #
    # Check the hostname resolves in DNS
    #
    if ( checkDNS( $uri, $uridata ) ) {
        $uridata->{DNS} = join( ", ", @{ $uridata->{DNS} } );
        emit( $silent, "DNS: $uridata->{DNS}\n" );
    }
    else {
        emit( $silent, "$uri has no DNS entry\n" );
        $uridata->{SCAN_OK} = 0;
        return 0;
    }

    #
    # Check the server is available
    #
    if ( serverUp( $uri, $uridata ) ) {
        emit( $silent, sprintf( "Host: %s is up\n", $uridata->{HOST} ) );
    }
    else {
        emit( $silent, sprintf( "Host: %s is down\n", $uridata->{HOST} ) );
        $uridata->{SCAN_OK} = 0;
        return 0;
    }

    #
    # Look for the HTTP content type.
    # NOTE(review): checkContentType is called with 5 arguments (including a
    # found-URLs arrayref) in loadUrls but only 4 here - confirm the routine
    # tolerates the missing argument
    #
    if ( checkContentType( $uri, $uridata, \%headers, $LOG ) ) {
        emit( $silent, "HTTP request OK\n" );
    }
    else {
        emit( $silent, "HTTP request failed\n" );
        $uridata->{SCAN_OK} = 0;
        return 0;
    }

    #
    # Note: not doing the robots.txt check since it was done at load time
    #

    #
    # We know we have a feed, so go get it
    #
    $stream = getFeed( $uridata->{URI} );
    if ($stream) {
        #
        # Parse the feed. The routine generates its own error messages
        #
        $feed = parseFeed( $uridata->{URI}, $stream );
        unless ( $feed ) {
            $uridata->{SCAN_OK} = 0;
            emit( $silent, "Feed did not parse $uridata->{URI}\n" );
            $LOG->warning('Feed did not parse: ',$uridata->{URI});
            return 0;
        }

        #
        # Save the important feed components in the %uridata hash
        #
        print Dumper($feed), "\n" if ( $DEBUG > 2 );
        storeFeed( $feed, $uridata );
    }
    else {
        emit( $silent, "Problem collecting feed" );
        $uridata->{SCAN_OK} = 0;
        return 0;
    }

    return 1;
}

#=== FUNCTION ================================================================
#         NAME: reportDB
#      PURPOSE: To generate a printed report from the database
#   PARAMETERS: $dbh        database handle
#               $fh         output file handle
#      RETURNS: Nothing
#  DESCRIPTION: Walks every feed row of the 'all_episodes' view (with the
#               date of its latest episode) and prints each one.
#       THROWS: No exceptions
#     COMMENTS: reportFeed generates a standardised report
#     SEE ALSO: N/A
#===============================================================================
sub reportDB {
    my ($dbh, $fh) = @_;

    #
    # Query to report the contents of the 'all_episodes' view with the latest
    # associated episode
    #
    my $sql = q{ SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep FROM all_episodes ae JOIN urls ON (ae.urls_id = urls.id) GROUP by urls.id HAVING ae.urls_urltype = 'Feed' ORDER BY urls_title };

    my $sth1 = $dbh->prepare($sql);
    my $rv = $sth1->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    while ( my $h1 = $sth1->fetchrow_hashref() ) {
        reportFeed( $h1, $fh );
    }

    $sth1->finish;

    return;
}

#=== FUNCTION ================================================================
#         NAME: findFeed
#      PURPOSE: Find and return a single feed by title
#   PARAMETERS: $dbh        database handle
#               $target     search target
#      RETURNS: Hashref indexing the feed
#  DESCRIPTION: Runs the same 'all_episodes' query as reportDB but restricted
#               to an exact title match; returns the first (only) row.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub findFeed {
    my ( $dbh, $target ) = @_;

    my ( $sql, $sth, $rv, $h );

    #
    # Query to report the contents of the 'all_episodes' view with the latest
    # associated episode
    #
    $sql = q{ SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep FROM all_episodes ae JOIN urls ON (ae.urls_id = urls.id) GROUP by urls.id HAVING ae.urls_urltype = 'Feed' AND ae.urls_title = ? };

    $sth = $dbh->prepare($sql);
    $rv = $sth->execute($target);
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    $h = $sth->fetchrow_hashref();
    return $h;
}

#=== FUNCTION ================================================================
#         NAME: reportFeed
#      PURPOSE: Reports a single feed from the database
#   PARAMETERS: $feed       a hash from the database containing the
#                           details of a particular feed
#               $fh         output file handle
#      RETURNS: Nothing
#  DESCRIPTION: Prints the channel-level fields (sequence @seq1) followed by
#               the latest-episode fields (sequence @seq2), using %keys to
#               map column names to labels.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub reportFeed {
    my ($feed, $fh) = @_;

    my $lwidth = 12;

    #
    # Hash for converting database keys to labels for the report and arrays
    # for controlling the sequence.
    # NOTE(review): these are 'state' variables yet they are (re)assigned on
    # every call, so the state declaration currently buys nothing
    #
    state ( %keys, @seq1, @seq2 );
    %keys = (
        'id'                => 'DB Primary key',
        'latest_ep'         => 'Latest episode date',
        'ep_author'         => 'Author',
        'ep_byte_length'    => 'Byte length',
        'ep_category'       => 'Category',
        'ep_enclosure'      => 'URL',
        'ep_ep_id'          => 'ID',
        'ep_id'             => 'Key',
        'ep_issued'         => 'Issued on',
        'ep_last_update'    => 'Last updated',
        'ep_link'           => 'Permalink',
        'ep_mime_type'      => 'MIME type',
        'ep_modified'       => 'Modified on',
        'ep_source'         => 'Source',
        'ep_title'          => 'Title',
        'ep_urls_id'        => 'URL key',
        'urls_author'       => 'Author',
        'urls_content_type' => 'Content type',
        'urls_copyright'    => 'Copyright',
        'urls_description'  => 'Description',
        'urls_dns'          => 'DNS',
        'urls_feedformat'   => 'Feed format',
        'urls_generator'    => 'Generator',
        'urls_host_up'      => 'Host up',
        'urls_http_status'  => 'HTTP status',
        'urls_id'           => 'Key',
        'urls_image'        => 'Image',
        'urls_language'     => 'Language',
        'urls_last_update'  => 'Last updated',
        'urls_link'         => 'Link',
        'urls_modified'     => 'Modified on',
        'urls_title'        => 'Title',
        'urls_url'          => 'Feed URL',
        'urls_urltype'      => 'URL type',
        'urls_parent_id'    => 'Parent ID',
        'urls_child_count'  => 'Child count',
    );
    @seq1 = (
        'urls_title', 'urls_url', 'urls_feedformat', 'urls_link',
        'urls_author', 'urls_content_type', 'urls_copyright',
        'urls_description', 'urls_dns', 'urls_generator', 'urls_host_up',
        'urls_http_status', 'urls_image', 'urls_language',
        'urls_last_update', 'urls_modified', 'urls_parent_id',
        'urls_child_count',
    );
    @seq2 = (
        'ep_title', 'ep_enclosure', 'ep_category', 'ep_link', 'ep_author',
        'ep_byte_length', 'ep_ep_id', 'ep_mime_type', 'ep_issued',
        'ep_last_update', 'ep_modified', 'ep_source',
    );

    if ($feed) {
        print $fh "Channel:\n";
        foreach my $key (@seq1) {
            printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
                coalesce( $feed->{$key}, '--' );
        }

        print $fh "\nLatest episode:\n";
        foreach my $key (@seq2) {
            printf $fh " %-*s: %s\n", $lwidth, $keys{$key},
                coalesce( $feed->{$key}, '--' );
        }

        print $fh "\n";
    }

    return;
}

#=== FUNCTION

#===============================================================================
#         NAME: collectData
#      PURPOSE: Collects data from the database for generating a report
#   PARAMETERS: $dbh            database handle
#      RETURNS: An arrayref of hashrefs from the query
#  DESCRIPTION: Runs a fixed query looking for feed details in the 'urls'
#               table and the 'all_episodes' view, showing the date of the
#               latest episode. The result is an array of rows, each
#               represented as a hash, all sorted by the feed title.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub collectData {
    my ($dbh) = @_;

    #
    # Query to report only the feeds from the contents of the 'urls' table
    # with the details of the latest episode
    #
    my $sql = q{
        SELECT urls.id, ae.*,
            max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
        FROM all_episodes ae
        JOIN urls ON (ae.urls_id = urls.id)
        GROUP by urls.id
        HAVING ae.urls_urltype = 'Feed'
        ORDER BY ae.urls_title
    };

    my $sth = $dbh->prepare($sql);
    $sth->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    #
    # Return everything as an arrayref of hashrefs
    #
    my $rows = $sth->fetchall_arrayref( {} );
    $sth->finish;
    return $rows;
}

#=== FUNCTION ================================================================
#         NAME: countRows
#      PURPOSE: To count the rows in a table
#   PARAMETERS: $dbh            database handle
#               $sql            SQL expression to use
#      RETURNS: Number of rows found; 0 if the query returned no row at all
#  DESCRIPTION: Prepares and executes the given (aggregate) query and returns
#               the first column of the first row.
#       THROWS: No exceptions
#     COMMENTS: The previous version dereferenced the fetched row
#               unconditionally, which died when the query returned no rows;
#               that case now yields 0.
#     SEE ALSO: N/A
#===============================================================================
sub countRows {
    my ( $dbh, $sql ) = @_;

    my $sth = $dbh->prepare($sql);
    $sth->execute();
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    my $row = $sth->fetch;
    $sth->finish;

    # Guard against an empty result set before dereferencing
    return 0 unless defined($row);
    return $row->[0];
}

#=== FUNCTION ================================================================
#         NAME: dbSearch
#      PURPOSE: To perform a simple search in the database
#   PARAMETERS: $dbh            database handle
#               $sql            SQL expression to use (expected to be
#                               a SELECT)
#               @args           arguments for the 'execute'
#      RETURNS: True (1) if the row exists, otherwise false (0).
#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
#               SQL without incurring the overhead of repeated 'prepare'
#               calls. Only the first row is fetched (we expect there to be
#               only one) and the success or failure is determined by its
#               existence.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub dbSearch {
    my ( $dbh, $sql, @args ) = @_;

    my $sth = $dbh->prepare_cached($sql);
    $sth->execute(@args);
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }

    my $row = $sth->fetchrow_hashref();
    $sth->finish;

    return defined($row);
}

#=== FUNCTION ================================================================
#         NAME: execSQL
#      PURPOSE: To perform a non-SELECT query
#   PARAMETERS: $dbh            database handle
#               $sql            SQL expression to use
#               @args           arguments for the 'execute'
#      RETURNS: True if the query succeeded, otherwise undef.
#  DESCRIPTION: Uses 'prepare_cached' to allow repeated calls with the same
#               SQL without incurring the overhead of repeated 'prepare'
#               calls. Failures are caught and reported rather than being
#               allowed to propagate.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub execSQL {
    my ( $dbh, $sql, @args ) = @_;

    my ( $sth, $rv );

    $sth = $dbh->prepare_cached($sql);
    try {
        $rv = $sth->execute(@args);
    }
    catch {
        warn "Problem with query '$sql'\n";
        if ( $dbh->err ) {
            warn '** ' . $dbh->errstr;
        }
    };
    $sth->finish;

    return $rv;
}

#=== FUNCTION ================================================================
#         NAME: validateURI
#      PURPOSE: Checks a URL for validity
#   PARAMETERS: $rec            the URL handed to the script
#               $uridata        hashref containing data for this URI
#      RETURNS: A URI object if valid otherwise undef
#  DESCRIPTION: The URL string is validated with the URI module. A canonical
#               string version is stored in the hash referenced by $uridata
#               if valid, otherwise the URL is marked as not to be saved.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub validateURI {
    my ( $rec, $uridata ) = @_;

    #
    # Check that we have a valid URL. We don't save them if they are
    # invalid. ('http' is the scheme assumed for scheme-less URLs.)
    #
    my $uri = URI->new( $rec, 'http' );
    unless ( $uri->scheme ) {
        emit( $silent, "URI $uri is not valid\n" );
        $uridata->{SAVE} = 0;
        return;
    }

    emit( $silent, "URI $uri is valid\n" );
    $uridata->{URI} = $uri->canonical->as_string;
    return $uri;
}

#=== FUNCTION ================================================================
#         NAME: checkDNS
#      PURPOSE: Looks up a host DNS entry
#   PARAMETERS: $uri            URI object
#               $uridata        hashref containing data for this URI
#      RETURNS: True (1) if the DNS query was successful, otherwise false (0)
#  DESCRIPTION: The host name is extracted from the URI (and stored). The
#               hostname is searched for in the DNS and if successful, an
#               array of addresses from the 'A' records is built. This is
#               sorted and stored in the hash referenced by $uridata.

#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub checkDNS {
    my ( $uri, $uridata ) = @_;

    my @addresses;

    my $hostname = $uri->host;
    $uridata->{HOST} = $hostname;

    my $resolver = Net::DNS::Resolver->new;
    my $query    = $resolver->search($hostname);

    unless ($query) {
        warn "Query failed: ", $resolver->errorstring, "\n";
        return 0;
    }

    # Collect the addresses from any 'A' records in the answer, sorted
    foreach my $rr ( $query->answer ) {
        next unless $rr->type eq "A";
        push( @addresses, $rr->address );
    }
    @addresses = sort(@addresses);
    $uridata->{DNS} = \@addresses;
    return 1;
}

#=== FUNCTION ================================================================
#         NAME: serverUp
#      PURPOSE: Checks that a given host is responding
#   PARAMETERS: $uri            URI object
#               $uridata        hashref containing data for this URI
#      RETURNS: True (1) if the host responds to a TCP connection, false (0)
#               otherwise
#  DESCRIPTION: Given an URL parses out the hostname and the port (defaulting
#               to the appropriate default for the scheme, such as 80 for
#               http). Attempts to connect to this host and port with a
#               10-second timeout, recording the outcome in HOSTUP in the
#               hash pointed to by $uridata.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub serverUp {
    my ( $uri, $uridata ) = @_;

    my $hostname = $uri->host();
    my $port     = $uri->port();
    $uridata->{PORT} = $port;

    my $socket = IO::Socket::INET->new(
        Proto    => "tcp",
        PeerAddr => $hostname,
        PeerPort => $port,
        Reuse    => 1,
        Timeout  => 10
    );

    if ($socket) {
        $socket->close;
        $uridata->{HOSTUP} = 1;
        return 1;
    }

    $uridata->{HOSTUP} = 0;
    return 0;
}

#=== FUNCTION ================================================================
#         NAME: robotRulesOK
#      PURPOSE: To check the intended URL against the site's robots.txt rules
#   PARAMETERS: $uri            URI object
#               $rules          WWW::RobotRules object
#               $uridata        hashref containing data for this URI
#      RETURNS: True if the GET of the robots.txt file succeeded and the
#               rules allow the URI object to be fetched; false if the rules
#               forbid it; undef if robots.txt could not be fetched.
#  DESCRIPTION: Builds the robots.txt URL for the site, downloads it, feeds
#               it to the WWW::RobotRules object and asks whether the URI is
#               allowed. The outcome of the download is recorded in ROBOTS.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub robotRulesOK {
    my ( $uri, $rules, $uridata ) = @_;

    #
    # Build the URL of robots.txt
    #
    my $roboturl = $uri->scheme . '://' . $uri->host . '/robots.txt';

    #
    # Use LWP::UserAgent to get the feed and handle errors
    #
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent("$PROG/$VERSION");

    my $response = $ua->get($roboturl);
    unless ( $response->is_success ) {
        $uridata->{ROBOTS} = $response->status_line;
        warn "Failed to get $roboturl\n";
        warn $response->status_line . "\n";
        return;    # undef
    }

    $uridata->{ROBOTS} = 'Found';
    $rules->parse( $roboturl, $response->decoded_content );
    return $rules->allowed("$uri");
}

#=== FUNCTION ================================================================
#         NAME: checkContentType
#      PURPOSE: Check the content_type of the url
#   PARAMETERS: $uri            URI object
#               $uridata        hashref containing data for this URI
#               $headers        hashref containing query headers
#               $children       arrayref to return any 'child' feeds we might
#                               find; they are appended to the array
#               $log            Log::Handler object
#      RETURNS: True (1) if all was well, otherwise false (0)
#  DESCRIPTION: Ensures that we are pulling txt/html/xml. We get the headers
#               for the URI object using the LWP::UserAgent head method. Then
#               we examine the 'content-type' header looking for the string
#               'xml' or 'html' in it. The former denotes a feed, and the
#               latter a normal HTML page.
#               We use Feed::Find if the page is HTML in order to find any
#               "child" feeds which we really want to process if we find any.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub checkContentType {
    my ( $uri, $uridata, $headers, $children, $log ) = @_;

    $uridata->{HTTP_STATUS} = 'Unknown';

    my $browser = LWP::UserAgent->new or return 0;
    my $response = $browser->head( $uri->as_string, %{$headers} )
        or return 0;
    $uridata->{HTTP_STATUS} = $response->status_line;

    unless ( $response->is_success ) {
        $uridata->{HTTP_CONTENT_TYPE} = $uridata->{TYPE} = 'Unknown';
        return 0;
    }

    my $ctype = $response->header('content-type');
    $uridata->{HTTP_CONTENT_TYPE} = $ctype;

    #
    # Decode the content-type we received
    #
    if ( $ctype =~ m|xml|i ) {
        $uridata->{TYPE} = 'Feed';
    }
    elsif ( $ctype =~ m|html|i ) {
        $uridata->{TYPE} = 'HTML';
    }
    else {
        $uridata->{TYPE} = 'Unknown';
    }

    #
    # Sometimes perfectly valid feeds misleadingly return text/html so we
    # try to spot such cases here and adjust the internal type accordingly
    #
    if ( $uridata->{TYPE} eq 'HTML' ) {
        my @feeds = Feed::Find->find( $uri->as_string );
        if ( scalar(@feeds) == 1 && $feeds[0] eq $uri->as_string ) {
            emit( $silent, "Feed found with wrong content-type\n" );
            $uridata->{TYPE} = 'Feed';
        }
        else {
            #
            # This is HTML and we found 'child' feeds of some kind
            #
            emit( $silent, "Found ", scalar(@feeds),
                " feeds within this HTML page\n" );
            print Dumper( \@feeds ), "\n" if $DEBUG > 0;
            push( @{$children}, @feeds );
        }
    }

    $log->info( "URL content classified as: ", $uridata->{TYPE} );
    emit( $silent, "URL content classified as: ", $uridata->{TYPE}, "\n" );
    return 1;
}

#=== FUNCTION ================================================================
#         NAME: getFeed
#      PURPOSE: Download the contents of a feed URL
#   PARAMETERS: $feed_url       URL of the feed to download
#      RETURNS: String representation of the feed contents or undef if the
#               download failed.
#  DESCRIPTION: Issues a GET on the URL which is expected to be a feed (but
#               need not be). If successful the contents are decoded and
#               returned, otherwise undef is returned.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub getFeed {
    my ($feed_url) = @_;

    #
    # Use LWP::UserAgent to get the feed and handle errors
    #
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent("$PROG/$VERSION");

    my $response = $ua->get($feed_url);
    unless ( $response->is_success ) {
        warn "Failed to get $feed_url\n";
        warn $response->status_line, "\n";
        return;    # undef
    }

    return $response->decoded_content;
}

#=== FUNCTION ================================================================
#         NAME: parseFeed
#      PURPOSE: Parse a podcast feed that has already been downloaded
#   PARAMETERS: $feed_url       URL of the feed previously downloaded
#               $feed_content   String containing the content of the feed, for
#                               parsing
#      RETURNS: An XML::Feed object containing the parsed feed or undef if the
#               parse failed
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A

#===============================================================================
sub parseFeed {
    my ( $feed_url, $feed_content ) = @_;

    my $feed;

    #
    # Catch errors, reporting them and leaving $feed undefined if any are
    # triggered.
    #
    # NOTE: a 'return' inside a Try::Tiny try{} or catch{} block exits that
    # block's anonymous sub, NOT parseFeed (a documented Try::Tiny caveat),
    # so the previous 'return' statements here were no-ops. We now just warn
    # and let the final 'return $feed' deliver undef on failure.
    #
    try {
        $feed = XML::Feed->parse( \$feed_content );
        unless ($feed) {
            #
            # Something went wrong. Abort this feed
            #
            warn "Warning: Failed to parse $feed_url: ",
                XML::Feed->errstr, "\n";
        }
    }
    catch {
        warn "Warning: Failed to parse $feed_url: $_\n";
        $feed = undef;
    };

    return $feed;
}

##=== FUNCTION ================================================================
##         NAME: parseFeedPP
##      PURPOSE: Parse a podcast feed that has already been downloaded
##   PARAMETERS: $feed_url       URL of the feed previously downloaded
##               $feed_content   String containing the content of the feed, for
##                               parsing
##      RETURNS: An XML::FeedPP object containing the parsed feed or undef if the
##               parse failed
##  DESCRIPTION:
##       THROWS: No exceptions
##     COMMENTS: None
##     SEE ALSO: N/A
##===============================================================================
#sub parseFeedPP {
#    my ( $feed_url, $feed_content ) = @_;
#
#    my $feed;
#
#    try {
#        $feed = XML::FeedPP->parse( \$feed_content, -type => 'string' );
#        unless ($feed) {
#            #
#            # Something went wrong. Abort this feed
#            #
#            warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
#            return;    # undef
#        }
#    }
#    catch {
#        warn "Failed to parse $feed_url: ", $@, "\n";
#        return;
#    }
#
#    return $feed;
#
#}

#=== FUNCTION ================================================================
#         NAME: storeFeed
#      PURPOSE: Stores feed attributes in a hash
#   PARAMETERS: $feed           XML::Feed object returned from parsing the
#                               feed
#               $uridata        hashref containing data for this URI
#      RETURNS: Nothing
#  DESCRIPTION: Copies the channel-level attributes of the parsed feed into
#               the hash referenced by $uridata, normalising dates,
#               flattening any array values and cleaning the description.
#               The feed's entries are saved under ENCLOSURES with a count
#               in ENCLOSURE_COUNT.
#       THROWS: No exceptions
#     COMMENTS: NOTE(review): title and format are assumed to be defined by
#               the parser; an undef here would provoke 'uninitialized'
#               warnings from the substitutions -- confirm against real
#               feeds.
#     SEE ALSO: N/A
#===============================================================================
sub storeFeed {
    my ( $feed, $uridata ) = @_;

    # Trim leading/trailing whitespace from the title and the feed format
    ( $uridata->{TITLE}  = $feed->title )  =~ s/(^\s+|\s+$)//g;
    ( $uridata->{FORMAT} = $feed->format ) =~ s/(^\s+|\s+$)//g;
    $uridata->{LINK}        = $feed->link;
    $uridata->{DESCRIPTION} = clean_string( $feed->tagline );
    $uridata->{AUTHOR}      = $feed->author;
    $uridata->{MODIFIED}    = normaliseDT( $feed->modified );
    $uridata->{IMAGE}       = flattenArray( $feed->image );
    $uridata->{COPYRIGHT}   = $feed->copyright;
    $uridata->{GENERATOR}   = $feed->generator;
    $uridata->{LANGUAGE}    = $feed->language;

    #print coalesce($feed->webMaster,'No webMaster'),"\n";

    $uridata->{ENCLOSURES}      = extractEnclosures($feed);
    $uridata->{ENCLOSURE_COUNT} = scalar( @{ $uridata->{ENCLOSURES} } );
}

#=== FUNCTION ================================================================
#         NAME: checkCopyright
#      PURPOSE: Ask the user to check, or applies simple rules, to accept or
#               reject a feed based on the copyright
#   PARAMETERS: $checkmode      the mode string: auto, manual or none
#               $uridata        reference to the hash containing details of
#                               this feed
#      RETURNS: 1 (true) if the feed is to be added, 0 (false) if not
#  DESCRIPTION: In 'manual' mode the user is prompted for a yes/no decision.
#               Otherwise the feed is accepted when its copyright string is
#               empty or matches a pattern of acceptable licences (Creative
#               Commons variants). The decision is recorded in SAVE and
#               logged.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub checkCopyright {
    my ( $checkmode, $uridata ) = @_;

    my ( $copyright, $re, $decision );

    $LOG->info("Checking copyright of feed (mode: $checkmode)");

    if ( $checkmode eq 'manual' ) {
        #
        # Prompt the user, failing gracefully if there's
        # a problem. If the user types 'Y' or 'y' we accept the
        # feed, otherwise we do not (thus a blank return = 'no').
        #
        try {
            printf STDERR "Feed '%s' has the copyright string:\n%s\n",
                $uridata->{TITLE}, coalesce( $uridata->{COPYRIGHT}, '' );
            $decision = prompt(
                -in     => *STDIN,
                -out    => *STDERR,
                -prompt => 'OK to add this feed?',
                -style  => 'bold red underlined',
                -yes
            );
        }
        catch {
            warn "Problem processing copyright decision: $_";
            $decision = 0;
        };
    }
    else {
        #
        # Careful. Un-escaped spaces are ignored under /x. The '.' wildcards
        # in the 'Attribution...' branch are deliberate (they match varying
        # punctuation in licence names); the dot in 'creativecommons.org' is
        # now escaped so it only matches a literal dot (previously it
        # matched any character).
        #
        $re = qr{(
            CC|
            Creative\ Commons|
            creativecommons\.org|
            Attribution.NonCommercial.No.?Derivatives?
        )}xmi;
        $decision  = 0;
        $copyright = coalesce( $uridata->{COPYRIGHT}, '' );
        emit( $silent, "Copyright: '$copyright'\n" );
        $LOG->info("Copyright: '$copyright'");
        if ( $copyright eq '' || $copyright =~ /$re/ ) {
            $decision = 1;
        }
    }

    #
    # Take action on the decision (or default)
    #
    $uridata->{SAVE} = $decision;
    if ($decision) {
        emit( $silent, "Feed added\n" );
        $LOG->info('Copyright OK');
        return 1;
    }
    else {
        emit( $silent, "Feed not added\n" );
        $LOG->info('Copyright not OK');
        return 0;
    }
}

#=== FUNCTION ================================================================
#         NAME: parseRSS
#      PURPOSE: Attempt to parse a feed as RSS
#   PARAMETERS: $feed_url       URL of the feed previously downloaded
#               $feed_content   String containing the content of the feed, for
#                               parsing
#      RETURNS: An XML::RSS object containing the parsed feed or undef if the
#               parse failed
#  DESCRIPTION: ** Incomplete ** The stub previously parsed the feed and then
#               discarded the result; it now warns on failure and returns the
#               parsed object on success.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub parseRSS {
    my ( $feed_url, $feed_content ) = @_;

    my $rss = XML::RSS->parse( \$feed_content );
    unless ($rss) {
        warn "Warning: Failed to parse $feed_url as RSS\n";
        return;    # undef
    }
    return $rss;
}

#=== FUNCTION ================================================================
#         NAME: addURI
#      PURPOSE: Adds the data for a URI to the database
#

#   PARAMETERS: $dbh            database handle
#               $uridata        hashref containing data for the current URI
#               $keymap         hashref containing a map of key names to
#                               database field names
#      RETURNS: True (1) if the insert succeeded, false (0) otherwise
#  DESCRIPTION: The hash keys are defined as an array to make it easy to slice
#               the hash and the SQL is defined internally using the size of
#               the key array as a guide to the number of '?' placeholders.
#               These are passed to execSQL to do the work.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub addURI {
    my ( $dbh, $uridata, $keymap ) = @_;

    my @keys = (
        'URI',               'DNS',    'HOSTUP', 'HTTP_STATUS',
        'HTTP_CONTENT_TYPE', 'TYPE',   'FORMAT', 'TITLE',
        'DESCRIPTION',       'AUTHOR', 'MODIFIED', 'LINK',
        'IMAGE',             'COPYRIGHT', 'GENERATOR', 'LANGUAGE',
    );

    #
    # Build 'INSERT INTO urls (<columns>) VALUES(?,...)' with one
    # placeholder per key, mapping internal key names to column names
    #
    my $sql
        = 'INSERT INTO urls ('
        . join( ",", @{$keymap}{@keys} ) . ') '
        . 'VALUES('
        . join( ',', ('?') x scalar(@keys) ) . ')';

    print "addURI query: $sql\n" if $DEBUG > 0;

    return execSQL( $dbh, $sql, @{$uridata}{@keys} );
}

#=== FUNCTION ================================================================
#         NAME: extractEnclosures
#      PURPOSE: Builds an array of hashes containing enclosure data from an
#               XML::Feed object
#   PARAMETERS: $feed           XML::Feed object
#      RETURNS: A reference to the array of anonymous hashes built from the
#               feed.
#  DESCRIPTION: The XML::Feed object is expected to contain an array of
#               entries. These are converted to hashes, references to which
#               are stored in an array. The two DateTime components are
#               converted to ISO8601 dates. If there is an enclosure then its
#               elements are saved. Note that there could be multiple
#               enclosures per item, but XML::Feed does not cater for them
#               unless explicitly requested. We do not deal with such a case
#               here.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub extractEnclosures {
    my ($feed) = @_;

    my @encs;

    foreach my $entry ( $feed->entries ) {
        my %ent;
        $ent{title}    = $entry->title;
        $ent{base}     = $entry->base;
        $ent{link}     = $entry->link;
        $ent{category} = join( ", ", $entry->category );
        $ent{author}   = $entry->author;
        $ent{id}       = $entry->id;
        $ent{issued}   = normaliseDT( $entry->issued );
        $ent{modified} = normaliseDT( $entry->modified );

        #
        # Work around a bug in XML::Feed where the source method is only
        # available for Atom feeds. TODO report this.
        #
        if ( $entry->isa('XML::Feed::Entry::Format::Atom') ) {
            $ent{source} = $entry->source;
        }

        my ($enclosure) = $entry->enclosure;
        if ( defined($enclosure) ) {
            $ent{url}    = $enclosure->url;
            $ent{type}   = $enclosure->type;
            $ent{length} = $enclosure->length;
        }

        push( @encs, \%ent );
    }

    return \@encs;
}

#=== FUNCTION ================================================================
#         NAME: addEnclosures
#      PURPOSE: Adds episodes extracted from a feed into the database
#   PARAMETERS: $dbh            database handle
#               $uridata        hashref containing data for the current URI
#                               including an arrayref of hashrefs of episode
#                               data
#      RETURNS: True (1) if all the inserts succeeded, false (0) otherwise
#  DESCRIPTION: The SQL is defined internally and the hash keys are defined as
#               an array to make it easy to slice the hash. The enclosures (or
#               more correctly, feed items) are present in the hash as an
#               array of anonymous hashes. These are processed one at a time
#               and inserted into the database. A count of the number of
#               successful inserts is kept. This is compared with the number
#               of enclosures to determine the boolean value to return.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub addEnclosures {
    my ( $dbh, $uridata ) = @_;

    my $sql = q{INSERT INTO episodes (urls_id, link, enclosure, title,
        author, category, source, ep_id, issued, modified, byte_length,
        mime_type) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)};
    my @keys = (
        'link',     'url',    'title', 'author',
        'category', 'source', 'id',    'issued',
        'modified', 'length', 'type'
    );

    my $successes = 0;
    foreach my $enc ( @{ $uridata->{ENCLOSURES} } ) {
        if ( execSQL( $dbh, $sql, $uridata->{URI_ID}, @{$enc}{@keys} ) ) {
            $successes++;
        }
        else {
            emit( $silent, "Failed to add episode $enc->{url}\n" );
        }
    }

    return ( $successes == $uridata->{ENCLOSURE_COUNT} );
}

#=== FUNCTION ================================================================
#         NAME: updateURI
#      PURPOSE: Compare the data in a hash with that in the database
#   PARAMETERS: $dbh            database handle
#               $uridata        hashref containing data for the current URI
#                               including an arrayref of hashrefs of episode
#                               data
#               $keymap         hashref containing a map of key names to
#                               database field names
#      RETURNS: The number of changes made
#  DESCRIPTION: Fetches the 'urls' row matching the URL, compares each mapped
#               field with the freshly collected values and, if any differ,
#               builds and runs an UPDATE via SQL::Abstract.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub updateURI {
    my ( $dbh, $uridata, $keymap ) = @_;

    my ( %fieldvals, %where );
    my ( $diffs, $updates ) = ( 0, 0 );

    my @keys = (
        'URI',               'DNS',    'HOSTUP', 'HTTP_STATUS',
        'HTTP_CONTENT_TYPE', 'TYPE',   'FORMAT', 'TITLE',
        'DESCRIPTION',       'AUTHOR', 'MODIFIED', 'LINK',
        'IMAGE',             'COPYRIGHT', 'GENERATOR', 'LANGUAGE',
    );

    #
    # Get the row from the urls table
    #
    my $sql1 = q{SELECT * FROM urls WHERE url = ?};
    my $sth1 = $dbh->prepare($sql1);
    $sth1->execute( $uridata->{URI} );
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    my $h1 = $sth1->fetchrow_hashref;

    #
    # No matching row: nothing to compare or update. (Previously this fell
    # through, autovivified $h1 and attempted an UPDATE with an undef id.)
    #
    unless ($h1) {
        warn "No urls row found for $uridata->{URI}\n";
        return 0;
    }

    for my $key (@keys) {
        unless ( equal( $uridata->{$key}, $h1->{ $keymap->{$key} } ) ) {
            $diffs++;

            #
            # Accumulate changed values keyed by the *database column name*.
            # (The original keyed %fieldvals by the internal name, which
            # generated an UPDATE against non-existent columns such as
            # 'URI', 'TYPE' or 'HOSTUP'.)
            #
            $fieldvals{ $keymap->{$key} } = $uridata->{$key};

            #
            # Temporary report
            #
            print "Difference: ($key)\n";
            print "   Feed:     ", coalesce( $uridata->{$key}, '' ), "\n";
            print "   Database: ",
                coalesce( $h1->{ $keymap->{$key} }, '' ), "\n";
        }
    }

    if ( $diffs > 0 ) {
        #
        # Prepare to use SQL::Abstract
        #
        my $sql = SQL::Abstract->new();

        #
        # Build the update statement
        #
        $where{id} = $h1->{id};
        my ( $stmt, @bindvals )
            = $sql->update( 'urls', \%fieldvals, \%where );

        # Temporary
        print "Statement: $stmt\n";
        print "Bind values: ", join( ",", @bindvals ), "\n";

        #
        # Perform the updates
        #
        my $sth2 = $dbh->prepare($stmt);
        $sth2->execute(@bindvals);
        if ( $dbh->err ) {
            warn "Processing $h1->{url}\n", $dbh->errstr;
        }
        else {
            emit( $silent, "Updated $h1->{url}\n" );
            $updates++;
        }
    }

    return $updates;
}

#=== FUNCTION ================================================================
#         NAME: updateEnclosures
#      PURPOSE: Update the enclosures stored with a feed URL
#   PARAMETERS: $dbh            database handle
#               $uridata        hashref containing data for the current URI
#                               including an arrayref of hashrefs of episode
#                               data
#      RETURNS: The number of changes made
#  DESCRIPTION: ** Incomplete ** Fetches the first episode row belonging to
#               the feed and currently does nothing with it.
#       THROWS: No exceptions
#     COMMENTS: TODO(review): the SQL selects by urls.title but is executed
#               with $uridata->{URI} (the URL) as the bind value, so the
#               subquery can never match; presumably it should bind the
#               title or select by url. Left as found pending completion of
#               this function.
#     SEE ALSO: N/A
#===============================================================================
sub updateEnclosures {
    my ( $dbh, $uridata ) = @_;

    my ( $sql1, $sth1, $rv1, $h1 );
    my ( %fieldvals, %where );
    my ( $diffs, $updates ) = ( 0, 0 );

    #
    # Get the enclosures associated with this feed
    #
    $sql1 = q{
        SELECT * FROM episodes
        WHERE urls_id = (SELECT id FROM urls WHERE title = ?)
    };
    $sth1 = $dbh->prepare($sql1);
    $rv1  = $sth1->execute( $uridata->{URI} );
    if ( $dbh->err ) {
        warn $dbh->errstr;
    }
    $h1 = $sth1->fetchrow_hashref;

}

#=== FUNCTION ================================================================
#         NAME: equal
#      PURPOSE: Compare two strings even if undefined
#   PARAMETERS: $s1             The first string
#               $s2             The second string
#      RETURNS: True if both strings are undefined, false if one isn't
#               defined, otherwise the result of comparing them.

#  DESCRIPTION: Works on the principle that two undefined strings are equal,
#               a defined and an undefined string are not, and otherwise they
#               are equal if they are equal!
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO:
#===============================================================================
sub equal {
    my ( $s1, $s2 ) = @_;

    return 1 if ( !defined($s1) && !defined($s2) );
    return 0 if ( !defined($s1) || !defined($s2) );
    return ( $s1 eq $s2 );
}

#=== FUNCTION ================================================================
#         NAME: clean_string
#      PURPOSE: Clean a string of non-printables, newlines, multiple spaces
#   PARAMETERS: $str            The string to process
#      RETURNS: The processed string
#  DESCRIPTION: Removes leading and trailing whitespace, turns CR/LF
#               sequences into single spaces, removes remaining
#               non-printable characters and squashes runs of spaces.
#       THROWS: No exceptions
#     COMMENTS: The previous implementation used "tr/[[:graph:]]//c" to
#               remove non-printables, but tr/// does not understand POSIX
#               character classes and, with an empty replacement list and no
#               /d flag, only counts without modifying -- so non-printables
#               were never removed. s/// is used instead.
#     SEE ALSO: perlop, the section on tr///
#===============================================================================
sub clean_string {
    my ($str) = @_;

    $str =~ s/(^\s+|\s+$)//g;      # Trim leading/trailing whitespace
    $str =~ s/[\x0A\x0D]+/ /g;     # CR/LF sequences become one space
    $str =~ s/[^[:print:]]//g;     # Drop remaining non-printables
    $str =~ s/ +/ /g;              # Squash multiple spaces

    return $str;
}

#=== FUNCTION ================================================================
#         NAME: normaliseDT
#      PURPOSE: Normalise an ISO8601 date for comparison, etc.
#   PARAMETERS: $dt             an ISO8601 date string (or undef)
#      RETURNS: The date formatted as an SQLite-style ISO8601 string, or
#               undef if the argument was undef
#  DESCRIPTION: Parses the incoming value with DateTime::Format::ISO8601 and
#               re-formats it with DateTime::Format::SQLite so that all
#               stored dates share one canonical form.
#       THROWS: No exceptions
#     COMMENTS: NOTE(review): $dt is treated as a string here (it is fed to
#               parse_datetime), not as a DateTime object as the original
#               header comment claimed.
#     SEE ALSO: N/A
#===============================================================================
sub normaliseDT {
    my ($dt) = @_;

    my $p = DateTime::Format::SQLite->new();

    return (
        defined($dt)
        ? $p->format_datetime(
            DateTime::Format::ISO8601->parse_datetime($dt) )
        : undef
    );
}

#=== FUNCTION ================================================================
#         NAME: flattenArray
#      PURPOSE: Turns an arrayref into a simple list in a string
#   PARAMETERS: $item - the item that may be an arrayref
#      RETURNS: The plain item if it's not an array otherwise the flattened
#               version
#  DESCRIPTION: If the argument is an array reference its elements are joined
#               with ', ' into a single string; anything else is returned
#               unchanged.
#       THROWS: No exceptions
#     COMMENTS: The original computed the joined string but then returned the
#               unmodified $item, so arrayrefs were never flattened; the
#               computed result is now returned.
#     SEE ALSO: N/A
#===============================================================================
sub flattenArray {
    my ($item) = @_;

    return $item unless ( ref($item) eq 'ARRAY' );
    return join( ", ", @{$item} );
}

#=== FUNCTION ================================================================
#         NAME: coalesce
#      PURPOSE: To find the defined argument and return it
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument, or undef if none are defined
#  DESCRIPTION: Scans the argument list left to right and returns the first
#               element that is defined (mirroring SQL's COALESCE).
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    foreach (@_) {
        return $_ if defined($_);
    }
    return;    # undef
}

#=== FUNCTION ================================================================
#         NAME: emit
#      PURPOSE: Print text on STDERR unless silent mode has been selected
#   PARAMETERS: - Boolean indicating whether to be silent or not
#               - list of arguments to 'print'
#      RETURNS: Nothing
#  DESCRIPTION: This is a wrapper around 'print' to determine whether to send
#               a message to STDERR depending on a boolean.

#               We need this to be able to make the script silent when the
#               -silent option is selected
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub emit {
    unless (shift) {
        print STDERR @_;
    }
}

#=== FUNCTION ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref         Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION: Declares the options the script accepts and hands them to
#               Getopt::Long::GetOptions, storing the parsed values in the
#               hash referenced by $optref. On a parse failure a short usage
#               message (including the version number) is produced and the
#               script exits via pod2usage.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = (
        "help",    "manpage",   "debug=i",    "silent!",
        "load=s",  "delete=s",  "scan!",      "report:s",
        "check:s", "json:s",    "opml:s",     "config=s",
        "out=s",   "rejects:s", "template:s",
    );

    if ( !GetOptions( $optref, @options ) ) {
        pod2usage( -msg => "Version $VERSION\n", -exitval => 1 );
    }

    return;
}

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{

=head1 NAME

feedWatcher - watch a collection of podcast feeds

=head1 VERSION

This documentation refers to I<feedWatcher> version 0.1.1

=head1 USAGE

    feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
        [-check[=mode]] [-out=FILE] [-json[=FILE]] [-opml[=FILE]]
        [-template=FILE] [-[no]silent] [-config=FILE] [-debug=N] [URL ...]

    # Load URLs from a file, perform checks and redirect output to standard
    # output and a named file
    ./feedWatcher -load=feedWatcher_dumped_URLs.txt -check=auto | \
        tee load_$(date +'%Y%m%d_%H%M%S')

    # Generate Markdown output with a template writing to a named file
    ./feedWatcher -tem=feedWatcher_3.tpl -out=feedWatcher.mkd

=head1 ARGUMENTS

Arguments are optional and may consist of an arbitrarily long list of URLs
to be processed and possibly added to the database by the script.
These URLs are prepended to any which may be provided through the
B<-load=FILE> option described below. The script makes sure the URL
collection contains no duplicates.

=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message describing the usage of the program, and then
exits. The full documentation can be displayed with the command:

    perldoc -oman feedWatcher

Alternatively, use the option B<-manpage>

=item B<-manpage>

Prints the entire documentation for the script. Note that this mode uses a
simpler type of formatter than I<perldoc>.

=item B<-load=FILE>

Defines a file from which new URLs are to be added to the database. These
URLs are checked in various ways before adding to the database. If arguments
are provided when the script is invoked these URLs are appended to the
argument list.

=item B<-delete=FILE>

Defines a file from which a list of URLs is to be read which are to be
deleted from the database.

Note that it is possible (though inadvisable) to both add and delete an URL
in the same run of the script. The URL will first be added (from the load
file or argument list) then deleted. This is a pointless exercise which
wastes bandwidth, so don't do it!

=item B<-[no]scan>

This option (B<-scan>) causes the URLs stored in the database to be scanned
and updated. The negated form, which is also the default behaviour of the
script, (B<-noscan>) omits the scan.

NOTE: This function is not implemented yet.

=item B<-out=FILE>

This option defines an output file to receive output from reporting
functions. If the option is omitted the data is written to STDOUT, allowing
it to be redirected if required.

This option does not cause transactional listings to be captured.

=item B<-[no]check>

This option (B<-check>) causes each feed which is being added to be checked
by the script user so that they can confirm it is OK to add. The script
reports the I<copyright> field and requests a I<yes> or I<no> response.
=item B<-[no]report> This option (B<-report>) causes a report of the contents of the database to be generated. The negated form, which is also the default behaviour of the script, (B<-noreport>) omits the report. NOTE: The report is currently very simple. =item B<-json[=FILE]> This option, which may be omitted, defines the location where the feed and episode details are to be written. If omitted no JSON data is written. If the option is given as B<-json=FILE> the data is written to the nominated file. If the B<=FILE> portion is omitted a default name of 'feedWatcher.json' is used. =item B<-opml[=FILE]> This option, which may be omitted, defines the location where the feed details are to be written. If omitted no OPML data is written. If the option is given as B<-opml=FILE> the data is written to the nominated file. If the B<=FILE> portion is omitted a default name of 'feedWatcher.opml' is used. =item B<-template=FILE> This option defines the template used to generate a form of the feed data. The template is written using the B