Updates for FOSDEM 2023

Changes to the main 'feedWatcher' script: new -check=mode and
    -rejects=file options to automate copyright checks and save rejected
    URLs. Made subroutines parseFeed and execSQL more resilient.
    Experimented with using XML::FeedPP but haven't adopted it yet.
    Enhanced checkCopyright to do auto, manual and no checking. Some POD
    additions.

The database is currently being sent to the repo, but this may be unwise.

The script 'make_reports' is for making the various reports uploaded
    here: html, JSON, OPML, Markdown and PDF. The PDF is built from the
    Markdown with Pandoc. The HTML is generated from the template
    'feedWatcher.tpl', which is the default.

The TT2 template 'feedWatcher_5.tpl' is for dumping the URLs from the
    database into a file so that they can be reloaded. Daily dumps of
    the database are made on my workstation, and kept for 6 months.
This commit is contained in:
Dave Morriss
2023-01-09 18:20:17 +00:00
parent f9cff60021
commit 4f744f37c4
9 changed files with 1575 additions and 2867 deletions

View File

@@ -4,8 +4,9 @@
# FILE: feedWatcher
#
# USAGE: ./feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan]
# [-report[=title]] [-[no]check] [-out=FILE] [-json[=FILE]]
# [-opml[=FILE]] [-template[=FILE]] [-[no]silent] [-debug=N]
# [-report[=title]] [-check[=mode]] [-out=FILE]
# [-rejects[=FILE]] [-json[=FILE]] [-opml[=FILE]]
# [-template[=FILE]] [-[no]silent] [-debug=N]
# [URL ...]
#
# DESCRIPTION: A rewrite of Ken Fallon's script to collect data about Linux
@@ -25,9 +26,9 @@
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.15
# VERSION: 0.1.1
# CREATED: 2013-12-25 12:40:33
# REVISION: 2022-11-18 22:27:35
# REVISION: 2023-01-09 15:28:13
#
#-------------------------------------------------------------------------------
# Released under the terms of the GNU Affero General Public License (AGPLv3)
@@ -58,6 +59,7 @@ use LWP::UserAgent;
use WWW::RobotRules;
use XML::RSS::Parser;
use XML::Feed;
use XML::FeedPP; # Fall back?
use Feed::Find;
use Template;
@@ -80,7 +82,7 @@ use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.15';
our $VERSION = '0.1.1';
#
# Script name
@@ -92,8 +94,9 @@ our $VERSION = '0.0.15';
#
my ( @new_urls, @deletions );
my ( $rules, $robot_name ) = ( undef, "$PROG/$VERSION" );
my ( $search_target );
my ($search_target);
my ( $sth1, $h1, $rv );
my ($rejectcount);
my $feeds;
@@ -170,13 +173,13 @@ Options( \%options );
#
# Default help
#
pod2usage( -msg => "Version $VERSION\n", -exitval => 1 )
pod2usage( -msg => "Version $VERSION\n", -exitval => 1, -verbose => 0 )
if ( $options{'help'} );
#
# Detailed help
#
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1, -noperldoc => 0 )
pod2usage( -msg => "$PROG version $VERSION\n", -verbose => 2, -exitval => 1 )
if ( $options{'manpage'} );
#
@@ -191,13 +194,14 @@ my $loadfile = $options{'load'};
my $deletefile = $options{'delete'};
my $scan = ( defined( $options{scan} ) ? $options{scan} : 0 );
my $check = ( defined( $options{check} ) ? $options{check} : 0 );
my $outfile = $options{out};
my $report = $options{report};
my $json = $options{json};
my $opml = $options{opml};
my $template = $options{template};
my $check = $options{check};
my $outfile = $options{out};
my $rejectfile = $options{rejects};
my $report = $options{report};
my $json = $options{json};
my $opml = $options{opml};
my $template = $options{template};
#
# Check the configuration file
@@ -220,6 +224,26 @@ if ($deletefile) {
die "File $deletefile is not readable\n" unless -r $deletefile;
}
#
# The checking mode defaults to 'auto' if the option has no value, or may be
# 'manual' or 'none'. If the option is not used at all it defaults to 'none'.
#
if ( defined($check) ) {
$check =~ s/(^\s+|\s+$)//g;
if ($check =~ /^$/) {
$check = "auto";
}
else {
$check = lc($check);
die "Invalid option '-check=$check'\n" .
"Values are <blank>, auto and manual\n"
unless ($check =~ /^(auto|manual|none)$/)
}
}
else {
$check = 'none';
}
#
# We accept -report, meaning report everything or -report='title' to report
# just the feed with the given title.
@@ -238,7 +262,7 @@ if ( defined($report) ) {
#
if ( defined($json) ) {
if ($json =~ /^$/) {
$json = "$PROG.json"
$json = "$PROG.json";
}
}
@@ -336,15 +360,29 @@ else {
or warn "Unable to initialise for writing: $!";
}
#-------------------------------------------------------------------------------
# Open the rejects file if requested, otherwise we don't write reject data
#-------------------------------------------------------------------------------
my $rejectfh;
if ($rejectfile) {
if ($rejectfile =~ /^$/) {
$rejectfile = "${PROG}_rejected_URLs.txt";
}
open( $rejectfh, ">:encoding(UTF-8)", $rejectfile )
or warn "Unable to open $rejectfile for writing: $!";
$rejectcount = 0;
}
#
# Set up a robot.txt rules parser
#
$rules = WWW::RobotRules->new($robot_name);
#
#-------------------------------------------------------------------------------
# Slurp the load file into @new_urls if the file is provided
#
#-------------------------------------------------------------------------------
if ($loadfile) {
#
# Load the input file
@@ -364,36 +402,39 @@ if ($loadfile) {
}
#
# Now, we either have URLs from the command line, or from the load file, so we
# process these.
# Now, we either have URLs from the command line, or from the load file (or
# both), so we process these.
#
#while (@new_urls) {
# TODO: Why was this a loop?
if (@new_urls) {
# It's a loop because 'loadUrls' might find some more URLs by scanning HTML
# URLs if given them. If it does we replace @new_urls with the found URLs and
# go again. When there's nothing returned the loop stops.
# ----
# NOTE: This seems dirty, but all the 'while' is testing is whether the array
# contains anything or not. It's not iterating over it or anything, which would
# be messy!
#
while (@new_urls) {
#
# Remove duplicates
# Remove duplicates, finish if it deletes them all!
#
@new_urls = uniq(@new_urls);
last unless @new_urls;
#
# Remove any commented out lines
# Remove any commented out lines, finish if it deletes them all!
#
@new_urls = grep {!/^\s*#/} @new_urls;
last unless @new_urls;
$LOG->info( "Received ", scalar(@new_urls),
" URLs to add to the database" );
#
# Load these URLs as appropriate, returning any more that we find by
# following HTML urls.
# following HTML urls. We overwrite the original list and start all over
# again.
#
@new_urls = loadUrls( $dbh, \@new_urls, $rules, \%keymap );
#
# Now process any URLs that came back. Since we are explicitly looking for
# feeds we can assume that's what we have so don't need to recurse again.
#
# TODO
}
#
@@ -440,6 +481,7 @@ if ($deletefile) {
#-------------------------------------------------------------------------------
# Perform a database scan if requested
# TODO: Needs to be developed; does nothing at the moment.
#-------------------------------------------------------------------------------
if ($scan) {
$LOG->info( "Scan is not fully implemented yet" );
@@ -603,6 +645,11 @@ if ($template) {
#$dbh->disconnect;
if ($rejectfile) {
emit( $silent,
"Number of rejected URLs written to $rejectfile is $rejectcount\n" );
}
exit;
#=== FUNCTION ================================================================
@@ -628,7 +675,7 @@ sub loadUrls {
#
# Work through the list of URLs
#
foreach my $rec (@new_urls) {
foreach my $rec (@$new_urls) {
%uridata = ();
#
@@ -704,7 +751,7 @@ sub loadUrls {
# Look for the HTTP content type. Don't save if the request failed.
#
if ( checkContentType( $uri, \%uridata, \%headers, \@found_urls, $LOG ) ) {
emit( $silent, "HTTP request OK\n" );
emit( $silent, "HTTP request to check type OK\n" );
}
else {
emit( $silent, "HTTP request failed\n" );
@@ -722,6 +769,8 @@ sub loadUrls {
$feed = parseFeed( $uridata{URI}, $stream );
unless ( $feed ) {
$uridata{SAVE} = 0;
emit( $silent, "Feed did not parse $uridata{URI}\n" );
$LOG->warning('Feed did not parse: ',$uridata{URI});
next;
}
@@ -735,11 +784,20 @@ sub loadUrls {
storeFeed($feed,\%uridata);
#
# Perform a check on the copyright. Routine sets
# Perform a check on the copyright. The routine sets
# $uridata{SAVE} = 0 if the copyright is not acceptable.
#
if ($check) {
next unless checkCopyright(\%uridata);
if ( $check ne 'none' ) {
unless (checkCopyright( $check, \%uridata )) {
#
# Rejected, write URL to a file if requested
#
if ($rejectfile) {
printf $rejectfh "%s\n", $uridata{URI};
$rejectcount++;
}
next;
}
}
}
@@ -1183,6 +1241,8 @@ sub scanFeed {
$feed = parseFeed( $uridata->{URI}, $stream );
unless ( $feed ) {
$uridata->{SCAN_OK} = 0;
emit( $silent, "Feed did not parse $uridata->{URI}\n" );
$LOG->warning('Feed did not parse: ',$uridata->{URI});
return 0;
}
@@ -1418,8 +1478,8 @@ sub collectData {
my ($dbh) = @_;
#
# Query to report the contents of the 'urls' table with the details of the
# latest episode
# Query to report only the feeds from the contents of the 'urls' table
# with the details of the latest episode
#
my $sql = q{
SELECT urls.id, ae.*, max(coalesce(ae.ep_issued,ae.ep_modified)) AS latest_ep
@@ -1519,11 +1579,18 @@ sub dbSearch {
sub execSQL {
my ( $dbh, $sql, @args ) = @_;
my $sth1 = $dbh->prepare_cached($sql);
my $rv = $sth1->execute(@args);
if ( $dbh->err ) {
warn $dbh->errstr;
my ( $sth1, $rv );
$sth1 = $dbh->prepare_cached($sql);
try {
$rv = $sth1->execute(@args);
}
catch {
warn "Problem with query '$sql'\n";
if ( $dbh->err ) {
warn '** ' . $dbh->errstr;
}
};
$sth1->finish;
return $rv;
@@ -1824,19 +1891,69 @@ sub getFeed {
sub parseFeed {
my ( $feed_url, $feed_content ) = @_;
my $feed = XML::Feed->parse( \$feed_content );
unless ($feed) {
#
# Something went wrong. Abort this feed
#
warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
return; # undef
my $feed;
#
# Catch errors, returning from the subroutine with 'undef' if any are
# triggered.
#
try {
$feed = XML::Feed->parse( \$feed_content );
unless ($feed) {
#
# Something went wrong. Abort this feed
#
warn "Warning: Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
return; # undef
}
}
catch {
warn "Warning: Failed to parse $feed_url: $_\n";
return;
};
return $feed;
}
##=== FUNCTION ================================================================
## NAME: parseFeedPP
## PURPOSE: Parse a podcast feed that has already been downloaded
## PARAMETERS: $feed_url URL of the feed previously downloaded
## $feed_content String containing the content of the feed, for
## parsing
## RETURNS: An XML::FeedPP object containing the parsed feed or undef if the
## parse failed
## DESCRIPTION:
## THROWS: No exceptions
## COMMENTS: None
## SEE ALSO: N/A
##===============================================================================
#sub parseFeedPP {
# my ( $feed_url, $feed_content ) = @_;
#
# my $feed;
#
# try {
# $feed = XML::FeedPP->parse( \$feed_content, -type => 'string' );
# unless ($feed) {
# #
# # Something went wrong. Abort this feed
# #
# warn "Failed to parse $feed_url: ", XML::Feed->errstr, "\n";
# return; # undef
# }
# }
# catch {
# warn "Failed to parse $feed_url: ", $@, "\n";
# return;
# }
#
# return $feed;
#
#}
#=== FUNCTION ================================================================
# NAME: storeFeed
# PURPOSE: Stores feed attributes in a hash
@@ -1873,41 +1990,67 @@ sub storeFeed {
#=== FUNCTION ================================================================
# NAME: checkCopyright
# PURPOSE: Ask the user to check the copyright of a feed
# PARAMETERS:
# RETURNS:
# PURPOSE: Ask the user to check, or applies simple rules, to accept or
# reject a feed based on the copyright
# PARAMETERS: $checkmode the mode string: auto, manual or none
# $uridata reference to the hash containing details of
# this feed
# RETURNS: 1 (true) if the feed is to be added, 0 (false) if not
# DESCRIPTION:
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkCopyright {
my ($uridata) = @_;
my ($checkmode, $uridata) = @_;
my $decision;
$LOG->info('Checking copyright of feed');
my ( $copyright, $re, $decision );
$LOG->info("Checking copyright of feed (mode: $checkmode)");
#
# Prompt the user, failing gracefully if there's
# a problem. If the user types 'Y' or 'y' we accept the
# feed, otherwise we do not (thus a blank return = 'no').
#
try {
printf STDERR
"Feed '%s' has the copyright string:\n%s\n",
$uridata->{TITLE},
coalesce( $uridata->{COPYRIGHT}, '' );
$decision = prompt(
-in => *STDIN,
-prompt => 'OK to add this feed?',
-style => 'bold red underlined',
-yes
);
if ( $checkmode eq 'manual' ) {
#
# Prompt the user, failing gracefully if there's
# a problem. If the user types 'Y' or 'y' we accept the
# feed, otherwise we do not (thus a blank return = 'no').
#
try {
printf STDERR
"Feed '%s' has the copyright string:\n%s\n",
$uridata->{TITLE},
coalesce( $uridata->{COPYRIGHT}, '' );
$decision = prompt(
-in => *STDIN,
-out => *STDERR,
-prompt => 'OK to add this feed?',
-style => 'bold red underlined',
-yes
);
}
catch {
warn "Problem processing copyright decision: $_";
$decision = 0;
};
}
catch {
warn "Problem processing copyright decision: $_";
else {
#
# Careful. Un-escaped spaces are ignored
#
$re = qr{(
CC|
Creative\ Commons|
creativecommons.org|
Attribution.NonCommercial.No.?Derivatives?
)}xmi;
$decision = 0;
};
$copyright = coalesce( $uridata->{COPYRIGHT}, '' );
emit( $silent, "Copyright: '$copyright'\n" );
$LOG->info("Copyright: '$copyright'");
if ( $copyright eq '' || $copyright =~ /$re/ ) {
$decision = 1;
}
}
#
# Take action on the decision (or default)
@@ -2000,8 +2143,9 @@ sub addURI {
# are stored in an array. The two DateTime components are
# converted to ISO8601 dates. If there is an enclosure then its
# elements are saved. Note that there could be multiple
# enclosures, but XML::Feed does not cater for them unless
# explicitly requested. We do not deal with such a case here.
# enclosures per item, but XML::Feed does not cater for them
# unless explicitly requested. We do not deal with such a case
# here.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
@@ -2364,10 +2508,10 @@ sub Options {
my ($optref) = @_;
my @options = (
"help", "manpage", "debug=i", "silent!",
"load=s", "delete=s", "scan!", "report:s",
"check!", "json:s", "opml:s", "config=s",
"out=s", "template:s",
"help", "manpage", "debug=i", "silent!",
"load=s", "delete=s", "scan!", "report:s",
"check:s", "json:s", "opml:s", "config=s",
"out=s", "rejects:s", "template:s",
);
if ( !GetOptions( $optref, @options ) ) {
@@ -2391,15 +2535,23 @@ feedWatcher - watch a collection of podcast feeds
=head1 VERSION
This documentation refers to I<feedWatcher> version 0.0.15
This documentation refers to I<feedWatcher> version 0.1.1
=head1 USAGE
feedWatcher [-help] [-load=FILE] [-delete=FILE] [-[no]scan] [-[no]report]
[-[no]check] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
[-check[=mode]] [-out=FILE] [-json[=FILE]] [-opml[=FILE]] [-template=FILE]
[-[no]silent] [-config=FILE] [-debug=N] [URL ...]
# Load URLs from a file, perform checks and redirect output to standard
# output and a named file
./feedWatcher -load=feedWatcher_dumped_URLs.txt -check=auto | \
tee load_$(date +'%Y%m%d_%H%M%S')
# Generate Markdown output with a template writing to a named file
./feedWatcher -tem=feedWatcher_3.tpl -out=feedWatcher.mkd
=head1 ARGUMENTS
Arguments are optional and may consist of an arbitrarily long list of URLs to
@@ -2454,9 +2606,10 @@ NOTE: This function is not implemented yet.
=item B<-out=FILE>
This option defines an output file to receive any output. If the option is
omitted the data is written to STDOUT, allowing it to be redirected if
required.
This option defines an output file to receive output from reporting
functions. If the option is omitted the data is written to STDOUT, allowing it
to be redirected if required. This option does not cause transactional
listings to be captured.
=item B<-[no]check>