forked from HPR/hpr-tools
		
	
		
			
				
	
	
		
			1711 lines
		
	
	
		
			56 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			1711 lines
		
	
	
		
			56 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env perl
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: summarise_mail
 | |
| #
 | |
| #        USAGE: ./summarise_mail [-help] [-debug=N] [-from=DATE] [-to=DATE]
 | |
| #               [-out=FILE] [-template=FILE] [-[no]silent] [-[no]checknew]
 | |
| #               [-initialise=N]
 | |
| #
 | |
| #  DESCRIPTION: Generates a summary of the HPR mailing list for the Community
 | |
| #  		News
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.0.5
 | |
| #      CREATED: 2015-02-15 15:06:11
 | |
| #     REVISION: 2015-10-04 20:21:44
 | |
| #
 | |
| #-------------------------------------------------------------------------------
 | |
| # TODO
 | |
| #
 | |
| # - Add a means of reporting all of the message bodies in a thread. They
 | |
| #   ideally need formatting and should be printable so that a template can
 | |
| #   generate plain text versions for use by the Community News hosts when
 | |
| #   reviewing the last month's messages. This is quite a large task but maybe
 | |
| #   there are mail formatters that can do it.
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| use 5.010;
 | |
| use strict;
 | |
| use warnings;
 | |
| use utf8;
 | |
| 
 | |
| use Getopt::Long;
 | |
| use Pod::Usage;
 | |
| use Config::General;
 | |
| 
 | |
| use LWP::UserAgent;
 | |
| use Mail::Box::Manager;
 | |
| 
 | |
| use Date::Parse;
 | |
| use Date::Calc qw{:all};
 | |
| use DateTime;
 | |
| use DateTime::TimeZone;
 | |
| 
 | |
| use Template;
 | |
| use Template::Filters;
 | |
| Template::Filters->use_html_entities;    # Use HTML::Entities in the template
 | |
| 
 | |
| use Data::Dumper;
 | |
| 
 | |
| #
 | |
| # Version number (manually incremented)
 | |
| #
 | |
| our $VERSION = '0.0.5';
 | |
| 
 | |
| #
 | |
| # Script and directory names
 | |
| #
 | |
| ( my $PROG = $0 ) =~ s|.*/||mx;
 | |
| ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
 | |
| $DIR = '.' unless $DIR;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Declarations
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Constants and other declarations
 | |
| #
 | |
| my $basedir    = "$ENV{HOME}/HPR/Community_News";
 | |
| my $configfile = "$basedir/.$PROG.cfg";
 | |
| my $bpfile     = "$basedir/mailnote_template.tpl";
 | |
| 
 | |
| my (%dtargs, $dt,      $tz,    $rd,      $mid,
 | |
|     $irt,    $ref,     $date,  @parsed,  %msg,
 | |
|     %id,     $dt_from, $dt_to, %threads, $total,
 | |
| );
 | |
| 
 | |
| my %tags = (
 | |
|     'from'        => 'From',
 | |
|     'to'          => 'To',
 | |
|     'cc'          => 'CC',
 | |
|     'date'        => 'Date',
 | |
|     'subject'     => 'Subject',
 | |
|     'archived-at' => 'Link',
 | |
| );
 | |
| 
 | |
| #
 | |
| # Enable Unicode mode
 | |
| #
 | |
| binmode STDOUT, ":encoding(UTF-8)";
 | |
| binmode STDERR, ":encoding(UTF-8)";
 | |
| 
 | |
| #
 | |
| # Load configuration data
 | |
| #
 | |
| my $conf = Config::General->new(
 | |
|     -ConfigFile      => $configfile,
 | |
|     -InterPolateVars => 1,
 | |
|     -ExtendedAccess  => 1,
 | |
| ) or die "Unable to open $configfile\n";
 | |
| 
 | |
| my %config = $conf->getall() or die "Unable to process $configfile\n";
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Options and arguments
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Process options
 | |
| #
 | |
| my %options;
 | |
| Options( \%options );
 | |
| 
 | |
| #
 | |
| # Default help
 | |
| #
 | |
| pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
 | |
|     if ( $options{'help'} );
 | |
| 
 | |
| #
 | |
| # Collect options
 | |
| #
 | |
| my $debug      = ( defined( $options{debug} )      ? $options{debug}    : 0 );
 | |
| my $initialise = ( defined( $options{initialise} ) ? 1                  : 0 );
 | |
| my $start      = $options{initialise};
 | |
| my $silent     = ( defined( $options{silent} )     ? $options{silent}   : 0 );
 | |
| my $checknew   = ( defined( $options{checknew} )   ? $options{checknew} : 0 );
 | |
| my $outfile    = $options{out};
 | |
| 
 | |
| my @fromdate;
 | |
| if ( defined( $options{from} ) ) {
 | |
|     @fromdate = parseDate( $options{from} );
 | |
| }
 | |
| 
 | |
| my @todate;
 | |
| if ( defined( $options{to} ) ) {
 | |
|     @todate = parseDate( $options{to} );
 | |
| }
 | |
| 
 | |
| my $template
 | |
|     = ( defined( $options{template} ) ? $options{template} : $bpfile );
 | |
| 
 | |
| #
 | |
| # Check for illegal option combinations
 | |
| #
 | |
| die "Use only one of -initialise and -checknew\n"
 | |
|     if ( $initialise && $checknew );
 | |
| 
 | |
| #
 | |
| # Sanity checks
 | |
| #
 | |
| die "Error: Unable to find template $template\n" unless -r $template;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Default the from date to today and compute a to date if one wasn't given
 | |
| #-------------------------------------------------------------------------------
 | |
| if ( scalar(@fromdate) == 0 ) {
 | |
|     @fromdate = Today();
 | |
| }
 | |
| 
 | |
| if ( scalar(@todate) == 0 ) {
 | |
|     @todate = Add_Delta_Days( @fromdate[ 0, 1 ],
 | |
|         1, Days_in_Month( @fromdate[ 0, 1 ] ) );
 | |
| }
 | |
| 
 | |
| #
 | |
| # We need DateTime dates for comparison
 | |
| #
 | |
| emit( $silent, "From: ", ISO8601_Date(@fromdate), "\n" );
 | |
| emit( $silent, "To:   ", ISO8601_Date(@todate),   "\n" );
 | |
| 
 | |
| $dt_from = calcToDT(@fromdate);
 | |
| $dt_to   = calcToDT(@todate);
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Open the output file (or STDOUT) - we may need the date to do it
 | |
| #-------------------------------------------------------------------------------
 | |
| my $outfh;
 | |
| if ($outfile) {
 | |
|     $outfile = sprintf( $outfile,
 | |
|         sprintf( "%d-%02d", $dt_from->year, $dt_from->month ) )
 | |
|         if ( $outfile =~ /%s/ );
 | |
|     emit( $silent, "Output: ", $outfile, "\n" );
 | |
| 
 | |
|     open( $outfh, ">:encoding(UTF-8)", $outfile )
 | |
|         or die "Unable to open $outfile for writing: $!";
 | |
| }
 | |
| else {
 | |
|     open( $outfh, ">&", \*STDOUT )
 | |
|         or die "Unable to initialise for writing: $!";
 | |
| }
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Check the configuration file
 | |
| #-------------------------------------------------------------------------------
 | |
| checkConfig( \%config );
 | |
| 
 | |
| #
 | |
| # The cached mail file
 | |
| #
 | |
| my $mailfile = join( '/', @{ $config{cache} }{ 'directory', 'filename' } );
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # If asked to initialise the cache check it makes sense, otherwise look to see
 | |
| # if we're to update an existing cache
 | |
| #-------------------------------------------------------------------------------
 | |
| if ($initialise) {
 | |
|     if ( -e $mailfile ) {
 | |
|         die "The mail cache $mailfile exists.\n"
 | |
|             . "Delete it before initialising\n";
 | |
|     }
 | |
|     unless (initialiseCache( \%config, $start )) {
 | |
|         print "Failed to initialise the cache; can't continue\n";
 | |
|         exit;
 | |
|     }
 | |
| }
 | |
| elsif ($checknew) {
 | |
|     unless ( -e $mailfile ) {
 | |
|         die "The mail cache $mailfile does not exist\n"
 | |
|             . "Initialise it before checking for updates\n";
 | |
|     }
 | |
|     unless (updateCache( \%config )) {
 | |
|         print "Failed to update the cache; continuing with old mail\n";
 | |
|     }
 | |
| }
 | |
| 
 | |
| #
 | |
| # Open the mail file (as a folder in MBOX format)
 | |
| #
 | |
| my $mgr    = Mail::Box::Manager->new;
 | |
| my $folder = $mgr->open($mailfile);
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Walk through the messages in the folder collecting message details into
 | |
| # a hash structure indexed by the UTC timestamp.
 | |
| #-------------------------------------------------------------------------------
 | |
| foreach my $message ( $folder->messages ) {
 | |
|     #
 | |
|     # TODO This doesn't work for a reason I don't understand. Needs to be
 | |
|     # fixed
 | |
|     #
 | |
|     #$message->unfold();
 | |
| 
 | |
|     #
 | |
|     # Parse and convert the message date. The @parsed array contains
 | |
|     # $ss,$mm,$hh,$day,$month,$year,$zone. The year and month need adjustment.
 | |
|     # The timezone value is number of seconds from UTC.
 | |
|     #
 | |
|     $date   = $message->get('Date');
 | |
|     @parsed = strptime($date);
 | |
|     $parsed[5] += 1900;
 | |
|     $parsed[4] += 1;
 | |
| 
 | |
|     #
 | |
|     # Convert the parsed date to a DateTime
 | |
|     #
 | |
|     @dtargs{ 'second', 'minute', 'hour', 'day', 'month', 'year' } = (@parsed);
 | |
|     $dt = DateTime->new(%dtargs);
 | |
|     $tz = DateTime::TimeZone->new(
 | |
|         name => DateTime::TimeZone->offset_as_string( $parsed[6] ) );
 | |
|     $dt->set_time_zone($tz);
 | |
| 
 | |
|     #
 | |
|     # Stash mail parameters
 | |
|     #
 | |
|     $rd  = $dt->utc_rd_as_seconds();
 | |
|     $mid = formatID( $message->get('Message-ID'), 0 );
 | |
|     $irt = formatID( $message->get('In-Reply-To'), 1 );
 | |
| 
 | |
|     my @from = $message->from;
 | |
|     my @to   = $message->to;
 | |
|     my @cc   = $message->cc;
 | |
| 
 | |
|     $msg{$rd} = {
 | |
|         # '_date' => $date,
 | |
|         # '_from' => \@ffrom,
 | |
|         # '_to'   => \@fto,
 | |
|         # '_cc'   => \@fcc,
 | |
|         '_mid'    => $message->get('Message-ID'),
 | |
|         '_irt'    => $message->get('In-Reply-To'),
 | |
|         '_rdate'  => $rd,
 | |
|         'date'    => $dt->ymd . ' ' . $dt->hms . ' ' . $dt->time_zone->name,
 | |
|         'from'    => formatEmail( \@from ),
 | |
|         'to'      => formatEmail( \@to ),
 | |
|         'cc'      => formatEmail( \@cc ),
 | |
|         'subject' => $message->get('Subject'),
 | |
|         'archived-at' => trimHeader( $message->get('Archived-At') ),
 | |
|         'message-id'  => $mid,
 | |
|         'in-reply-to' => $irt,
 | |
|         'references'  => formatReferences( $message->get('References') ),
 | |
|         'parent'      => undef,
 | |
|         'children'    => [],
 | |
|     };
 | |
| 
 | |
|     #
 | |
|     # Stash the message id for easier linking
 | |
|     #
 | |
|     $id{$mid} = $rd;
 | |
| }
 | |
| 
 | |
| $folder->close;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Link the messages according to their internal back-references. Rescue any
 | |
| # broken threads caused by people not using their mail clients properly.
 | |
| #-------------------------------------------------------------------------------
 | |
| linkMessages( \%msg, \%id, ( $debug == 0 ) );
 | |
| repairThreads( \%msg, ( $debug == 0 ) );
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Generate the hash of message thread details that start in the selected
 | |
| # period. Prepare the template and generate the final document
 | |
| #-------------------------------------------------------------------------------
 | |
| $total = buildThreadSummary( $dt_from, $dt_to, \%config, \%msg, \%threads,
 | |
|     ( $debug == 0 ) );
 | |
| 
 | |
| my $tt = Template->new(
 | |
|     {   ABSOLUTE     => 1,
 | |
|         ENCODING     => 'utf8',
 | |
|         INCLUDE_PATH => $basedir,
 | |
|     }
 | |
| );
 | |
| my $vars = {
 | |
|     total   => $total,
 | |
|     threads => \%threads,
 | |
| };
 | |
| my $document;
 | |
| $tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
 | |
|     || die $tt->error(), "\n";
 | |
| print $outfh $document;
 | |
| close($outfh);
 | |
| 
 | |
| exit;
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: checkConfig
 | |
| #      PURPOSE: Check the configuration, aborting if it's messed up and
 | |
| #               missing necessary items
 | |
| #   PARAMETERS: $config         configuration hash
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Checks that all of the required items are present in the
 | |
| #               $config hash. These have been parsed from the configuration
 | |
| #               file usually called '.summarise_mail.cfg'. It only performs
 | |
| #               existence checks, nothing more fancy.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub checkConfig {
 | |
|     my ($config) = @_;
 | |
| 
 | |
|     die "Configuration file problem\n"
 | |
|         unless ( defined( $config->{gmane}->{url} )
 | |
|         && defined( $config->{gmane}->{template1} )
 | |
|         && defined( $config->{gmane}->{thread} )
 | |
|         && defined( $config->{gmane}->{template2} )
 | |
|         && defined( $config->{gmane}->{lookahead} )
 | |
|         && defined( $config->{cache}->{directory} )
 | |
|         && defined( $config->{cache}->{filename} )
 | |
|         && defined( $config->{cache}->{regex} ) );
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: initialiseCache
 | |
| #      PURPOSE: Load a new cache from Gmane
 | |
| #   PARAMETERS: $config         Configuration hashref
 | |
| #               $start          Start number to download
 | |
| #      RETURNS: If the GET succeeded then 1 otherwise 0
 | |
| #  DESCRIPTION: Creates a new cache file of mail downloaded from Gmane. The
 | |
| #               file is in MBOX format but has had Gmane's obscuration
 | |
| #               algorithms applied to it, messing up certain elements like
 | |
| #               message-ids. The download begins with message $start and
 | |
| #               finishes with $start+LOOKAHEAD where the lookahead value is
 | |
| #               defined in the configuration file; usually 100.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub initialiseCache {
 | |
|     my ( $config, $start ) = @_;
 | |
| 
 | |
|     #
 | |
|     # How much to look ahead in Gmane; defined in the configuration
 | |
|     #
 | |
|     my $lookahead = $config->{gmane}->{lookahead};
 | |
| 
 | |
|     #
 | |
|     # The name of the mail cache and its directory are in the configuration
 | |
|     #
 | |
|     my $mailfile
 | |
|         = join( '/', @{ $config->{cache} }{ 'directory', 'filename' } );
 | |
| 
 | |
|     #
 | |
|     # Make the URL for download
 | |
|     #
 | |
|     my $url = sprintf( $config->{gmane}->{template1},
 | |
|         $start, $start + $lookahead + 1 );
 | |
|     print "URL: $url\n";
 | |
| 
 | |
|     #
 | |
|     # Set up the HTTP GET
 | |
|     #
 | |
|     my $ua = LWP::UserAgent->new;
 | |
|     $ua->agent('HPR-Agent/0.1');
 | |
|     $ua->timeout(10);
 | |
| 
 | |
|     #
 | |
|     # Get the data (if any)
 | |
|     #
 | |
|     my $response = $ua->get($url);
 | |
| 
 | |
|     if ( $response->is_success ) {
 | |
|         #
 | |
|         # The GET succeeded, see what came back
 | |
|         #
 | |
|         if ( length( ${ $response->content_ref } ) > 0 ) {
 | |
|             #
 | |
|             # We got some new data. Append it to the rest
 | |
|             #
 | |
|             print "Mail messages found\n";
 | |
|             open( my $fh, ">", $mailfile )
 | |
|                 or die "Unable to open $mailfile for writing\n";
 | |
|             print $fh $response->decoded_content;
 | |
|             close($fh);
 | |
|         }
 | |
|         else {
 | |
|             print "No messages found\n";
 | |
|         }
 | |
| 
 | |
|         return 1;
 | |
|     }
 | |
|     else {
 | |
|         #
 | |
|         # The GET failed in a nasty way
 | |
|         #
 | |
|         warn $response->status_line;
 | |
| 
 | |
|         return 0;
 | |
|     }
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: updateCache
 | |
| #      PURPOSE: Get the latest mail messages from Gmane and save them in the
 | |
| #               cache
 | |
| #   PARAMETERS: $config         configuration hash
 | |
| #      RETURNS: If the GET succeeded then 1 otherwise 0
 | |
| #  DESCRIPTION: Adds more messages downloaded from Gmane to the end of the
 | |
| #               current cache file. It needs to work out the number of the
 | |
| #               last message in the cache, which it does with a regex, looking
 | |
| #               at 'Archived-At:' headers. Then it adds the lookahead value
 | |
| #               from the configuration file to that and downloads a maximum of
 | |
| #               that number of messages using the Gmane API. These messages
 | |
| #               are appended to the existing MBOX file. There is no means of
 | |
| #               expiring older messages, and perhaps there should be.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub updateCache {
 | |
|     my ($config) = @_;
 | |
| 
 | |
|     #
 | |
|     # How much to look ahead in Gmane; defined in the configuration
 | |
|     #
 | |
|     my $lookahead = $config->{gmane}->{lookahead};
 | |
| 
 | |
|     #
 | |
|     # The regex to parse the permalink URL is in the configuration
 | |
|     #
 | |
|     my $re = qr{$config->{cache}->{regex}};
 | |
| 
 | |
|     #
 | |
|     # The name of the mail cache and its directory are in the configuration
 | |
|     #
 | |
|     my $mailfile
 | |
|         = join( '/', @{ $config->{cache} }{ 'directory', 'filename' } );
 | |
| 
 | |
|     #
 | |
|     # Open the mail file
 | |
|     #
 | |
|     my $mgr    = Mail::Box::Manager->new;
 | |
|     my $folder = $mgr->open($mailfile);
 | |
| 
 | |
|     #
 | |
|     # Walk all the messages, parsing the permalink URLs to get the start and
 | |
|     # end
 | |
|     #
 | |
|     my $plink;
 | |
|     my ( $msgno, $start, $end ) = ( 0, 0, 0 );
 | |
|     foreach my $message ( $folder->messages ) {
 | |
|         $plink = $message->get('Archived-At');
 | |
|         if ( ($msgno) = ( $plink =~ /$re/ ) ) {
 | |
|             if ( $start eq 0 ) {
 | |
|                 $start = $msgno;
 | |
|             }
 | |
|             else {
 | |
|                 $end = $msgno;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
|     $folder->close;
 | |
| 
 | |
|     #
 | |
|     # Build the new URL to get more messages, starting from the last one we
 | |
|     # have plus 1 for $lookahead+1 messages
 | |
|     #
 | |
|     my $url = sprintf( $config->{gmane}->{template1},
 | |
|         $end + 1, $end + $lookahead + 1 );
 | |
|     print "URL: $url\n";
 | |
| 
 | |
|     #
 | |
|     # Set up the HTTP GET
 | |
|     #
 | |
|     my $ua = LWP::UserAgent->new;
 | |
|     $ua->agent('HPR-Agent/0.1');
 | |
|     $ua->timeout(10);
 | |
| 
 | |
|     #
 | |
|     # Get the data (if any)
 | |
|     #
 | |
|     my $response = $ua->get($url);
 | |
| 
 | |
|     if ( $response->is_success ) {
 | |
|         #
 | |
|         # The GET succeeded, see what came back
 | |
|         #
 | |
|         if ( length( ${ $response->content_ref } ) > 0 ) {
 | |
|             #
 | |
|             # We got some new data. Append it to the rest
 | |
|             #
 | |
|             print "New messages found\n";
 | |
|             open( my $fh, ">>", $mailfile )
 | |
|                 or die "Unable to open $mailfile for appending\n";
 | |
|             print $fh $response->decoded_content;
 | |
|             close($fh);
 | |
|         }
 | |
|         else {
 | |
|             print "No new messages\n";
 | |
|         }
 | |
| 
 | |
|         return 1;
 | |
|     }
 | |
|     else {
 | |
|         #
 | |
|         # The GET failed in a nasty way
 | |
|         #
 | |
|         warn $response->status_line;
 | |
| 
 | |
|         return 0;
 | |
|     }
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: parseDate
 | |
| #      PURPOSE: Parse and check a date
 | |
| #   PARAMETERS: $date           Date string
 | |
| #      RETURNS: Date::Calc date or 'undef' if $date is undefined
 | |
| #  DESCRIPTION: Parses a date string using 'strptime' and converts it to
 | |
| #               a Date::Calc date list
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub parseDate {
 | |
|     my ($date) = @_;
 | |
| 
 | |
|     if ( defined($date) ) {
 | |
|         #
 | |
|         # Parse and format the date
 | |
|         #
 | |
|         my @parsed = strptime($date);
 | |
|         @parsed[ 4, 5 ] = ( $parsed[4] + 1, $parsed[5] + 1900 );
 | |
| 
 | |
|         return ( @parsed[ 5, 4, 3 ] );
 | |
|     }
 | |
|     else {
 | |
|         return; # implicit undef
 | |
|     }
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: ISO8601_Date
 | |
| #      PURPOSE: Format a Date::Calc date in ISO8601 format
 | |
| #   PARAMETERS: @date   - a date in the Date::Calc format
 | |
| #      RETURNS: Text string containing a YYYY-MM-DD date
 | |
| #  DESCRIPTION: Just a convenience to allow a simple call like
 | |
| #               $str = ISO8601_Date(@date)
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub ISO8601_Date {
 | |
|     my (@date) = (@_)[ 0, 1, 2 ];
 | |
| 
 | |
|     if ( check_date(@date) ) {
 | |
|         return sprintf( "%04d-%02d-%02d", @date );
 | |
|     }
 | |
|     else {
 | |
|         return "*Invalid Date*";
 | |
|     }
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: calcToDT
 | |
| #      PURPOSE: Convert a Date::Calc date to a UTC DateTime date
 | |
| #   PARAMETERS: $date           Date::Calc date
 | |
| #      RETURNS: A DateTime date in the UTC timezone
 | |
| #  DESCRIPTION: Reformats a Date::Calc date to a DateTime date based on UTC
 | |
| #               (for date comparison)
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub calcToDT {
 | |
|     my (@date) = (@_)[ 0, 1, 2 ];
 | |
| 
 | |
|     #
 | |
|     # Transfer Date::Calc values into a hash for initialising a DateTime
 | |
|     # object.
 | |
|     #
 | |
|     my ( %dtargs, $dt );
 | |
|     @dtargs{ 'year', 'month', 'day', 'time_zone' } = ( @date, 'UTC' );
 | |
|     $dt = DateTime->new(%dtargs);
 | |
|     return $dt;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: formatEmail
 | |
| #      PURPOSE: Trims an array of anonymised Gmane email addresss in
 | |
| #               Mail::Address objects
 | |
| #   PARAMETERS: $remails         The array of Mail::Address objects
 | |
| #      RETURNS: An array containing the trimmed addresses formatted as text
 | |
| #  DESCRIPTION: These addresses look like "Ken Fallon
 | |
| #               <ken-sq8tSfcSOvWzQB+pC5nmwQ@public.gmane.org>". The display
 | |
| #               name part (Ken Fallon) is anonymised if it contains an
 | |
| #               address-like specification. The local-part of the address is
 | |
| #               anonymised and the domain replaced by a Gmane domain. The
 | |
| #               front part of the local-part is unaffected. We want to build
 | |
| #               an address from the display name (if there is one), and the
 | |
| #               front of the local-part. So the above becomes:
 | |
| #               "Ken Fallon <ken@...>"
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub formatEmail {
 | |
|     my ($remails) = @_;
 | |
| 
 | |
|     my ( $phrase, $address, $comment, @results );
 | |
| 
 | |
|     foreach my $email ( @{$remails} ) {
 | |
|         #
 | |
|         # Parse out the components
 | |
|         #
 | |
|         ( $phrase, $address, $comment )
 | |
|             = ( $email->phrase, $email->address, $email->comment );
 | |
| 
 | |
|         #
 | |
|         # Assume Gmane obscured any email address wherever it is. Strip out
 | |
|         # the obscuring part to make it more readable
 | |
|         #
 | |
|         foreach my $item ( $phrase, $address, $comment ) {
 | |
|             if ( $item =~ /^"?([^-]+).+@.+$/ ) {
 | |
|                 $item = "$1\@...";
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         #
 | |
|         # Let the module reassemble the address in the right format, and add
 | |
|         # it to the result list
 | |
|         #
 | |
|         my $obj = Mail::Address->new( $phrase, $address, $comment );
 | |
|         push( @results, $obj->format );
 | |
|     }
 | |
| 
 | |
|     return \@results;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: formatID
 | |
| #      PURPOSE: Reformat a Message-ID: or In-Reply-To: value to undo the
 | |
| #               effects of Gmane's mail address obscuration which is applied
 | |
| #               to one but not the other.
 | |
| #   PARAMETERS: $header         The header for processing
 | |
| #               $gmane          Boolean to indicate whether we need to strip
 | |
| #                               a Gmane obscuration part
 | |
| #      RETURNS: The processed header
 | |
| #  DESCRIPTION: If $gmane is true recognises the possible Gmane obscuration
 | |
| #               patterns and extracts the important elements from the ID,
 | |
| #               otherwise just strips the '@' and what follows. This is
 | |
| #               because Gmane (presumably) detects the '@' in every header
 | |
| #               type, assumes it's an address and applies its obscuring
 | |
| #               technique to it. Shame they apply such a sledgehammer
 | |
| #               algorithm.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub formatID {
 | |
|     my ( $header, $gmane ) = @_;
 | |
| 
 | |
|     return $header unless defined($header);
 | |
| 
 | |
|     $header =~ s/(^<|>$)//g;
 | |
|     if ($gmane) {
 | |
|         if ( $header =~ /^(.+)-[^-]+@.+$/ ) {
 | |
|             $header = "$1";
 | |
|         }
 | |
|         elsif ( $header =~ /^(.+)@.+$/ ) {
 | |
|             $header = "$1";
 | |
|         }
 | |
|     }
 | |
|     else {
 | |
|         $header =~ s/@.+$//;
 | |
|     }
 | |
| 
 | |
|     return $header;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: formatReferences
 | |
| #      PURPOSE: Reformat a list of references in the References: header
 | |
| #   PARAMETERS: $header         The header for processing
 | |
| #      RETURNS: The processed header as an arrayref
 | |
| #  DESCRIPTION: Removes the '<>' around each reference, but also removes the
 | |
| #               bit after and including the '@' since we need to compare it
 | |
| #               with the ID values we have had to tidy after Gmane got to
 | |
| #               them.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub formatReferences {
 | |
|     my ($header) = @_;
 | |
| 
 | |
|     return $header unless defined($header);
 | |
| 
 | |
|     my ( @refs, @results );
 | |
| 
 | |
|     @refs = split( /\s+/, $header );
 | |
|     foreach my $ref (@refs) {
 | |
|         $ref =~ s/(^<|>$)//g;
 | |
|         $ref =~ s/@.+$//;
 | |
|         push( @results, $ref );
 | |
|     }
 | |
| 
 | |
|     return \@results;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: trimHeader
 | |
| #      PURPOSE: Trims an arbitrary header which is enclosed in "<..>"
 | |
| #   PARAMETERS: $header         The header for trimming
 | |
| #      RETURNS: The trimmed header
 | |
| #  DESCRIPTION: Certain Gmane headers like "Message-Id" are enclosed in "<>"
 | |
| #               marks. We want to strip these and return the result
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub trimHeader {
 | |
|     my ($header) = @_;
 | |
| 
 | |
|     return $header unless defined($header);
 | |
| 
 | |
|     $header =~ s/(^<|>$)//g;
 | |
| 
 | |
|     return $header;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: coalesce
 | |
| #      PURPOSE: To find the first defined argument and return it
 | |
| #   PARAMETERS: Arbitrary number of arguments
 | |
| #      RETURNS: The first defined argument or undef if there are none
 | |
| #  DESCRIPTION: Modelled on the SQL function of the same name. It takes a list
 | |
| #               of arguments, scans it for the first one that is not undefined
 | |
| #               and returns it. If an argument is defined and it's an arrayref
 | |
| #               then the referenced array is returned comma-delimited. This
 | |
| #               allows calls such as "coalesce($var,'undef')" which returns
 | |
| #               the value of $var if it's defined, and 'undef' if not and
 | |
| #               doesn't break anything along the way.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub coalesce {
 | |
|     foreach (@_) {
 | |
|         if ( defined($_) ) {
 | |
|             if ( ref($_) eq 'ARRAY' ) {
 | |
|                 return join( ',', @{$_} );
 | |
|             }
 | |
|             else {
 | |
|                 return $_;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
|     return; # implicit undef
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: empty
 | |
| #      PURPOSE: Determine whether the argument contains data
 | |
| #   PARAMETERS: $arg            Argument
 | |
| #      RETURNS: 0 if data found, otherwise 1
 | |
| #  DESCRIPTION: If an argument is defined and it's an arrayref then the array
 | |
| #               must contain elements otherwise it's regarded as empty. An
 | |
| #               undefined argument is empty of course.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub empty {
 | |
|     my ($arg) = @_;
 | |
| 
 | |
|     if ( defined($arg) ) {
 | |
|         if ( ref($arg) eq 'ARRAY' ) {
 | |
|             return scalar(@$arg) == 0;
 | |
|         }
 | |
|         else {
 | |
|             return 0;
 | |
|         }
 | |
|     }
 | |
|     else {
 | |
|         return 1;
 | |
|     }
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: linkMessages
 | |
| #      PURPOSE: Link the collected messages using the 'In-Reply-To:' and
 | |
| #               'References' headers
 | |
| #   PARAMETERS: $msg            Hashref containing the messages
 | |
| #               $id             Hashref containing message-id to message key
 | |
| #                               links
 | |
| #               $silent         Boolean determining whether to emit error
 | |
| #                               messages
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: We want to build a structure where messages contain forward
 | |
| #               "child" links to other messages. So, given a message, we look
 | |
| #               for backward links in the 'In-Reply-To:' and 'References'
 | |
| #               headers, go to the referenced message(s) and make the current
 | |
| #               one a child. We don't do anything with multiple 'References',
 | |
| #               though we could.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub linkMessages {
 | |
|     my ( $msg, $id, $silent ) = @_;
 | |
| 
 | |
|     my $irt;
 | |
| 
 | |
|     foreach my $key ( sort( keys( %{$msg} ) ) ) {
 | |
|         $irt = $msg->{$key}->{'in-reply-to'};
 | |
| 
 | |
|         if ( defined($irt) ) {
 | |
|             #
 | |
|             # There's an In-Reply-To: header
 | |
|             #
 | |
|             if ( defined( $id->{$irt} ) ) {
 | |
|                 #
 | |
|                 # The parent of this message is the key of the back reference and
 | |
|                 # we add this key to the children of the parent of this message
 | |
|                 #
 | |
|                 $msg->{$key}->{parent} = $id->{$irt};
 | |
|                 push(
 | |
|                     @{ $msg->{ $msg->{$key}->{parent} }->{children} },
 | |
|                     $id->{ $msg->{$key}->{'message-id'} }
 | |
|                 );
 | |
|             }
 | |
|             else {
 | |
|                 emit( $silent,
 | |
|                     "Unable to find Message-ID: $irt (in message $key)\n" );
 | |
|             }
 | |
|         }
 | |
|         elsif ( defined( $msg->{$key}->{'references'} ) ) {
 | |
|             #
 | |
|             # There's no In-Reply-To: but we have References:, where the last one
 | |
|             # is the equivalent. We update the parent message
 | |
|             #
 | |
|             $ref = $msg->{$key}->{'references'}->[-1];
 | |
|             if ( defined($ref) && defined( $id->{$ref} ) ) {
 | |
|                 #
 | |
|                 # The parent of this message is the key of the back reference and
 | |
|                 # we add this key to the children of the parent of this message
 | |
|                 #
 | |
|                 $msg->{$key}->{parent} = $id->{$ref};
 | |
|                 push(
 | |
|                     @{ $msg->{ $msg->{$key}->{parent} }->{children} },
 | |
|                     $id->{ $msg->{$key}->{'message-id'} }
 | |
|                 );
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: repairThreads
 | |
| #      PURPOSE: Repair broken threads where possible
 | |
| #   PARAMETERS: $mhash          Hashref of all messages
 | |
| #               $silent         Boolean determining whether to emit error
 | |
| #                               messages
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: We look for messages with a subject beginning with 'Re:' which
 | |
| #               don't have any back links. These look like broken threads
 | |
| #               where the sender has not used their mail client's ability to
 | |
| #               generate an 'In-Reply-To:' and/or 'References' header.  We try
 | |
| #               to find the thread root by looking for the subject in all of
 | |
| #               the messages and then make the linkages.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub repairThreads {
 | |
|     my ( $mhash, $silent ) = @_;
 | |
| 
 | |
|     foreach my $key ( sort( keys( %{$mhash} ) ) ) {
 | |
|         if ( $mhash->{$key}->{subject} =~ /^Re:/ ) {
 | |
|             if (!(     defined( $mhash->{$key}->{'in-reply-to'} )
 | |
|                     || defined( $mhash->{$key}->{'references'} )
 | |
|                 )
 | |
|                 )
 | |
|             {
 | |
|                 #
 | |
|                 # We have a message with a subject beginning with 'Re:' which
 | |
|                 # doesn't have any back links. Looks like a broken thread.
 | |
|                 # Find the thread root by looking for the subject.
 | |
|                 #
 | |
|                 my $parent = findParent( $mhash, $key );
 | |
| 
 | |
|                 #
 | |
|                 # The parent has to be defined to be meaningful
 | |
|                 #
 | |
|                 if ( defined($parent) ) {
 | |
|                     #
 | |
|                     # Make this message's parent the message we found in two
 | |
|                     # ways, via the 'parent' link and the 'in-reply-to'
 | |
|                     # header.
 | |
|                     #
 | |
|                     $mhash->{$key}->{parent} = $parent;
 | |
|                     $mhash->{$key}->{'in-reply-to'}
 | |
|                         = $mhash->{$parent}->{'message-id'};
 | |
| 
 | |
|                     #
 | |
|                     # Add this message to the parents children array, making
 | |
|                     # sure it's sorted.
 | |
|                     #
 | |
|                     push( @{ $mhash->{$parent}->{children} }, $key );
 | |
|                     $mhash->{$parent}->{children}
 | |
|                         = [ sort( @{ $mhash->{$parent}->{children} } ) ];
 | |
| 
 | |
|                     #print "Parent:\n";
 | |
|                     #dumpMessage( $mhash->{$parent} );
 | |
|                     #print "Child:\n";
 | |
|                     #dumpMessage( $mhash->{$key} );
 | |
|                 }
 | |
|                 else {
 | |
|                     emit( $silent, "Couldn't find parent of message $key\n" );
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: findParent
 | |
| #      PURPOSE: Given a particular element in the message hash find its parent
 | |
| #               by processing the subject
 | |
| #   PARAMETERS: $mhash          Hashref of all messages
 | |
| #               $key            Key into the message hash
 | |
| #      RETURNS: The key of the parent
 | |
| #  DESCRIPTION: Tries to join an "orphaned" message with its parent by looking
 | |
| #               for another instance of the subject. First the subject of this
 | |
| #               message must begin with "Re:". This string is removed in order
 | |
| #               to perform a search for the message this one is in reference
 | |
| #               to. The search is conducted through all of the stored messages
 | |
| #               and the matching message must not have a parent or any back
 | |
| #               references itself to be acceptable. This is a fairly vague set
 | |
| #               of criteria but it does work for a number of unlinked
 | |
| #               messages. [Note: The regex used in the search has been modified
 | |
| #               to be case insensitive having encountered an instance where
 | |
| #               someone reconstructed the subject line in different case.]
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub findParent {
 | |
|     my ( $mhash, $key ) = @_;
 | |
| 
 | |
|     my $start_subj = $mhash->{$key}->{subject};
 | |
|     my $end_subj;
 | |
|     my $parent;
 | |
| 
 | |
|     if ( $start_subj =~ /^Re:/ ) {
 | |
|         ( $end_subj = $start_subj ) =~ s/^Re:\s*//;
 | |
|         foreach my $k ( keys( %{$mhash} ) ) {
 | |
|             #if ( $msg{$k}->{subject} eq $end_subj ) {
 | |
|             if (   $k ne $key
 | |
|                 && $msg{$k}->{subject} =~ /(?:^Re:\s*)?(?i)$end_subj/ )
 | |
|             {
 | |
|                 if (!(     defined( $msg{$k}->{'in-reply-to'} )
 | |
|                         || defined( $msg{$k}->{'references'} )
 | |
|                     )
 | |
|                     )
 | |
|                 {
 | |
|                     $parent = $k;
 | |
|                     last;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         return $parent;
 | |
|     }
 | |
|     else {
 | |
|         #
 | |
|         # Doesn't look like a message with a parent
 | |
|         #
 | |
|         return; # implicit undef
 | |
|     }
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: buildThreadSummary
 | |
| #      PURPOSE: Build a summary of the thread roots
 | |
| #   PARAMETERS: $dt_from        The from date as a DateTime object
 | |
| #               $dt_to          The to date as a DateTime object
 | |
| #               $config         Hashref of the configuration data
 | |
| #               $mhash          Hashref of all messages
 | |
| #               $threads        Hashref to contain the resulting thread data
 | |
| #               $silent         Boolean determining whether to emit error
 | |
| #                               messages
 | |
| #      RETURNS: Total number of messages in all eligible threads
 | |
| #  DESCRIPTION: We have built a structure containing all of the message
 | |
| #               threads we've seen. However, we're only interested in the
 | |
| #               threads for a certain period, so we need to work them out
 | |
| #               here. A thread root is a message which has no parents, and
 | |
| #               we're interested in those which were written in our time
 | |
| #               period. We collect the details we want for the final display
 | |
| #               and build a weblink to the thread on Gmane. We also count the
 | |
| #               number of messages per thread and the grand total. The
 | |
| #               reulting structure is what we'll pass to the template we use
 | |
| #               to generate the insertion for the Community News notes (and
 | |
| #               other stuff).
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub buildThreadSummary {
 | |
|     my ( $dt_from, $dt_to, $config, $mhash, $threads, $silent ) = @_;
 | |
| 
 | |
|     my ( $count, $total );
 | |
| 
 | |
|     #
 | |
|     # Use the UTC seconds for comparison
 | |
|     #
 | |
|     my $from = $dt_from->utc_rd_as_seconds();
 | |
|     my $to   = $dt_to->utc_rd_as_seconds();
 | |
| 
 | |
|     #
 | |
|     # Adjust the regex from the config file for parsing the stored links.
 | |
|     # Remove the '<>' enclosing it.
 | |
|     #
 | |
|     my $re = $config->{cache}->{regex};
 | |
|     $re =~ s/^<|>$//g;
 | |
|     $re = qr{$re};
 | |
| 
 | |
|     #
 | |
|     # Find each thread root which has messages in the date range
 | |
|     #
 | |
|     $total = 0;
 | |
|     foreach my $key ( sort( keys( %{$mhash} ) ) ) {
 | |
|         if ( !defined( $mhash->{$key}->{parent} )
 | |
|             && eligibleThread( $from, $to, $mhash, $key ) )
 | |
| #            && ( $key ge $from && $key lt $to ) )
 | |
|         {
 | |
|             #
 | |
|             # Stash the headers we want
 | |
|             #
 | |
|             foreach my $k (qw{ from to cc date subject archived-at }) {
 | |
|                 $threads{$key}->{$k} = $mhash->{$key}->{$k};
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # Make a weblink to the Gmane thread
 | |
|             # TODO What if the regex doesn't match?
 | |
|             #
 | |
|             if ( $mhash->{$key}->{'archived-at'} =~ /$re/ ) {
 | |
|                 $threads{$key}->{thread}
 | |
|                     = sprintf( $config->{gmane}->{template2}, $1 );
 | |
|             }
 | |
|             else {
 | |
|                 emit( $silent,
 | |
|                     "Unable to match the Gmane thread (in message $key)\n" );
 | |
|             }
 | |
| 
 | |
|             #
 | |
|             # Count the messages in the thread and keep a grand total
 | |
|             #
 | |
|             $count = 0;
 | |
|             threadLength( $from, $to, $mhash, $key, \$count );
 | |
|             $threads{$key}->{count} = $count;
 | |
|             $total += $count;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return $total;
 | |
| 
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: eligibleThread
 | |
| #      PURPOSE: Determines whether a given thread is eligible to be included
 | |
| #               in a date range
 | |
| #   PARAMETERS: $from           The from date as UTC seconds
 | |
| #               $to             The to date as UTC seconds
 | |
| #               $mhash          Hashref of all messages
 | |
| #               $key            Key into the message hash for this particular
 | |
| #                               thread
 | |
| #      RETURNS: True (1) if the thread is eligible, otherwise false (0)
 | |
| #  DESCRIPTION: Checks the top level message (thread root) for elegibility
 | |
| #               (date in range) then recursively descends the thread checking
 | |
| #               each message in the same manner as it goes. The results are
 | |
| #               ORed together so that if any message is in range the entire
 | |
| #               thread is.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub eligibleThread {
 | |
|     my ( $from, $to, $mhash, $key ) = @_;
 | |
| 
 | |
|     my $res = ( $key ge $from && $key lt $to );
 | |
| 
 | |
|     foreach my $k ( @{ $mhash->{$key}->{children} } ) {
 | |
|         $res ||= ( $k ge $from && $k lt $to )
 | |
|             || eligibleThread( $from, $to, $mhash, $k );
 | |
|     }
 | |
| 
 | |
|     return $res;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: threadLength
 | |
| #      PURPOSE: Count the number of eligible messages in a thread
 | |
| #   PARAMETERS: $from           The from date as UTC seconds
 | |
| #               $to             The to date as UTC seconds
 | |
| #               $mhash          Hashref of all messages
 | |
| #               $key            Key into the message hash
 | |
| #               $len            Scalar ref containing accumulated length
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Recursively descends through a thread (a tree of linked
 | |
| #               messages) counting the elements but only if they are in range.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub threadLength {
 | |
|     my ( $from, $to, $mhash, $key, $len ) = @_;
 | |
| 
 | |
|     $$len++ if ( $key ge $from && $key lt $to );
 | |
| 
 | |
|     foreach my $k ( @{ $mhash->{$key}->{children} } ) {
 | |
|         threadLength( $from, $to, $mhash, $k, $len );
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: printThread
 | |
| #      PURPOSE: To walk a message thread and print it
 | |
| #   PARAMETERS: $mhash          Hashref of all messages
 | |
| #               $key            Key into the message hash
 | |
| #               $level          Integer denoting recursion level
 | |
| #               $html           1 for HTML output, 0 for text
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Recursively descend through a thread (a tree of linked
 | |
| #               messages) printing each one. If $html is true the message is
 | |
| #               printed in HTML, otherwise it's plain text with indentation.
 | |
| #               The level of indentation is controlled by $level which
 | |
| #               increments as the tree is recursed.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub printThread {
 | |
|     my ( $mhash, $key, $level, $html ) = @_;
 | |
| 
 | |
|     $level = 0 unless $level;
 | |
|     $html  = 0 unless $html;
 | |
| 
 | |
|     printMessage( $mhash->{$key}, $level, $html );
 | |
| 
 | |
|     foreach my $k ( @{ $mhash->{$key}->{children} } ) {
 | |
|         printThread( $mhash, $k, $level + 1, $html );
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: printMessage
 | |
| #      PURPOSE: To print a message for debug purposes
 | |
| #   PARAMETERS: $msg            Hashref containing message attributes
 | |
| #               $level          Integer for indenting
 | |
| #               $html           1 for HTML output, 0 for text
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Print a message. The details held consist only of a set of
 | |
| #               headers, and these are printed as a label and a value. If
 | |
| #               $html is true the output is HTML (an unnumbered list) and only
 | |
| #               headers with values are reported. If $html is false then plain
 | |
| #               text is used, empty values are reported and indentation is
 | |
| #               used, controlled by the value of $level. In plain text mode
 | |
| #               each item ends with a line of hyphens.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub printMessage {
 | |
|     my ( $msg, $level, $html ) = @_;
 | |
| 
 | |
|     $level = 0 unless $level;
 | |
|     $html  = 0 unless $html;
 | |
| 
 | |
|     # TODO Don't do it this way, use the global definition since it's used
 | |
|     # elsewhere
 | |
|     state %tags;
 | |
|     %tags = (
 | |
|         'from'        => 'From',
 | |
|         'to'          => 'To',
 | |
|         'cc'          => 'CC',
 | |
|         'date'        => 'Date',
 | |
|         'subject'     => 'Subject',
 | |
|         'archived-at' => 'Link',
 | |
|     );
 | |
| 
 | |
|     print "<ul>" if $html;
 | |
| 
 | |
|     foreach my $k (qw{ from to cc date subject archived-at }) {
 | |
|         if ($html) {
 | |
|             if ( !empty( $msg->{$k} ) ) {
 | |
|                 printf "<li>%-8s %s</li>\n", "$tags{$k}:",
 | |
|                     coalesce( $msg->{$k}, '' );
 | |
|             }
 | |
|         }
 | |
|         else {
 | |
|             printf "%s%-8s %s\n", '  ' x $level, "$tags{$k}:",
 | |
|                 coalesce( $msg->{$k}, '' );
 | |
|         }
 | |
|     }
 | |
|     if ($html) {
 | |
|         print "</ul>\n";
 | |
|     }
 | |
|     else {
 | |
|         print '-' x 80, "\n";
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: dumpMessage
 | |
| #      PURPOSE: To print all parts of a message hash for debugging
 | |
| #   PARAMETERS: $msg            Hashref containing message attributes
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: Produces a dump of a message for debugging purposes.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub dumpMessage {
 | |
|     my ($msg) = @_;
 | |
| 
 | |
|     foreach my $k ( sort( keys( %{$msg} ) ) ) {
 | |
|         printf "%12s: %s\n", $k, coalesce( $msg->{$k}, '' );
 | |
|     }
 | |
|     print '-' x 80, "\n";
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: emit
 | |
| #      PURPOSE: Print text on STDERR unless silent mode has been selected
 | |
| #   PARAMETERS: - Boolean indicating whether to be silent or not
 | |
| #               - list of arguments to 'print'
 | |
| #      RETURNS: Nothing
 | |
| #  DESCRIPTION: This is a wrapper around 'print' to determine whether to send
 | |
| #               a message to STDERR depending on a boolean. We need this to be
 | |
| #               able to make the script silent when the -silent option is
 | |
| #               selected
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub emit {
 | |
|     unless (shift) {
 | |
|         print STDERR @_;
 | |
|     }
 | |
|     return;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Options
 | |
| #      PURPOSE: Processes command-line options
 | |
| #   PARAMETERS: $optref     Hash reference to hold the options
 | |
| #      RETURNS: Undef
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: no exceptions
 | |
| #     COMMENTS: none
 | |
| #     SEE ALSO: n/a
 | |
| #===============================================================================
 | |
| sub Options {
 | |
|     my ($optref) = @_;
 | |
| 
 | |
|     my @options = (
 | |
|         "help",    "debug=i",   "out=s",        "template=s",
 | |
|         "silent!", "checknew!", "initialise=i", "from=s",
 | |
|         "to=s",
 | |
|     );
 | |
| 
 | |
|     if ( !GetOptions( $optref, @options ) ) {
 | |
|         pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| __END__
 | |
| 
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #  Application Documentation
 | |
| #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 | |
| #{{{
 | |
| 
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| summarise_mail - Generate a summary of messages on the HPR mailing list
 | |
| 
 | |
| =head1 VERSION
 | |
| 
 | |
| This documentation refers to summarise_mail version 0.0.5
 | |
| 
 | |
| 
 | |
| =head1 USAGE
 | |
| 
 | |
|     ./summarise_mail [-help] [-debug=N] [-from=DATE] [-to=DATE] [-out=FILE]
 | |
|         [-template=FILE] [-[no]silent] [-[no]checknew] [-initialise=N]
 | |
| 
 | |
| 
 | |
| =head1 OPTIONS
 | |
| 
 | |
| =over 8
 | |
| 
 | |
| =item B<-help>
 | |
| 
 | |
| Prints a brief help message describing the usage of the program, and then exits.
 | |
| 
 | |
| =item B<-initialise=N>
 | |
| 
 | |
| This option builds a new mailbox cache. The value B<N> defines the starting
 | |
| message to be downloaded from the Gmane site. The number defaults to 1, though
 | |
| this is not advisable since it potentially overloads the Gmane server. The
 | |
| number of messages downloaded is limited by the B<lookahead> value in the
 | |
| configuration file (see below).
 | |
| 
 | |
| If a cache file already exists the script will not run. It is necessary to
 | |
| delete this file before initialising a new one.
 | |
| 
 | |
| After the cache is initialised the script will process the messages it
 | |
| contains.
 | |
| 
 | |
| The B<-initialise=N> and the B<-[no]checknew> options are mutually exclusive.
 | |
| 
 | |
| =item B<-[no]checknew>
 | |
| 
 | |
| This option defines whether the script will check Gmane for updates to the
 | |
| mailing list. If updates are found then they will be incorporated into the
 | |
| cached mailbox file.
 | |
| 
 | |
| If omitted no check is made (B<-nochecknew>).
 | |
| 
 | |
| The B<-initialise=N> and the B<-[no]checknew> options are mutually exclusive.
 | |
| 
 | |
| =item B<-from=DATE>
 | |
| 
 | |
| Specifies the starting date for the mail summary.
 | |
| 
 | |
| The date format should be B<DD-Mon-YYYY> (e.g. 12-Jun-2014), B<DD-MM-YYYY>
 | |
| (e.g. 12-06-2014) or B<YYYY-MM-DD> (e.g. 2014-06-12).
 | |
| 
 | |
| If this option is omitted the current date is used.
 | |
| 
 | |
| =item B<-to=DATE>
 | |
| 
 | |
| Specifies the ending date for the mail summary.
 | |
| 
 | |
| The date format should be B<DD-Mon-YYYY> (e.g. 12-Jun-2014), B<DD-MM-YYYY>
 | |
| (e.g. 12-06-2014) or B<YYYY-MM-DD> (e.g. 2014-06-12).
 | |
| 
 | |
| If this option is omitted the last day of the month defined by the B<-from>
 | |
| date is used.
 | |
| 
 | |
| =item B<-template=FILE>
 | |
| 
 | |
| This option defines the template that will be used to format the report
 | |
| generated by the script.
 | |
| 
 | |
| If the option is not provided, the script will use a template called
 | |
| B<mailnote_template.tpl> in the directory which holds the script. This
 | |
| template generates HTML suitable for incorporation into the Community News
 | |
| show notes.
 | |
| 
 | |
| =item B<-out=FILE>
 | |
| 
 | |
| This option defines the output file for the report.  If the option is omitted
 | |
| the report is written to STDOUT, allowing it to be redirected if required.
 | |
| 
 | |
| The output file name may contain the characters 'B<%s>'. This denotes the point
 | |
| at which the year and month in the format B<YYYY-MM> are inserted. For example
 | |
| if the script is being run for July 2014 the option:
 | |
| 
 | |
|     -out=mailreport_%s.html
 | |
| 
 | |
| will cause the generation of the file:
 | |
| 
 | |
|     mailreport_2014-07.html
 | |
| 
 | |
| =item B<-[no]silent>
 | |
| 
 | |
| This option controls whether the script reports minimal details of what it is
 | |
| doing to STDERR. If the option is omitted the report is generated (B<-nosilent>).
 | |
| 
 | |
| The script reports: the starting and ending dates it is working on and the name of the output file
 | |
| (if appropriate).
 | |
| 
 | |
| =item B<-debug=N>
 | |
| 
 | |
| Runs the script in debug mode if the value of B<N> is greater than 0. The
 | |
| default setting is zero.
 | |
| 
 | |
| In debug mode the script reports various pieces of information about the
 | |
| parsing of the mail messages and building of threads.
 | |
| 
 | |
| =back
 | |
| 
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| =head2 Overview
 | |
| 
 | |
| This script generates a summary of the mail sent to the HPR mailing list over
 | |
| a period.
 | |
| 
 | |
| Messages to the mailing list are copied to the free B<Gmane> service and the
 | |
| script uses this to prepare its report. The Gmane service offers
 | |
| a downloadable interface at
 | |
| http://download.gmane.org/gmane.network.syndication.podcast.hacker-public-radio
 | |
| where selected messages can be collected in B<MBOX> format.
 | |
| 
 | |
| =head2 Methodology
 | |
| 
 | |
| The script operates in three phases:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item -
 | |
| 
 | |
| Messages are collected from Gmane and stored in a local cache (a file in MBOX
 | |
| format). This is expected to be done once, and thereafter new messages are
 | |
| appended to this file.
 | |
| 
 | |
| =item -
 | |
| 
 | |
| New messages are appended to the cache file. This is done by determining the
 | |
| last message in the cache then requesting the next block of messages from
 | |
| Gmane. The downloaded messages are then added to the file.
 | |
| 
 | |
| =item -
 | |
| 
 | |
| The cache is processed to generate the report. Usually the script is given
 | |
| a start date (and optional end date, though this defaults to the end of the
 | |
| month in the start date).
 | |
| 
 | |
| The algorithm is as follows:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item -
 | |
| 
 | |
| The script parses out the important headers of all messages in the cache.
 | |
| 
 | |
| =item -
 | |
| 
 | |
| The script walks the header structure in time order joining messages based on
 | |
| their 'In-reference-to:' and 'References:' headers. Each message gets its
 | |
| parent recorded as a pointer, and its children as an array of pointers. If
 | |
| a reference is to a message that is not in the cache then it is not recorded.
 | |
| 
 | |
| =item -
 | |
| 
 | |
| At this point all messages without a parent are either the thread roots or
 | |
| cases where someone has not replied properly. The script performs a further
 | |
| join when an "orphaned" message has the same subject as another with a 'Re:'
 | |
| on the front.
 | |
| 
 | |
| =item -
 | |
| 
 | |
| Now all messages without parents are the best that can be done to detect
 | |
| thread roots. These are used to make the report.
 | |
| 
 | |
| =item -
 | |
| 
 | |
| The messages that cannot be threaded are those where the sender does not use
 | |
| 'Reply' or where they reply to a digest. Digests are not threaded because they
 | |
| are blobs of multiple messages.
 | |
| 
 | |
| =item -
 | |
| 
 | |
| Some messages may be threaded erroneously. This happens when a sender, rather
 | |
| than generating a new message, replies to an existing message but changes the
 | |
| 'Subject:' line. There is not much that can be done to correct this.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head2 Report Template
 | |
| 
 | |
| The script uses the B<Template> module to format the report. See the default
 | |
| template B<mailnote_template.tpl> for an example.
 | |
| 
 | |
| The template is passed two variables:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<total>
 | |
| 
 | |
| This is a scalar variable containing a number: the count of messages in the
 | |
| specified range.
 | |
| 
 | |
| =item B<threads>
 | |
| 
 | |
| This is a hash indexed by the message date converted to UTC date then to the
 | |
| number of seconds since the Unix epoch. Each hash value is a hashref
 | |
| containing the following fields:
 | |
| 
 | |
|  Field          Contents
 | |
|  -----          --------
 | |
|  from           The contents of the 'From:' header
 | |
|  to             The contents of the 'To:' header
 | |
|  cc             The contents of the 'Cc:' header
 | |
|  date           The contents of the 'Date:' header
 | |
|  subject        The contents of the 'Subject:' header
 | |
|  archived-at    The contents of the 'Archived-at:' header (the Gmane
 | |
|                 permalink URL)
 | |
|  count          The number of messages in the thread
 | |
| 
 | |
| These messages are the roots of all of the detectable threads in the period.
 | |
| Sorting them by the main hash key results in the threads being ordered by
 | |
| their timestamp.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DIAGNOSTICS
 | |
| 
 | |
| A list of every error and warning message that the application can generate
 | |
| (even the ones that will "never happen"), with a full explanation of each
 | |
| problem, one or more likely causes, and any suggested remedies. If the
 | |
| application generates exit status codes (e.g. under Unix) then list the exit
 | |
| status associated with each error.
 | |
| 
 | |
| 
 | |
| =head1 CONFIGURATION AND ENVIRONMENT
 | |
| 
 | |
| The script obtains the information it requires to find the mail details from
 | |
| B<Gmane> from a configuration file. The name of the file it expects is
 | |
| B<.summarise_mail.cfg> in the
 | |
| directory holding the script. To change this will require changing the script.
 | |
| 
 | |
| The configuration file format is as follows:
 | |
| 
 | |
|  <gmane>
 | |
|      url = http://download.gmane.org/gmane.network.syndication.podcast.hacker-public-radio
 | |
|      template = "$url/%d/%d"
 | |
|      lookahead = 100
 | |
|  </gmane>
 | |
| 
 | |
|  <cache>
 | |
|      directory = /home/dave/Community_News/mail_cache
 | |
|      filename = gmane.mbox
 | |
|      regex = "<http://permalink\.gmane\.org/gmane\.network\.syndication\.podcast\.hacker-public-radio/(\d+)>"
 | |
|  </cache>
 | |
| 
 | |
| The elements of this configuration file are explained below. First the
 | |
| B<gmane> section:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<url>
 | |
| 
 | |
| This is the base URL on the Gmane site. It is used to construct URLs for
 | |
| collecting mail messages. This should not require any changes.
 | |
| 
 | |
| =item B<template>
 | |
| 
 | |
| This is a template which is used to generate the actual URL used to download
 | |
| messages. The script replaces each '%d' with a number. The first is the
 | |
| starting message number, and the second the ending number plus one.
 | |
| 
 | |
| This should not require any changes.
 | |
| 
 | |
| =item B<lookahead>
 | |
| 
 | |
| This defines the maximum number of messages to be collected. This number is
 | |
| added to the first number used when contructing the URL from the B<template>
 | |
| above. Normally just a few messages are downloaded at a time, but when the
 | |
| script is used to initialise the mailbox cache file this number of messages may
 | |
| be downloaded.
 | |
| 
 | |
| This value can be changed if desired, but it is set at 100 in order to avoid
 | |
| overloading the Gmane servers.
 | |
| 
 | |
| =back
 | |
| 
 | |
| The B<cache> section contains the following:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<directory>
 | |
| 
 | |
| This defines the full path to the directory holding the mail cache file.
 | |
| 
 | |
| This may be changed to reflect a change of location.
 | |
| 
 | |
| =item B<filename>
 | |
| 
 | |
| This defines the name of the mail cache. It is expected to be found in the
 | |
| directory defined above.
 | |
| 
 | |
| This may be changed to reflect a change of name.
 | |
| 
 | |
| =item B<regex>
 | |
| 
 | |
| Individual messages in the cache each hold an URL which provides a permanent
 | |
| link to the message on the Gmane site. The script uses the regular expression
 | |
| defined here to collect the message number from this URL. This is how the
 | |
| script determines the number of the oldest message in the cache when looking
 | |
| for new messages.
 | |
| 
 | |
| This expression should only be changed with great caution.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DEPENDENCIES
 | |
| 
 | |
|  Getopt::Long
 | |
|  Pod::Usage
 | |
|  Config::General
 | |
|  LWP::UserAgent
 | |
|  Mail::Box::Manager
 | |
|  Date::Parse
 | |
|  Date::Calc
 | |
|  DateTime
 | |
|  DateTime::TimeZone
 | |
|  Template
 | |
|  Template::Filters
 | |
|  Data::Dumper
 | |
| 
 | |
| 
 | |
| =head1 BUGS AND LIMITATIONS
 | |
| 
 | |
| There are no known bugs in this module.
 | |
| Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
 | |
| Patches are welcome.
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| Dave Morriss (Dave.Morriss@gmail.com)
 | |
| 
 | |
| 
 | |
| =head1 LICENCE AND COPYRIGHT
 | |
| 
 | |
| Copyright (c) 2014, 2015 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
 | |
| 
 | |
| This module is free software; you can redistribute it and/or
 | |
| modify it under the same terms as Perl itself. See perldoc perlartistic.
 | |
| 
 | |
| This program is distributed in the hope that it will be useful,
 | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| #}}}
 | |
| 
 | |
| # [zo to open fold, zc to close]
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 |