1711 lines
56 KiB
Perl
Executable File
1711 lines
56 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
#===============================================================================
|
|
#
|
|
# FILE: summarise_mail
|
|
#
|
|
# USAGE: ./summarise_mail [-help] [-debug=N] [-from=DATE] [-to=DATE]
|
|
# [-out=FILE] [-template=FILE] [-[no]silent] [-[no]checknew]
|
|
# [-initialise=N]
|
|
#
|
|
# DESCRIPTION: Generates a summary of the HPR mailing list for the Community
|
|
# News
|
|
#
|
|
# OPTIONS: ---
|
|
# REQUIREMENTS: ---
|
|
# BUGS: ---
|
|
# NOTES: ---
|
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
|
# VERSION: 0.0.5
|
|
# CREATED: 2015-02-15 15:06:11
|
|
# REVISION: 2015-10-04 20:21:44
|
|
#
|
|
#-------------------------------------------------------------------------------
|
|
# TODO
|
|
#
|
|
# - Add a means of reporting all of the message bodies in a thread. They
|
|
# ideally need formatting and should be printable so that a template can
|
|
# generate plain text versions for use by the Community News hosts when
|
|
# reviewing the last month's messages. This is quite a large task but maybe
|
|
# there are mail formatters that can do it.
|
|
#
|
|
#===============================================================================
|
|
|
|
use 5.010;
|
|
use strict;
|
|
use warnings;
|
|
use utf8;
|
|
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
use Config::General;
|
|
|
|
use LWP::UserAgent;
|
|
use Mail::Box::Manager;
|
|
|
|
use Date::Parse;
|
|
use Date::Calc qw{:all};
|
|
use DateTime;
|
|
use DateTime::TimeZone;
|
|
|
|
use Template;
|
|
use Template::Filters;
|
|
Template::Filters->use_html_entities; # Use HTML::Entities in the template
|
|
|
|
use Data::Dumper;
|
|
|
|
#
|
|
# Version number (manually incremented)
|
|
#
|
|
our $VERSION = '0.0.5';
|
|
|
|
#
|
|
# Script and directory names
|
|
#
|
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
|
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
|
$DIR = '.' unless $DIR;
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Declarations
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# Constants and other declarations
|
|
#
|
|
my $basedir = "$ENV{HOME}/HPR/Community_News";
|
|
my $configfile = "$basedir/.$PROG.cfg";
|
|
my $bpfile = "$basedir/mailnote_template.tpl";
|
|
|
|
my (%dtargs, $dt, $tz, $rd, $mid,
|
|
$irt, $ref, $date, @parsed, %msg,
|
|
%id, $dt_from, $dt_to, %threads, $total,
|
|
);
|
|
|
|
my %tags = (
|
|
'from' => 'From',
|
|
'to' => 'To',
|
|
'cc' => 'CC',
|
|
'date' => 'Date',
|
|
'subject' => 'Subject',
|
|
'archived-at' => 'Link',
|
|
);
|
|
|
|
#
|
|
# Enable Unicode mode
|
|
#
|
|
binmode STDOUT, ":encoding(UTF-8)";
|
|
binmode STDERR, ":encoding(UTF-8)";
|
|
|
|
#
|
|
# Load configuration data
|
|
#
|
|
my $conf = Config::General->new(
|
|
-ConfigFile => $configfile,
|
|
-InterPolateVars => 1,
|
|
-ExtendedAccess => 1,
|
|
) or die "Unable to open $configfile\n";
|
|
|
|
my %config = $conf->getall() or die "Unable to process $configfile\n";
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Options and arguments
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# Process options
|
|
#
|
|
my %options;
|
|
Options( \%options );
|
|
|
|
#
|
|
# Default help
|
|
#
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
|
|
if ( $options{'help'} );
|
|
|
|
#
|
|
# Collect options
|
|
#
|
|
my $debug = ( defined( $options{debug} ) ? $options{debug} : 0 );
|
|
my $initialise = ( defined( $options{initialise} ) ? 1 : 0 );
|
|
my $start = $options{initialise};
|
|
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
|
|
my $checknew = ( defined( $options{checknew} ) ? $options{checknew} : 0 );
|
|
my $outfile = $options{out};
|
|
|
|
my @fromdate;
|
|
if ( defined( $options{from} ) ) {
|
|
@fromdate = parseDate( $options{from} );
|
|
}
|
|
|
|
my @todate;
|
|
if ( defined( $options{to} ) ) {
|
|
@todate = parseDate( $options{to} );
|
|
}
|
|
|
|
my $template
|
|
= ( defined( $options{template} ) ? $options{template} : $bpfile );
|
|
|
|
#
|
|
# Check for illegal option combinations
|
|
#
|
|
die "Use only one of -initialise and -checknew\n"
|
|
if ( $initialise && $checknew );
|
|
|
|
#
|
|
# Sanity checks
|
|
#
|
|
die "Error: Unable to find template $template\n" unless -r $template;
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Default the from date to today and compute a to date if one wasn't given
|
|
#-------------------------------------------------------------------------------
|
|
if ( scalar(@fromdate) == 0 ) {
|
|
@fromdate = Today();
|
|
}
|
|
|
|
if ( scalar(@todate) == 0 ) {
|
|
@todate = Add_Delta_Days( @fromdate[ 0, 1 ],
|
|
1, Days_in_Month( @fromdate[ 0, 1 ] ) );
|
|
}
|
|
|
|
#
|
|
# We need DateTime dates for comparison
|
|
#
|
|
emit( $silent, "From: ", ISO8601_Date(@fromdate), "\n" );
|
|
emit( $silent, "To: ", ISO8601_Date(@todate), "\n" );
|
|
|
|
$dt_from = calcToDT(@fromdate);
|
|
$dt_to = calcToDT(@todate);
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Open the output file (or STDOUT) - we may need the date to do it
|
|
#-------------------------------------------------------------------------------
|
|
my $outfh;
|
|
if ($outfile) {
|
|
$outfile = sprintf( $outfile,
|
|
sprintf( "%d-%02d", $dt_from->year, $dt_from->month ) )
|
|
if ( $outfile =~ /%s/ );
|
|
emit( $silent, "Output: ", $outfile, "\n" );
|
|
|
|
open( $outfh, ">:encoding(UTF-8)", $outfile )
|
|
or die "Unable to open $outfile for writing: $!";
|
|
}
|
|
else {
|
|
open( $outfh, ">&", \*STDOUT )
|
|
or die "Unable to initialise for writing: $!";
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Check the configuration file
|
|
#-------------------------------------------------------------------------------
|
|
checkConfig( \%config );
|
|
|
|
#
|
|
# The cached mail file
|
|
#
|
|
my $mailfile = join( '/', @{ $config{cache} }{ 'directory', 'filename' } );
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# If asked to initialise the cache check it makes sense, otherwise look to see
|
|
# if we're to update an existing cache
|
|
#-------------------------------------------------------------------------------
|
|
if ($initialise) {
|
|
if ( -e $mailfile ) {
|
|
die "The mail cache $mailfile exists.\n"
|
|
. "Delete it before initialising\n";
|
|
}
|
|
unless (initialiseCache( \%config, $start )) {
|
|
print "Failed to initialise the cache; can't continue\n";
|
|
exit;
|
|
}
|
|
}
|
|
elsif ($checknew) {
|
|
unless ( -e $mailfile ) {
|
|
die "The mail cache $mailfile does not exist\n"
|
|
. "Initialise it before checking for updates\n";
|
|
}
|
|
unless (updateCache( \%config )) {
|
|
print "Failed to update the cache; continuing with old mail\n";
|
|
}
|
|
}
|
|
|
|
#
|
|
# Open the mail file (as a folder in MBOX format)
|
|
#
|
|
my $mgr = Mail::Box::Manager->new;
|
|
my $folder = $mgr->open($mailfile);
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Walk through the messages in the folder collecting message details into
|
|
# a hash structure indexed by the UTC timestamp.
|
|
#-------------------------------------------------------------------------------
|
|
foreach my $message ( $folder->messages ) {
|
|
#
|
|
# TODO This doesn't work for a reason I don't understand. Needs to be
|
|
# fixed
|
|
#
|
|
#$message->unfold();
|
|
|
|
#
|
|
# Parse and convert the message date. The @parsed array contains
|
|
# $ss,$mm,$hh,$day,$month,$year,$zone. The year and month need adjustment.
|
|
# The timezone value is number of seconds from UTC.
|
|
#
|
|
$date = $message->get('Date');
|
|
@parsed = strptime($date);
|
|
$parsed[5] += 1900;
|
|
$parsed[4] += 1;
|
|
|
|
#
|
|
# Convert the parsed date to a DateTime
|
|
#
|
|
@dtargs{ 'second', 'minute', 'hour', 'day', 'month', 'year' } = (@parsed);
|
|
$dt = DateTime->new(%dtargs);
|
|
$tz = DateTime::TimeZone->new(
|
|
name => DateTime::TimeZone->offset_as_string( $parsed[6] ) );
|
|
$dt->set_time_zone($tz);
|
|
|
|
#
|
|
# Stash mail parameters
|
|
#
|
|
$rd = $dt->utc_rd_as_seconds();
|
|
$mid = formatID( $message->get('Message-ID'), 0 );
|
|
$irt = formatID( $message->get('In-Reply-To'), 1 );
|
|
|
|
my @from = $message->from;
|
|
my @to = $message->to;
|
|
my @cc = $message->cc;
|
|
|
|
$msg{$rd} = {
|
|
# '_date' => $date,
|
|
# '_from' => \@ffrom,
|
|
# '_to' => \@fto,
|
|
# '_cc' => \@fcc,
|
|
'_mid' => $message->get('Message-ID'),
|
|
'_irt' => $message->get('In-Reply-To'),
|
|
'_rdate' => $rd,
|
|
'date' => $dt->ymd . ' ' . $dt->hms . ' ' . $dt->time_zone->name,
|
|
'from' => formatEmail( \@from ),
|
|
'to' => formatEmail( \@to ),
|
|
'cc' => formatEmail( \@cc ),
|
|
'subject' => $message->get('Subject'),
|
|
'archived-at' => trimHeader( $message->get('Archived-At') ),
|
|
'message-id' => $mid,
|
|
'in-reply-to' => $irt,
|
|
'references' => formatReferences( $message->get('References') ),
|
|
'parent' => undef,
|
|
'children' => [],
|
|
};
|
|
|
|
#
|
|
# Stash the message id for easier linking
|
|
#
|
|
$id{$mid} = $rd;
|
|
}
|
|
|
|
$folder->close;
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Link the messages according to their internal back-references. Rescue any
|
|
# broken threads caused by people not using their mail clients properly.
|
|
#-------------------------------------------------------------------------------
|
|
linkMessages( \%msg, \%id, ( $debug == 0 ) );
|
|
repairThreads( \%msg, ( $debug == 0 ) );
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Generate the hash of message thread details that start in the selected
|
|
# period. Prepare the template and generate the final document
|
|
#-------------------------------------------------------------------------------
|
|
$total = buildThreadSummary( $dt_from, $dt_to, \%config, \%msg, \%threads,
|
|
( $debug == 0 ) );
|
|
|
|
my $tt = Template->new(
|
|
{ ABSOLUTE => 1,
|
|
ENCODING => 'utf8',
|
|
INCLUDE_PATH => $basedir,
|
|
}
|
|
);
|
|
my $vars = {
|
|
total => $total,
|
|
threads => \%threads,
|
|
};
|
|
my $document;
|
|
$tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
|
|
|| die $tt->error(), "\n";
|
|
print $outfh $document;
|
|
close($outfh);
|
|
|
|
exit;
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: checkConfig
|
|
# PURPOSE: Check the configuration, aborting if it's messed up and
|
|
# missing necessary items
|
|
# PARAMETERS: $config configuration hash
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Checks that all of the required items are present in the
|
|
# $config hash. These have been parsed from the configuration
|
|
# file usually called '.summarise_mail.cfg'. It only performs
|
|
# existence checks, nothing more fancy.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub checkConfig {
|
|
my ($config) = @_;
|
|
|
|
die "Configuration file problem\n"
|
|
unless ( defined( $config->{gmane}->{url} )
|
|
&& defined( $config->{gmane}->{template1} )
|
|
&& defined( $config->{gmane}->{thread} )
|
|
&& defined( $config->{gmane}->{template2} )
|
|
&& defined( $config->{gmane}->{lookahead} )
|
|
&& defined( $config->{cache}->{directory} )
|
|
&& defined( $config->{cache}->{filename} )
|
|
&& defined( $config->{cache}->{regex} ) );
|
|
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: initialiseCache
|
|
# PURPOSE: Load a new cache from Gmane
|
|
# PARAMETERS: $config Configuration hashref
|
|
# $start Start number to download
|
|
# RETURNS: If the GET succeeded then 1 otherwise 0
|
|
# DESCRIPTION: Creates a new cache file of mail downloaded from Gmane. The
|
|
# file is in MBOX format but has had Gmane's obscuration
|
|
# algorithms applied to it, messing up certain elements like
|
|
# message-ids. The download begins with message $start and
|
|
# finishes with $start+LOOKAHEAD where the lookahead value is
|
|
# defined in the configuration file; usually 100.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub initialiseCache {
|
|
my ( $config, $start ) = @_;
|
|
|
|
#
|
|
# How much to look ahead in Gmane; defined in the configuration
|
|
#
|
|
my $lookahead = $config->{gmane}->{lookahead};
|
|
|
|
#
|
|
# The name of the mail cache and its directory are in the configuration
|
|
#
|
|
my $mailfile
|
|
= join( '/', @{ $config->{cache} }{ 'directory', 'filename' } );
|
|
|
|
#
|
|
# Make the URL for download
|
|
#
|
|
my $url = sprintf( $config->{gmane}->{template1},
|
|
$start, $start + $lookahead + 1 );
|
|
print "URL: $url\n";
|
|
|
|
#
|
|
# Set up the HTTP GET
|
|
#
|
|
my $ua = LWP::UserAgent->new;
|
|
$ua->agent('HPR-Agent/0.1');
|
|
$ua->timeout(10);
|
|
|
|
#
|
|
# Get the data (if any)
|
|
#
|
|
my $response = $ua->get($url);
|
|
|
|
if ( $response->is_success ) {
|
|
#
|
|
# The GET succeeded, see what came back
|
|
#
|
|
if ( length( ${ $response->content_ref } ) > 0 ) {
|
|
#
|
|
# We got some new data. Append it to the rest
|
|
#
|
|
print "Mail messages found\n";
|
|
open( my $fh, ">", $mailfile )
|
|
or die "Unable to open $mailfile for writing\n";
|
|
print $fh $response->decoded_content;
|
|
close($fh);
|
|
}
|
|
else {
|
|
print "No messages found\n";
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
else {
|
|
#
|
|
# The GET failed in a nasty way
|
|
#
|
|
warn $response->status_line;
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: updateCache
|
|
# PURPOSE: Get the latest mail messages from Gmane and save them in the
|
|
# cache
|
|
# PARAMETERS: $config configuration hash
|
|
# RETURNS: If the GET succeeded then 1 otherwise 0
|
|
# DESCRIPTION: Adds more messages downloaded from Gmane to the end of the
|
|
# current cache file. It needs to work out the number of the
|
|
# last message in the cache, which it does with a regex, looking
|
|
# at 'Archived-At:' headers. Then it adds the lookahead value
|
|
# from the configuration file to that and downloads a maximum of
|
|
# that number of messages using the Gmane API. These messages
|
|
# are appended to the existing MBOX file. There is no means of
|
|
# expiring older messages, and perhaps there should be.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub updateCache {
|
|
my ($config) = @_;
|
|
|
|
#
|
|
# How much to look ahead in Gmane; defined in the configuration
|
|
#
|
|
my $lookahead = $config->{gmane}->{lookahead};
|
|
|
|
#
|
|
# The regex to parse the permalink URL is in the configuration
|
|
#
|
|
my $re = qr{$config->{cache}->{regex}};
|
|
|
|
#
|
|
# The name of the mail cache and its directory are in the configuration
|
|
#
|
|
my $mailfile
|
|
= join( '/', @{ $config->{cache} }{ 'directory', 'filename' } );
|
|
|
|
#
|
|
# Open the mail file
|
|
#
|
|
my $mgr = Mail::Box::Manager->new;
|
|
my $folder = $mgr->open($mailfile);
|
|
|
|
#
|
|
# Walk all the messages, parsing the permalink URLs to get the start and
|
|
# end
|
|
#
|
|
my $plink;
|
|
my ( $msgno, $start, $end ) = ( 0, 0, 0 );
|
|
foreach my $message ( $folder->messages ) {
|
|
$plink = $message->get('Archived-At');
|
|
if ( ($msgno) = ( $plink =~ /$re/ ) ) {
|
|
if ( $start eq 0 ) {
|
|
$start = $msgno;
|
|
}
|
|
else {
|
|
$end = $msgno;
|
|
}
|
|
}
|
|
}
|
|
$folder->close;
|
|
|
|
#
|
|
# Build the new URL to get more messages, starting from the last one we
|
|
# have plus 1 for $lookahead+1 messages
|
|
#
|
|
my $url = sprintf( $config->{gmane}->{template1},
|
|
$end + 1, $end + $lookahead + 1 );
|
|
print "URL: $url\n";
|
|
|
|
#
|
|
# Set up the HTTP GET
|
|
#
|
|
my $ua = LWP::UserAgent->new;
|
|
$ua->agent('HPR-Agent/0.1');
|
|
$ua->timeout(10);
|
|
|
|
#
|
|
# Get the data (if any)
|
|
#
|
|
my $response = $ua->get($url);
|
|
|
|
if ( $response->is_success ) {
|
|
#
|
|
# The GET succeeded, see what came back
|
|
#
|
|
if ( length( ${ $response->content_ref } ) > 0 ) {
|
|
#
|
|
# We got some new data. Append it to the rest
|
|
#
|
|
print "New messages found\n";
|
|
open( my $fh, ">>", $mailfile )
|
|
or die "Unable to open $mailfile for appending\n";
|
|
print $fh $response->decoded_content;
|
|
close($fh);
|
|
}
|
|
else {
|
|
print "No new messages\n";
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
else {
|
|
#
|
|
# The GET failed in a nasty way
|
|
#
|
|
warn $response->status_line;
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: parseDate
|
|
# PURPOSE: Parse and check a date
|
|
# PARAMETERS: $date Date string
|
|
# RETURNS: Date::Calc date or 'undef' if $date is undefined
|
|
# DESCRIPTION: Parses a date string using 'strptime' and converts it to
|
|
# a Date::Calc date list
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub parseDate {
|
|
my ($date) = @_;
|
|
|
|
if ( defined($date) ) {
|
|
#
|
|
# Parse and format the date
|
|
#
|
|
my @parsed = strptime($date);
|
|
@parsed[ 4, 5 ] = ( $parsed[4] + 1, $parsed[5] + 1900 );
|
|
|
|
return ( @parsed[ 5, 4, 3 ] );
|
|
}
|
|
else {
|
|
return; # implicit undef
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: ISO8601_Date
|
|
# PURPOSE: Format a Date::Calc date in ISO8601 format
|
|
# PARAMETERS: @date - a date in the Date::Calc format
|
|
# RETURNS: Text string containing a YYYY-MM-DD date
|
|
# DESCRIPTION: Just a convenience to allow a simple call like
|
|
# $str = ISO8601_Date(@date)
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub ISO8601_Date {
|
|
my (@date) = (@_)[ 0, 1, 2 ];
|
|
|
|
if ( check_date(@date) ) {
|
|
return sprintf( "%04d-%02d-%02d", @date );
|
|
}
|
|
else {
|
|
return "*Invalid Date*";
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: calcToDT
|
|
# PURPOSE: Convert a Date::Calc date to a UTC DateTime date
|
|
# PARAMETERS: $date Date::Calc date
|
|
# RETURNS: A DateTime date in the UTC timezone
|
|
# DESCRIPTION: Reformats a Date::Calc date to a DateTime date based on UTC
|
|
# (for date comparison)
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub calcToDT {
|
|
my (@date) = (@_)[ 0, 1, 2 ];
|
|
|
|
#
|
|
# Transfer Date::Calc values into a hash for initialising a DateTime
|
|
# object.
|
|
#
|
|
my ( %dtargs, $dt );
|
|
@dtargs{ 'year', 'month', 'day', 'time_zone' } = ( @date, 'UTC' );
|
|
$dt = DateTime->new(%dtargs);
|
|
return $dt;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: formatEmail
|
|
# PURPOSE: Trims an array of anonymised Gmane email addresss in
|
|
# Mail::Address objects
|
|
# PARAMETERS: $remails The array of Mail::Address objects
|
|
# RETURNS: An array containing the trimmed addresses formatted as text
|
|
# DESCRIPTION: These addresses look like "Ken Fallon
|
|
# <ken-sq8tSfcSOvWzQB+pC5nmwQ@public.gmane.org>". The display
|
|
# name part (Ken Fallon) is anonymised if it contains an
|
|
# address-like specification. The local-part of the address is
|
|
# anonymised and the domain replaced by a Gmane domain. The
|
|
# front part of the local-part is unaffected. We want to build
|
|
# an address from the display name (if there is one), and the
|
|
# front of the local-part. So the above becomes:
|
|
# "Ken Fallon <ken@...>"
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub formatEmail {
|
|
my ($remails) = @_;
|
|
|
|
my ( $phrase, $address, $comment, @results );
|
|
|
|
foreach my $email ( @{$remails} ) {
|
|
#
|
|
# Parse out the components
|
|
#
|
|
( $phrase, $address, $comment )
|
|
= ( $email->phrase, $email->address, $email->comment );
|
|
|
|
#
|
|
# Assume Gmane obscured any email address wherever it is. Strip out
|
|
# the obscuring part to make it more readable
|
|
#
|
|
foreach my $item ( $phrase, $address, $comment ) {
|
|
if ( $item =~ /^"?([^-]+).+@.+$/ ) {
|
|
$item = "$1\@...";
|
|
}
|
|
}
|
|
|
|
#
|
|
# Let the module reassemble the address in the right format, and add
|
|
# it to the result list
|
|
#
|
|
my $obj = Mail::Address->new( $phrase, $address, $comment );
|
|
push( @results, $obj->format );
|
|
}
|
|
|
|
return \@results;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: formatID
|
|
# PURPOSE: Reformat a Message-ID: or In-Reply-To: value to undo the
|
|
# effects of Gmane's mail address obscuration which is applied
|
|
# to one but not the other.
|
|
# PARAMETERS: $header The header for processing
|
|
# $gmane Boolean to indicate whether we need to strip
|
|
# a Gmane obscuration part
|
|
# RETURNS: The processed header
|
|
# DESCRIPTION: If $gmane is true recognises the possible Gmane obscuration
|
|
# patterns and extracts the important elements from the ID,
|
|
# otherwise just strips the '@' and what follows. This is
|
|
# because Gmane (presumably) detects the '@' in every header
|
|
# type, assumes it's an address and applies its obscuring
|
|
# technique to it. Shame they apply such a sledgehammer
|
|
# algorithm.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub formatID {
|
|
my ( $header, $gmane ) = @_;
|
|
|
|
return $header unless defined($header);
|
|
|
|
$header =~ s/(^<|>$)//g;
|
|
if ($gmane) {
|
|
if ( $header =~ /^(.+)-[^-]+@.+$/ ) {
|
|
$header = "$1";
|
|
}
|
|
elsif ( $header =~ /^(.+)@.+$/ ) {
|
|
$header = "$1";
|
|
}
|
|
}
|
|
else {
|
|
$header =~ s/@.+$//;
|
|
}
|
|
|
|
return $header;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: formatReferences
|
|
# PURPOSE: Reformat a list of references in the References: header
|
|
# PARAMETERS: $header The header for processing
|
|
# RETURNS: The processed header as an arrayref
|
|
# DESCRIPTION: Removes the '<>' around each reference, but also removes the
|
|
# bit after and including the '@' since we need to compare it
|
|
# with the ID values we have had to tidy after Gmane got to
|
|
# them.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub formatReferences {
|
|
my ($header) = @_;
|
|
|
|
return $header unless defined($header);
|
|
|
|
my ( @refs, @results );
|
|
|
|
@refs = split( /\s+/, $header );
|
|
foreach my $ref (@refs) {
|
|
$ref =~ s/(^<|>$)//g;
|
|
$ref =~ s/@.+$//;
|
|
push( @results, $ref );
|
|
}
|
|
|
|
return \@results;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: trimHeader
|
|
# PURPOSE: Trims an arbitrary header which is enclosed in "<..>"
|
|
# PARAMETERS: $header The header for trimming
|
|
# RETURNS: The trimmed header
|
|
# DESCRIPTION: Certain Gmane headers like "Message-Id" are enclosed in "<>"
|
|
# marks. We want to strip these and return the result
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub trimHeader {
|
|
my ($header) = @_;
|
|
|
|
return $header unless defined($header);
|
|
|
|
$header =~ s/(^<|>$)//g;
|
|
|
|
return $header;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: coalesce
|
|
# PURPOSE: To find the first defined argument and return it
|
|
# PARAMETERS: Arbitrary number of arguments
|
|
# RETURNS: The first defined argument or undef if there are none
|
|
# DESCRIPTION: Modelled on the SQL function of the same name. It takes a list
|
|
# of arguments, scans it for the first one that is not undefined
|
|
# and returns it. If an argument is defined and it's an arrayref
|
|
# then the referenced array is returned comma-delimited. This
|
|
# allows calls such as "coalesce($var,'undef')" which returns
|
|
# the value of $var if it's defined, and 'undef' if not and
|
|
# doesn't break anything along the way.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub coalesce {
|
|
foreach (@_) {
|
|
if ( defined($_) ) {
|
|
if ( ref($_) eq 'ARRAY' ) {
|
|
return join( ',', @{$_} );
|
|
}
|
|
else {
|
|
return $_;
|
|
}
|
|
}
|
|
}
|
|
return; # implicit undef
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: empty
|
|
# PURPOSE: Determine whether the argument contains data
|
|
# PARAMETERS: $arg Argument
|
|
# RETURNS: 0 if data found, otherwise 1
|
|
# DESCRIPTION: If an argument is defined and it's an arrayref then the array
|
|
# must contain elements otherwise it's regarded as empty. An
|
|
# undefined argument is empty of course.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub empty {
|
|
my ($arg) = @_;
|
|
|
|
if ( defined($arg) ) {
|
|
if ( ref($arg) eq 'ARRAY' ) {
|
|
return scalar(@$arg) == 0;
|
|
}
|
|
else {
|
|
return 0;
|
|
}
|
|
}
|
|
else {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: linkMessages
|
|
# PURPOSE: Link the collected messages using the 'In-Reply-To:' and
|
|
# 'References' headers
|
|
# PARAMETERS: $msg Hashref containing the messages
|
|
# $id Hashref containing message-id to message key
|
|
# links
|
|
# $silent Boolean determining whether to emit error
|
|
# messages
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: We want to build a structure where messages contain forward
|
|
# "child" links to other messages. So, given a message, we look
|
|
# for backward links in the 'In-Reply-To:' and 'References'
|
|
# headers, go to the referenced message(s) and make the current
|
|
# one a child. We don't do anything with multiple 'References',
|
|
# though we could.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub linkMessages {
|
|
my ( $msg, $id, $silent ) = @_;
|
|
|
|
my $irt;
|
|
|
|
foreach my $key ( sort( keys( %{$msg} ) ) ) {
|
|
$irt = $msg->{$key}->{'in-reply-to'};
|
|
|
|
if ( defined($irt) ) {
|
|
#
|
|
# There's an In-Reply-To: header
|
|
#
|
|
if ( defined( $id->{$irt} ) ) {
|
|
#
|
|
# The parent of this message is the key of the back reference and
|
|
# we add this key to the children of the parent of this message
|
|
#
|
|
$msg->{$key}->{parent} = $id->{$irt};
|
|
push(
|
|
@{ $msg->{ $msg->{$key}->{parent} }->{children} },
|
|
$id->{ $msg->{$key}->{'message-id'} }
|
|
);
|
|
}
|
|
else {
|
|
emit( $silent,
|
|
"Unable to find Message-ID: $irt (in message $key)\n" );
|
|
}
|
|
}
|
|
elsif ( defined( $msg->{$key}->{'references'} ) ) {
|
|
#
|
|
# There's no In-Reply-To: but we have References:, where the last one
|
|
# is the equivalent. We update the parent message
|
|
#
|
|
$ref = $msg->{$key}->{'references'}->[-1];
|
|
if ( defined($ref) && defined( $id->{$ref} ) ) {
|
|
#
|
|
# The parent of this message is the key of the back reference and
|
|
# we add this key to the children of the parent of this message
|
|
#
|
|
$msg->{$key}->{parent} = $id->{$ref};
|
|
push(
|
|
@{ $msg->{ $msg->{$key}->{parent} }->{children} },
|
|
$id->{ $msg->{$key}->{'message-id'} }
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: repairThreads
|
|
# PURPOSE: Repair broken threads where possible
|
|
# PARAMETERS: $mhash Hashref of all messages
|
|
# $silent Boolean determining whether to emit error
|
|
# messages
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: We look for messages with a subject beginning with 'Re:' which
|
|
# don't have any back links. These look like broken threads
|
|
# where the sender has not used their mail client's ability to
|
|
# generate an 'In-Reply-To:' and/or 'References' header. We try
|
|
# to find the thread root by looking for the subject in all of
|
|
# the messages and then make the linkages.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub repairThreads {
|
|
my ( $mhash, $silent ) = @_;
|
|
|
|
foreach my $key ( sort( keys( %{$mhash} ) ) ) {
|
|
if ( $mhash->{$key}->{subject} =~ /^Re:/ ) {
|
|
if (!( defined( $mhash->{$key}->{'in-reply-to'} )
|
|
|| defined( $mhash->{$key}->{'references'} )
|
|
)
|
|
)
|
|
{
|
|
#
|
|
# We have a message with a subject beginning with 'Re:' which
|
|
# doesn't have any back links. Looks like a broken thread.
|
|
# Find the thread root by looking for the subject.
|
|
#
|
|
my $parent = findParent( $mhash, $key );
|
|
|
|
#
|
|
# The parent has to be defined to be meaningful
|
|
#
|
|
if ( defined($parent) ) {
|
|
#
|
|
# Make this message's parent the message we found in two
|
|
# ways, via the 'parent' link and the 'in-reply-to'
|
|
# header.
|
|
#
|
|
$mhash->{$key}->{parent} = $parent;
|
|
$mhash->{$key}->{'in-reply-to'}
|
|
= $mhash->{$parent}->{'message-id'};
|
|
|
|
#
|
|
# Add this message to the parents children array, making
|
|
# sure it's sorted.
|
|
#
|
|
push( @{ $mhash->{$parent}->{children} }, $key );
|
|
$mhash->{$parent}->{children}
|
|
= [ sort( @{ $mhash->{$parent}->{children} } ) ];
|
|
|
|
#print "Parent:\n";
|
|
#dumpMessage( $mhash->{$parent} );
|
|
#print "Child:\n";
|
|
#dumpMessage( $mhash->{$key} );
|
|
}
|
|
else {
|
|
emit( $silent, "Couldn't find parent of message $key\n" );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: findParent
|
|
# PURPOSE: Given a particular element in the message hash find its parent
|
|
# by processing the subject
|
|
# PARAMETERS: $mhash Hashref of all messages
|
|
# $key Key into the message hash
|
|
# RETURNS: The key of the parent
|
|
# DESCRIPTION: Tries to join an "orphaned" message with its parent by looking
|
|
# for another instance of the subject. First the subject of this
|
|
# message must begin with "Re:". This string is removed in order
|
|
# to perform a search for the message this one is in reference
|
|
# to. The search is conducted through all of the stored messages
|
|
# and the matching message must not have a parent or any back
|
|
# references itself to be acceptable. This is a fairly vague set
|
|
# of criteria but it does work for a number of unlinked
|
|
# messages. [Note: The regex used in the search has been modified
|
|
# to be case insensitive having encountered an instance where
|
|
# someone reconstructed the subject line in different case.]
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub findParent {
|
|
my ( $mhash, $key ) = @_;
|
|
|
|
my $start_subj = $mhash->{$key}->{subject};
|
|
my $end_subj;
|
|
my $parent;
|
|
|
|
if ( $start_subj =~ /^Re:/ ) {
|
|
( $end_subj = $start_subj ) =~ s/^Re:\s*//;
|
|
foreach my $k ( keys( %{$mhash} ) ) {
|
|
#if ( $msg{$k}->{subject} eq $end_subj ) {
|
|
if ( $k ne $key
|
|
&& $msg{$k}->{subject} =~ /(?:^Re:\s*)?(?i)$end_subj/ )
|
|
{
|
|
if (!( defined( $msg{$k}->{'in-reply-to'} )
|
|
|| defined( $msg{$k}->{'references'} )
|
|
)
|
|
)
|
|
{
|
|
$parent = $k;
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
return $parent;
|
|
}
|
|
else {
|
|
#
|
|
# Doesn't look like a message with a parent
|
|
#
|
|
return; # implicit undef
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: buildThreadSummary
|
|
# PURPOSE: Build a summary of the thread roots
|
|
# PARAMETERS: $dt_from The from date as a DateTime object
|
|
# $dt_to The to date as a DateTime object
|
|
# $config Hashref of the configuration data
|
|
# $mhash Hashref of all messages
|
|
# $threads Hashref to contain the resulting thread data
|
|
# $silent Boolean determining whether to emit error
|
|
# messages
|
|
# RETURNS: Total number of messages in all eligible threads
|
|
# DESCRIPTION: We have built a structure containing all of the message
|
|
# threads we've seen. However, we're only interested in the
|
|
# threads for a certain period, so we need to work them out
|
|
# here. A thread root is a message which has no parents, and
|
|
# we're interested in those which were written in our time
|
|
# period. We collect the details we want for the final display
|
|
# and build a weblink to the thread on Gmane. We also count the
|
|
# number of messages per thread and the grand total. The
|
|
# reulting structure is what we'll pass to the template we use
|
|
# to generate the insertion for the Community News notes (and
|
|
# other stuff).
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub buildThreadSummary {
|
|
my ( $dt_from, $dt_to, $config, $mhash, $threads, $silent ) = @_;
|
|
|
|
my ( $count, $total );
|
|
|
|
#
|
|
# Use the UTC seconds for comparison
|
|
#
|
|
my $from = $dt_from->utc_rd_as_seconds();
|
|
my $to = $dt_to->utc_rd_as_seconds();
|
|
|
|
#
|
|
# Adjust the regex from the config file for parsing the stored links.
|
|
# Remove the '<>' enclosing it.
|
|
#
|
|
my $re = $config->{cache}->{regex};
|
|
$re =~ s/^<|>$//g;
|
|
$re = qr{$re};
|
|
|
|
#
|
|
# Find each thread root which has messages in the date range
|
|
#
|
|
$total = 0;
|
|
foreach my $key ( sort( keys( %{$mhash} ) ) ) {
|
|
if ( !defined( $mhash->{$key}->{parent} )
|
|
&& eligibleThread( $from, $to, $mhash, $key ) )
|
|
# && ( $key ge $from && $key lt $to ) )
|
|
{
|
|
#
|
|
# Stash the headers we want
|
|
#
|
|
foreach my $k (qw{ from to cc date subject archived-at }) {
|
|
$threads{$key}->{$k} = $mhash->{$key}->{$k};
|
|
}
|
|
|
|
#
|
|
# Make a weblink to the Gmane thread
|
|
# TODO What if the regex doesn't match?
|
|
#
|
|
if ( $mhash->{$key}->{'archived-at'} =~ /$re/ ) {
|
|
$threads{$key}->{thread}
|
|
= sprintf( $config->{gmane}->{template2}, $1 );
|
|
}
|
|
else {
|
|
emit( $silent,
|
|
"Unable to match the Gmane thread (in message $key)\n" );
|
|
}
|
|
|
|
#
|
|
# Count the messages in the thread and keep a grand total
|
|
#
|
|
$count = 0;
|
|
threadLength( $from, $to, $mhash, $key, \$count );
|
|
$threads{$key}->{count} = $count;
|
|
$total += $count;
|
|
}
|
|
}
|
|
|
|
return $total;
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: eligibleThread
|
|
# PURPOSE: Determines whether a given thread is eligible to be included
|
|
# in a date range
|
|
# PARAMETERS: $from The from date as UTC seconds
|
|
# $to The to date as UTC seconds
|
|
# $mhash Hashref of all messages
|
|
# $key Key into the message hash for this particular
|
|
# thread
|
|
# RETURNS: True (1) if the thread is eligible, otherwise false (0)
|
|
# DESCRIPTION: Checks the top level message (thread root) for elegibility
|
|
# (date in range) then recursively descends the thread checking
|
|
# each message in the same manner as it goes. The results are
|
|
# ORed together so that if any message is in range the entire
|
|
# thread is.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub eligibleThread {
|
|
my ( $from, $to, $mhash, $key ) = @_;
|
|
|
|
my $res = ( $key ge $from && $key lt $to );
|
|
|
|
foreach my $k ( @{ $mhash->{$key}->{children} } ) {
|
|
$res ||= ( $k ge $from && $k lt $to )
|
|
|| eligibleThread( $from, $to, $mhash, $k );
|
|
}
|
|
|
|
return $res;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: threadLength
|
|
# PURPOSE: Count the number of eligible messages in a thread
|
|
# PARAMETERS: $from The from date as UTC seconds
|
|
# $to The to date as UTC seconds
|
|
# $mhash Hashref of all messages
|
|
# $key Key into the message hash
|
|
# $len Scalar ref containing accumulated length
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Recursively descends through a thread (a tree of linked
|
|
# messages) counting the elements but only if they are in range.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub threadLength {
|
|
my ( $from, $to, $mhash, $key, $len ) = @_;
|
|
|
|
$$len++ if ( $key ge $from && $key lt $to );
|
|
|
|
foreach my $k ( @{ $mhash->{$key}->{children} } ) {
|
|
threadLength( $from, $to, $mhash, $k, $len );
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: printThread
|
|
# PURPOSE: To walk a message thread and print it
|
|
# PARAMETERS: $mhash Hashref of all messages
|
|
# $key Key into the message hash
|
|
# $level Integer denoting recursion level
|
|
# $html 1 for HTML output, 0 for text
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Recursively descend through a thread (a tree of linked
|
|
# messages) printing each one. If $html is true the message is
|
|
# printed in HTML, otherwise it's plain text with indentation.
|
|
# The level of indentation is controlled by $level which
|
|
# increments as the tree is recursed.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub printThread {
|
|
my ( $mhash, $key, $level, $html ) = @_;
|
|
|
|
$level = 0 unless $level;
|
|
$html = 0 unless $html;
|
|
|
|
printMessage( $mhash->{$key}, $level, $html );
|
|
|
|
foreach my $k ( @{ $mhash->{$key}->{children} } ) {
|
|
printThread( $mhash, $k, $level + 1, $html );
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: printMessage
|
|
# PURPOSE: To print a message for debug purposes
|
|
# PARAMETERS: $msg Hashref containing message attributes
|
|
# $level Integer for indenting
|
|
# $html 1 for HTML output, 0 for text
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Print a message. The details held consist only of a set of
|
|
# headers, and these are printed as a label and a value. If
|
|
# $html is true the output is HTML (an unnumbered list) and only
|
|
# headers with values are reported. If $html is false then plain
|
|
# text is used, empty values are reported and indentation is
|
|
# used, controlled by the value of $level. In plain text mode
|
|
# each item ends with a line of hyphens.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub printMessage {
|
|
my ( $msg, $level, $html ) = @_;
|
|
|
|
$level = 0 unless $level;
|
|
$html = 0 unless $html;
|
|
|
|
# TODO Don't do it this way, use the global definition since it's used
|
|
# elsewhere
|
|
state %tags;
|
|
%tags = (
|
|
'from' => 'From',
|
|
'to' => 'To',
|
|
'cc' => 'CC',
|
|
'date' => 'Date',
|
|
'subject' => 'Subject',
|
|
'archived-at' => 'Link',
|
|
);
|
|
|
|
print "<ul>" if $html;
|
|
|
|
foreach my $k (qw{ from to cc date subject archived-at }) {
|
|
if ($html) {
|
|
if ( !empty( $msg->{$k} ) ) {
|
|
printf "<li>%-8s %s</li>\n", "$tags{$k}:",
|
|
coalesce( $msg->{$k}, '' );
|
|
}
|
|
}
|
|
else {
|
|
printf "%s%-8s %s\n", ' ' x $level, "$tags{$k}:",
|
|
coalesce( $msg->{$k}, '' );
|
|
}
|
|
}
|
|
if ($html) {
|
|
print "</ul>\n";
|
|
}
|
|
else {
|
|
print '-' x 80, "\n";
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: dumpMessage
|
|
# PURPOSE: To print all parts of a message hash for debugging
|
|
# PARAMETERS: $msg Hashref containing message attributes
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Produces a dump of a message for debugging purposes.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub dumpMessage {
|
|
my ($msg) = @_;
|
|
|
|
foreach my $k ( sort( keys( %{$msg} ) ) ) {
|
|
printf "%12s: %s\n", $k, coalesce( $msg->{$k}, '' );
|
|
}
|
|
print '-' x 80, "\n";
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: emit
|
|
# PURPOSE: Print text on STDERR unless silent mode has been selected
|
|
# PARAMETERS: - Boolean indicating whether to be silent or not
|
|
# - list of arguments to 'print'
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
|
|
# a message to STDERR depending on a boolean. We need this to be
|
|
# able to make the script silent when the -silent option is
|
|
# selected
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub emit {
|
|
unless (shift) {
|
|
print STDERR @_;
|
|
}
|
|
return;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: Options
|
|
# PURPOSE: Processes command-line options
|
|
# PARAMETERS: $optref Hash reference to hold the options
|
|
# RETURNS: Undef
|
|
# DESCRIPTION:
|
|
# THROWS: no exceptions
|
|
# COMMENTS: none
|
|
# SEE ALSO: n/a
|
|
#===============================================================================
|
|
sub Options {
|
|
my ($optref) = @_;
|
|
|
|
my @options = (
|
|
"help", "debug=i", "out=s", "template=s",
|
|
"silent!", "checknew!", "initialise=i", "from=s",
|
|
"to=s",
|
|
);
|
|
|
|
if ( !GetOptions( $optref, @options ) ) {
|
|
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
__END__
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
# Application Documentation
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
#{{{
|
|
|
|
|
|
=head1 NAME
|
|
|
|
summarise_mail - Generate a summary of messages on the HPR mailing list
|
|
|
|
=head1 VERSION
|
|
|
|
This documentation refers to summarise_mail version 0.0.5
|
|
|
|
|
|
=head1 USAGE
|
|
|
|
./summarise_mail [-help] [-debug=N] [-from=DATE] [-to=DATE] [-out=FILE]
|
|
[-template=FILE] [-[no]silent] [-[no]checknew] [-initialise=N]
|
|
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 8
|
|
|
|
=item B<-help>
|
|
|
|
Prints a brief help message describing the usage of the program, and then exits.
|
|
|
|
=item B<-initialise=N>
|
|
|
|
This option builds a new mailbox cache. The value B<N> defines the starting
|
|
message to be downloaded from the Gmane site. The number defaults to 1, though
|
|
this is not advisable since it potentially overloads the Gmane server. The
|
|
number of messages downloaded is limited by the B<lookahead> value in the
|
|
configuration file (see below).
|
|
|
|
If a cache file already exists the script will not run. It is necessary to
|
|
delete this file before initialising a new one.
|
|
|
|
After the cache is initialised the script will process the messages it
|
|
contains.
|
|
|
|
The B<-initialise=N> and the B<-[no]checknew> options are mutually exclusive.
|
|
|
|
=item B<-[no]checknew>
|
|
|
|
This option defines whether the script will check Gmane for updates to the
|
|
mailing list. If updates are found then they will be incorporated into the
|
|
cached mailbox file.
|
|
|
|
If omitted no check is made (B<-nochecknew>).
|
|
|
|
The B<-initialise=N> and the B<-[no]checknew> options are mutually exclusive.
|
|
|
|
=item B<-from=DATE>
|
|
|
|
Specifies the starting date for the mail summary.
|
|
|
|
The date format should be B<DD-Mon-YYYY> (e.g. 12-Jun-2014), B<DD-MM-YYYY>
|
|
(e.g. 12-06-2014) or B<YYYY-MM-DD> (e.g. 2014-06-12).
|
|
|
|
If this option is omitted the current date is used.
|
|
|
|
=item B<-to=DATE>
|
|
|
|
Specifies the ending date for the mail summary.
|
|
|
|
The date format should be B<DD-Mon-YYYY> (e.g. 12-Jun-2014), B<DD-MM-YYYY>
|
|
(e.g. 12-06-2014) or B<YYYY-MM-DD> (e.g. 2014-06-12).
|
|
|
|
If this option is omitted the last day of the month defined by the B<-from>
|
|
date is used.
|
|
|
|
=item B<-template=FILE>
|
|
|
|
This option defines the template that will be used to format the report
|
|
generated by the script.
|
|
|
|
If the option is not provided, the script will use a template called
|
|
B<mailnote_template.tpl> in the directory which holds the script. This
|
|
template generates HTML suitable for incorporation into the Community News
|
|
show notes.
|
|
|
|
=item B<-out=FILE>
|
|
|
|
This option defines the output file for the report. If the option is omitted
|
|
the report is written to STDOUT, allowing it to be redirected if required.
|
|
|
|
The output file name may contain the characters 'B<%s>'. This denotes the point
|
|
at which the year and month in the format B<YYYY-MM> are inserted. For example
|
|
if the script is being run for July 2014 the option:
|
|
|
|
-out=mailreport_%s.html
|
|
|
|
will cause the generation of the file:
|
|
|
|
mailreport_2014-07.html
|
|
|
|
=item B<-[no]silent>
|
|
|
|
This option controls whether the script reports minimal details of what it is
|
|
doing to STDERR. If the option is omitted the report is generated (B<-nosilent>).
|
|
|
|
The script reports: the starting and ending dates it is working on and the name of the output file
|
|
(if appropriate).
|
|
|
|
=item B<-debug=N>
|
|
|
|
Runs the script in debug mode if the value of B<N> is greater than 0. The
|
|
default setting is zero.
|
|
|
|
In debug mode the script reports various pieces of information about the
|
|
parsing of the mail messages and building of threads.
|
|
|
|
=back
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
=head2 Overview
|
|
|
|
This script generates a summary of the mail sent to the HPR mailing list over
|
|
a period.
|
|
|
|
Messages to the mailing list are copied to the free B<Gmane> service and the
|
|
script uses this to prepare its report. The Gmane service offers
|
|
a downloadable interface at
|
|
http://download.gmane.org/gmane.network.syndication.podcast.hacker-public-radio
|
|
where selected messages can be collected in B<MBOX> format.
|
|
|
|
=head2 Methodology
|
|
|
|
The script operates in three phases:
|
|
|
|
=over 4
|
|
|
|
=item -
|
|
|
|
Messages are collected from Gmane and stored in a local cache (a file in MBOX
|
|
format). This is expected to be done once, and thereafter new messages are
|
|
appended to this file.
|
|
|
|
=item -
|
|
|
|
New messages are appended to the cache file. This is done by determining the
|
|
last message in the cache then requesting the next block of messages from
|
|
Gmane. The downloaded messages are then added to the file.
|
|
|
|
=item -
|
|
|
|
The cache is processed to generate the report. Usually the script is given
|
|
a start date (and optional end date, though this defaults to the end of the
|
|
month in the start date).
|
|
|
|
The algorithm is as follows:
|
|
|
|
=over 4
|
|
|
|
=item -
|
|
|
|
The script parses out the important headers of all messages in the cache.
|
|
|
|
=item -
|
|
|
|
The script walks the header structure in time order joining messages based on
|
|
their 'In-reference-to:' and 'References:' headers. Each message gets its
|
|
parent recorded as a pointer, and its children as an array of pointers. If
|
|
a reference is to a message that is not in the cache then it is not recorded.
|
|
|
|
=item -
|
|
|
|
At this point all messages without a parent are either the thread roots or
|
|
cases where someone has not replied properly. The script performs a further
|
|
join when an "orphaned" message has the same subject as another with a 'Re:'
|
|
on the front.
|
|
|
|
=item -
|
|
|
|
Now all messages without parents are the best that can be done to detect
|
|
thread roots. These are used to make the report.
|
|
|
|
=item -
|
|
|
|
The messages that cannot be threaded are those where the sender does not use
|
|
'Reply' or where they reply to a digest. Digests are not threaded because they
|
|
are blobs of multiple messages.
|
|
|
|
=item -
|
|
|
|
Some messages may be threaded erroneously. This happens when a sender, rather
|
|
than generating a new message, replies to an existing message but changes the
|
|
'Subject:' line. There is not much that can be done to correct this.
|
|
|
|
=back
|
|
|
|
=back
|
|
|
|
=head2 Report Template
|
|
|
|
The script uses the B<Template> module to format the report. See the default
|
|
template B<mailnote_template.tpl> for an example.
|
|
|
|
The template is passed two variables:
|
|
|
|
=over 4
|
|
|
|
=item B<total>
|
|
|
|
This is a scalar variable containing a number: the count of messages in the
|
|
specified range.
|
|
|
|
=item B<threads>
|
|
|
|
This is a hash indexed by the message date converted to UTC date then to the
|
|
number of seconds since the Unix epoch. Each hash value is a hashref
|
|
containing the following fields:
|
|
|
|
Field Contents
|
|
----- --------
|
|
from The contents of the 'From:' header
|
|
to The contents of the 'To:' header
|
|
cc The contents of the 'Cc:' header
|
|
date The contents of the 'Date:' header
|
|
subject The contents of the 'Subject:' header
|
|
archived-at The contents of the 'Archived-at:' header (the Gmane
|
|
permalink URL)
|
|
count The number of messages in the thread
|
|
|
|
These messages are the roots of all of the detectable threads in the period.
|
|
Sorting them by the main hash key results in the threads being ordered by
|
|
their timestamp.
|
|
|
|
=back
|
|
|
|
=head1 DIAGNOSTICS
|
|
|
|
A list of every error and warning message that the application can generate
|
|
(even the ones that will "never happen"), with a full explanation of each
|
|
problem, one or more likely causes, and any suggested remedies. If the
|
|
application generates exit status codes (e.g. under Unix) then list the exit
|
|
status associated with each error.
|
|
|
|
|
|
=head1 CONFIGURATION AND ENVIRONMENT
|
|
|
|
The script obtains the information it requires to find the mail details from
|
|
B<Gmane> from a configuration file. The name of the file it expects is
|
|
B<.summarise_mail.cfg> in the
|
|
directory holding the script. To change this will require changing the script.
|
|
|
|
The configuration file format is as follows:
|
|
|
|
<gmane>
|
|
url = http://download.gmane.org/gmane.network.syndication.podcast.hacker-public-radio
|
|
template = "$url/%d/%d"
|
|
lookahead = 100
|
|
</gmane>
|
|
|
|
<cache>
|
|
directory = /home/dave/Community_News/mail_cache
|
|
filename = gmane.mbox
|
|
regex = "<http://permalink\.gmane\.org/gmane\.network\.syndication\.podcast\.hacker-public-radio/(\d+)>"
|
|
</cache>
|
|
|
|
The elements of this configuration file are explained below. First the
|
|
B<gmane> section:
|
|
|
|
=over 4
|
|
|
|
=item B<url>
|
|
|
|
This is the base URL on the Gmane site. It is used to construct URLs for
|
|
collecting mail messages. This should not require any changes.
|
|
|
|
=item B<template>
|
|
|
|
This is a template which is used to generate the actual URL used to download
|
|
messages. The script replaces each '%d' with a number. The first is the
|
|
starting message number, and the second the ending number plus one.
|
|
|
|
This should not require any changes.
|
|
|
|
=item B<lookahead>
|
|
|
|
This defines the maximum number of messages to be collected. This number is
|
|
added to the first number used when contructing the URL from the B<template>
|
|
above. Normally just a few messages are downloaded at a time, but when the
|
|
script is used to initialise the mailbox cache file this number of messages may
|
|
be downloaded.
|
|
|
|
This value can be changed if desired, but it is set at 100 in order to avoid
|
|
overloading the Gmane servers.
|
|
|
|
=back
|
|
|
|
The B<cache> section contains the following:
|
|
|
|
=over 4
|
|
|
|
=item B<directory>
|
|
|
|
This defines the full path to the directory holding the mail cache file.
|
|
|
|
This may be changed to reflect a change of location.
|
|
|
|
=item B<filename>
|
|
|
|
This defines the name of the mail cache. It is expected to be found in the
|
|
directory defined above.
|
|
|
|
This may be changed to reflect a change of name.
|
|
|
|
=item B<regex>
|
|
|
|
Individual messages in the cache each hold an URL which provides a permanent
|
|
link to the message on the Gmane site. The script uses the regular expression
|
|
defined here to collect the message number from this URL. This is how the
|
|
script determines the number of the oldest message in the cache when looking
|
|
for new messages.
|
|
|
|
This expression should only be changed with great caution.
|
|
|
|
=back
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
Getopt::Long
|
|
Pod::Usage
|
|
Config::General
|
|
LWP::UserAgent
|
|
Mail::Box::Manager
|
|
Date::Parse
|
|
Date::Calc
|
|
DateTime
|
|
DateTime::TimeZone
|
|
Template
|
|
Template::Filters
|
|
Data::Dumper
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
|
|
|
There are no known bugs in this module.
|
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
|
|
Patches are welcome.
|
|
|
|
=head1 AUTHOR
|
|
|
|
Dave Morriss (Dave.Morriss@gmail.com)
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT
|
|
|
|
Copyright (c) 2014, 2015 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
|
|
|
|
This module is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
=cut
|
|
|
|
#}}}
|
|
|
|
# [zo to open fold, zc to close]
|
|
|
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|