hpr-tools/Community_News/summarise_mail

1711 lines
56 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env perl
#===============================================================================
#
# FILE: summarise_mail
#
# USAGE: ./summarise_mail [-help] [-debug=N] [-from=DATE] [-to=DATE]
# [-out=FILE] [-template=FILE] [-[no]silent] [-[no]checknew]
# [-initialise=N]
#
# DESCRIPTION: Generates a summary of the HPR mailing list for the Community
# News
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.5
# CREATED: 2015-02-15 15:06:11
# REVISION: 2015-10-04 20:21:44
#
#-------------------------------------------------------------------------------
# TODO
#
# - Add a means of reporting all of the message bodies in a thread. They
# ideally need formatting and should be printable so that a template can
# generate plain text versions for use by the Community News hosts when
# reviewing the last month's messages. This is quite a large task but maybe
# there are mail formatters that can do it.
#
#===============================================================================
use 5.010;
use strict;
use warnings;
use utf8;
use Getopt::Long;
use Pod::Usage;
use Config::General;
use LWP::UserAgent;
use Mail::Box::Manager;
use Date::Parse;
use Date::Calc qw{:all};
use DateTime;
use DateTime::TimeZone;
use Template;
use Template::Filters;
Template::Filters->use_html_entities; # Use HTML::Entities in the template
use Data::Dumper;
#
# Version number (manually incremented)
#
our $VERSION = '0.0.5';
#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;
#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Community_News";
my $configfile = "$basedir/.$PROG.cfg";
my $bpfile = "$basedir/mailnote_template.tpl";
my (%dtargs, $dt, $tz, $rd, $mid,
$irt, $ref, $date, @parsed, %msg,
%id, $dt_from, $dt_to, %threads, $total,
);
my %tags = (
'from' => 'From',
'to' => 'To',
'cc' => 'CC',
'date' => 'Date',
'subject' => 'Subject',
'archived-at' => 'Link',
);
#
# Enable Unicode mode
#
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
#
# Load configuration data
#
my $conf = Config::General->new(
-ConfigFile => $configfile,
-InterPolateVars => 1,
-ExtendedAccess => 1,
) or die "Unable to open $configfile\n";
my %config = $conf->getall() or die "Unable to process $configfile\n";
#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );
#
# Default help
#
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 )
if ( $options{'help'} );
#
# Collect options
#
my $debug = ( defined( $options{debug} ) ? $options{debug} : 0 );
my $initialise = ( defined( $options{initialise} ) ? 1 : 0 );
my $start = $options{initialise};
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
my $checknew = ( defined( $options{checknew} ) ? $options{checknew} : 0 );
my $outfile = $options{out};
my @fromdate;
if ( defined( $options{from} ) ) {
@fromdate = parseDate( $options{from} );
}
my @todate;
if ( defined( $options{to} ) ) {
@todate = parseDate( $options{to} );
}
my $template
= ( defined( $options{template} ) ? $options{template} : $bpfile );
#
# Check for illegal option combinations
#
die "Use only one of -initialise and -checknew\n"
if ( $initialise && $checknew );
#
# Sanity checks
#
die "Error: Unable to find template $template\n" unless -r $template;
#-------------------------------------------------------------------------------
# Default the from date to today and compute a to date if one wasn't given
#-------------------------------------------------------------------------------
if ( scalar(@fromdate) == 0 ) {
@fromdate = Today();
}
if ( scalar(@todate) == 0 ) {
@todate = Add_Delta_Days( @fromdate[ 0, 1 ],
1, Days_in_Month( @fromdate[ 0, 1 ] ) );
}
#
# We need DateTime dates for comparison
#
emit( $silent, "From: ", ISO8601_Date(@fromdate), "\n" );
emit( $silent, "To: ", ISO8601_Date(@todate), "\n" );
$dt_from = calcToDT(@fromdate);
$dt_to = calcToDT(@todate);
#-------------------------------------------------------------------------------
# Open the output file (or STDOUT) - we may need the date to do it
#-------------------------------------------------------------------------------
my $outfh;
if ($outfile) {
$outfile = sprintf( $outfile,
sprintf( "%d-%02d", $dt_from->year, $dt_from->month ) )
if ( $outfile =~ /%s/ );
emit( $silent, "Output: ", $outfile, "\n" );
open( $outfh, ">:encoding(UTF-8)", $outfile )
or die "Unable to open $outfile for writing: $!";
}
else {
open( $outfh, ">&", \*STDOUT )
or die "Unable to initialise for writing: $!";
}
#-------------------------------------------------------------------------------
# Check the configuration file
#-------------------------------------------------------------------------------
checkConfig( \%config );
#
# The cached mail file
#
my $mailfile = join( '/', @{ $config{cache} }{ 'directory', 'filename' } );
#-------------------------------------------------------------------------------
# If asked to initialise the cache check it makes sense, otherwise look to see
# if we're to update an existing cache
#-------------------------------------------------------------------------------
if ($initialise) {
if ( -e $mailfile ) {
die "The mail cache $mailfile exists.\n"
. "Delete it before initialising\n";
}
unless (initialiseCache( \%config, $start )) {
print "Failed to initialise the cache; can't continue\n";
exit;
}
}
elsif ($checknew) {
unless ( -e $mailfile ) {
die "The mail cache $mailfile does not exist\n"
. "Initialise it before checking for updates\n";
}
unless (updateCache( \%config )) {
print "Failed to update the cache; continuing with old mail\n";
}
}
#
# Open the mail file (as a folder in MBOX format)
#
my $mgr = Mail::Box::Manager->new;
my $folder = $mgr->open($mailfile);
#-------------------------------------------------------------------------------
# Walk through the messages in the folder collecting message details into
# a hash structure indexed by the UTC timestamp.
#-------------------------------------------------------------------------------
foreach my $message ( $folder->messages ) {
#
# TODO This doesn't work for a reason I don't understand. Needs to be
# fixed
#
#$message->unfold();
#
# Parse and convert the message date. The @parsed array contains
# $ss,$mm,$hh,$day,$month,$year,$zone. The year and month need adjustment.
# The timezone value is number of seconds from UTC.
#
$date = $message->get('Date');
@parsed = strptime($date);
$parsed[5] += 1900;
$parsed[4] += 1;
#
# Convert the parsed date to a DateTime
#
@dtargs{ 'second', 'minute', 'hour', 'day', 'month', 'year' } = (@parsed);
$dt = DateTime->new(%dtargs);
$tz = DateTime::TimeZone->new(
name => DateTime::TimeZone->offset_as_string( $parsed[6] ) );
$dt->set_time_zone($tz);
#
# Stash mail parameters
#
$rd = $dt->utc_rd_as_seconds();
$mid = formatID( $message->get('Message-ID'), 0 );
$irt = formatID( $message->get('In-Reply-To'), 1 );
my @from = $message->from;
my @to = $message->to;
my @cc = $message->cc;
$msg{$rd} = {
# '_date' => $date,
# '_from' => \@ffrom,
# '_to' => \@fto,
# '_cc' => \@fcc,
'_mid' => $message->get('Message-ID'),
'_irt' => $message->get('In-Reply-To'),
'_rdate' => $rd,
'date' => $dt->ymd . ' ' . $dt->hms . ' ' . $dt->time_zone->name,
'from' => formatEmail( \@from ),
'to' => formatEmail( \@to ),
'cc' => formatEmail( \@cc ),
'subject' => $message->get('Subject'),
'archived-at' => trimHeader( $message->get('Archived-At') ),
'message-id' => $mid,
'in-reply-to' => $irt,
'references' => formatReferences( $message->get('References') ),
'parent' => undef,
'children' => [],
};
#
# Stash the message id for easier linking
#
$id{$mid} = $rd;
}
$folder->close;
#-------------------------------------------------------------------------------
# Link the messages according to their internal back-references. Rescue any
# broken threads caused by people not using their mail clients properly.
#-------------------------------------------------------------------------------
linkMessages( \%msg, \%id, ( $debug == 0 ) );
repairThreads( \%msg, ( $debug == 0 ) );
#-------------------------------------------------------------------------------
# Generate the hash of message thread details that start in the selected
# period. Prepare the template and generate the final document
#-------------------------------------------------------------------------------
$total = buildThreadSummary( $dt_from, $dt_to, \%config, \%msg, \%threads,
( $debug == 0 ) );
my $tt = Template->new(
{ ABSOLUTE => 1,
ENCODING => 'utf8',
INCLUDE_PATH => $basedir,
}
);
my $vars = {
total => $total,
threads => \%threads,
};
my $document;
$tt->process( $template, $vars, \$document, { binmode => ':utf8' } )
|| die $tt->error(), "\n";
print $outfh $document;
close($outfh);
exit;
#=== FUNCTION ================================================================
# NAME: checkConfig
# PURPOSE: Check the configuration, aborting if it's messed up and
# missing necessary items
# PARAMETERS: $config configuration hash
# RETURNS: Nothing
# DESCRIPTION: Checks that all of the required items are present in the
# $config hash. These have been parsed from the configuration
# file usually called '.summarise_mail.cfg'. It only performs
# existence checks, nothing more fancy.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub checkConfig {
my ($config) = @_;
die "Configuration file problem\n"
unless ( defined( $config->{gmane}->{url} )
&& defined( $config->{gmane}->{template1} )
&& defined( $config->{gmane}->{thread} )
&& defined( $config->{gmane}->{template2} )
&& defined( $config->{gmane}->{lookahead} )
&& defined( $config->{cache}->{directory} )
&& defined( $config->{cache}->{filename} )
&& defined( $config->{cache}->{regex} ) );
return;
}
#=== FUNCTION ================================================================
# NAME: initialiseCache
# PURPOSE: Load a new cache from Gmane
# PARAMETERS: $config Configuration hashref
# $start Start number to download
# RETURNS: If the GET succeeded then 1 otherwise 0
# DESCRIPTION: Creates a new cache file of mail downloaded from Gmane. The
# file is in MBOX format but has had Gmane's obscuration
# algorithms applied to it, messing up certain elements like
# message-ids. The download begins with message $start and
# finishes with $start+LOOKAHEAD where the lookahead value is
# defined in the configuration file; usually 100.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub initialiseCache {
my ( $config, $start ) = @_;
#
# How much to look ahead in Gmane; defined in the configuration
#
my $lookahead = $config->{gmane}->{lookahead};
#
# The name of the mail cache and its directory are in the configuration
#
my $mailfile
= join( '/', @{ $config->{cache} }{ 'directory', 'filename' } );
#
# Make the URL for download
#
my $url = sprintf( $config->{gmane}->{template1},
$start, $start + $lookahead + 1 );
print "URL: $url\n";
#
# Set up the HTTP GET
#
my $ua = LWP::UserAgent->new;
$ua->agent('HPR-Agent/0.1');
$ua->timeout(10);
#
# Get the data (if any)
#
my $response = $ua->get($url);
if ( $response->is_success ) {
#
# The GET succeeded, see what came back
#
if ( length( ${ $response->content_ref } ) > 0 ) {
#
# We got some new data. Append it to the rest
#
print "Mail messages found\n";
open( my $fh, ">", $mailfile )
or die "Unable to open $mailfile for writing\n";
print $fh $response->decoded_content;
close($fh);
}
else {
print "No messages found\n";
}
return 1;
}
else {
#
# The GET failed in a nasty way
#
warn $response->status_line;
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: updateCache
# PURPOSE: Get the latest mail messages from Gmane and save them in the
# cache
# PARAMETERS: $config configuration hash
# RETURNS: If the GET succeeded then 1 otherwise 0
# DESCRIPTION: Adds more messages downloaded from Gmane to the end of the
# current cache file. It needs to work out the number of the
# last message in the cache, which it does with a regex, looking
# at 'Archived-At:' headers. Then it adds the lookahead value
# from the configuration file to that and downloads a maximum of
# that number of messages using the Gmane API. These messages
# are appended to the existing MBOX file. There is no means of
# expiring older messages, and perhaps there should be.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub updateCache {
my ($config) = @_;
#
# How much to look ahead in Gmane; defined in the configuration
#
my $lookahead = $config->{gmane}->{lookahead};
#
# The regex to parse the permalink URL is in the configuration
#
my $re = qr{$config->{cache}->{regex}};
#
# The name of the mail cache and its directory are in the configuration
#
my $mailfile
= join( '/', @{ $config->{cache} }{ 'directory', 'filename' } );
#
# Open the mail file
#
my $mgr = Mail::Box::Manager->new;
my $folder = $mgr->open($mailfile);
#
# Walk all the messages, parsing the permalink URLs to get the start and
# end
#
my $plink;
my ( $msgno, $start, $end ) = ( 0, 0, 0 );
foreach my $message ( $folder->messages ) {
$plink = $message->get('Archived-At');
if ( ($msgno) = ( $plink =~ /$re/ ) ) {
if ( $start eq 0 ) {
$start = $msgno;
}
else {
$end = $msgno;
}
}
}
$folder->close;
#
# Build the new URL to get more messages, starting from the last one we
# have plus 1 for $lookahead+1 messages
#
my $url = sprintf( $config->{gmane}->{template1},
$end + 1, $end + $lookahead + 1 );
print "URL: $url\n";
#
# Set up the HTTP GET
#
my $ua = LWP::UserAgent->new;
$ua->agent('HPR-Agent/0.1');
$ua->timeout(10);
#
# Get the data (if any)
#
my $response = $ua->get($url);
if ( $response->is_success ) {
#
# The GET succeeded, see what came back
#
if ( length( ${ $response->content_ref } ) > 0 ) {
#
# We got some new data. Append it to the rest
#
print "New messages found\n";
open( my $fh, ">>", $mailfile )
or die "Unable to open $mailfile for appending\n";
print $fh $response->decoded_content;
close($fh);
}
else {
print "No new messages\n";
}
return 1;
}
else {
#
# The GET failed in a nasty way
#
warn $response->status_line;
return 0;
}
}
#=== FUNCTION ================================================================
# NAME: parseDate
# PURPOSE: Parse and check a date
# PARAMETERS: $date Date string
# RETURNS: Date::Calc date or 'undef' if $date is undefined
# DESCRIPTION: Parses a date string using 'strptime' and converts it to
# a Date::Calc date list
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub parseDate {
my ($date) = @_;
if ( defined($date) ) {
#
# Parse and format the date
#
my @parsed = strptime($date);
@parsed[ 4, 5 ] = ( $parsed[4] + 1, $parsed[5] + 1900 );
return ( @parsed[ 5, 4, 3 ] );
}
else {
return; # implicit undef
}
}
#=== FUNCTION ================================================================
# NAME: ISO8601_Date
# PURPOSE: Format a Date::Calc date in ISO8601 format
# PARAMETERS: @date - a date in the Date::Calc format
# RETURNS: Text string containing a YYYY-MM-DD date
# DESCRIPTION: Just a convenience to allow a simple call like
# $str = ISO8601_Date(@date)
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub ISO8601_Date {
my (@date) = (@_)[ 0, 1, 2 ];
if ( check_date(@date) ) {
return sprintf( "%04d-%02d-%02d", @date );
}
else {
return "*Invalid Date*";
}
}
#=== FUNCTION ================================================================
# NAME: calcToDT
# PURPOSE: Convert a Date::Calc date to a UTC DateTime date
# PARAMETERS: $date Date::Calc date
# RETURNS: A DateTime date in the UTC timezone
# DESCRIPTION: Reformats a Date::Calc date to a DateTime date based on UTC
# (for date comparison)
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub calcToDT {
my (@date) = (@_)[ 0, 1, 2 ];
#
# Transfer Date::Calc values into a hash for initialising a DateTime
# object.
#
my ( %dtargs, $dt );
@dtargs{ 'year', 'month', 'day', 'time_zone' } = ( @date, 'UTC' );
$dt = DateTime->new(%dtargs);
return $dt;
}
#=== FUNCTION ================================================================
# NAME: formatEmail
# PURPOSE: Trims an array of anonymised Gmane email addresss in
# Mail::Address objects
# PARAMETERS: $remails The array of Mail::Address objects
# RETURNS: An array containing the trimmed addresses formatted as text
# DESCRIPTION: These addresses look like "Ken Fallon
# <ken-sq8tSfcSOvWzQB+pC5nmwQ@public.gmane.org>". The display
# name part (Ken Fallon) is anonymised if it contains an
# address-like specification. The local-part of the address is
# anonymised and the domain replaced by a Gmane domain. The
# front part of the local-part is unaffected. We want to build
# an address from the display name (if there is one), and the
# front of the local-part. So the above becomes:
# "Ken Fallon <ken@...>"
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub formatEmail {
my ($remails) = @_;
my ( $phrase, $address, $comment, @results );
foreach my $email ( @{$remails} ) {
#
# Parse out the components
#
( $phrase, $address, $comment )
= ( $email->phrase, $email->address, $email->comment );
#
# Assume Gmane obscured any email address wherever it is. Strip out
# the obscuring part to make it more readable
#
foreach my $item ( $phrase, $address, $comment ) {
if ( $item =~ /^"?([^-]+).+@.+$/ ) {
$item = "$1\@...";
}
}
#
# Let the module reassemble the address in the right format, and add
# it to the result list
#
my $obj = Mail::Address->new( $phrase, $address, $comment );
push( @results, $obj->format );
}
return \@results;
}
#=== FUNCTION ================================================================
# NAME: formatID
# PURPOSE: Reformat a Message-ID: or In-Reply-To: value to undo the
# effects of Gmane's mail address obscuration which is applied
# to one but not the other.
# PARAMETERS: $header The header for processing
# $gmane Boolean to indicate whether we need to strip
# a Gmane obscuration part
# RETURNS: The processed header
# DESCRIPTION: If $gmane is true recognises the possible Gmane obscuration
# patterns and extracts the important elements from the ID,
# otherwise just strips the '@' and what follows. This is
# because Gmane (presumably) detects the '@' in every header
# type, assumes it's an address and applies its obscuring
# technique to it. Shame they apply such a sledgehammer
# algorithm.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub formatID {
my ( $header, $gmane ) = @_;
return $header unless defined($header);
$header =~ s/(^<|>$)//g;
if ($gmane) {
if ( $header =~ /^(.+)-[^-]+@.+$/ ) {
$header = "$1";
}
elsif ( $header =~ /^(.+)@.+$/ ) {
$header = "$1";
}
}
else {
$header =~ s/@.+$//;
}
return $header;
}
#=== FUNCTION ================================================================
# NAME: formatReferences
# PURPOSE: Reformat a list of references in the References: header
# PARAMETERS: $header The header for processing
# RETURNS: The processed header as an arrayref
# DESCRIPTION: Removes the '<>' around each reference, but also removes the
# bit after and including the '@' since we need to compare it
# with the ID values we have had to tidy after Gmane got to
# them.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub formatReferences {
my ($header) = @_;
return $header unless defined($header);
my ( @refs, @results );
@refs = split( /\s+/, $header );
foreach my $ref (@refs) {
$ref =~ s/(^<|>$)//g;
$ref =~ s/@.+$//;
push( @results, $ref );
}
return \@results;
}
#=== FUNCTION ================================================================
# NAME: trimHeader
# PURPOSE: Trims an arbitrary header which is enclosed in "<..>"
# PARAMETERS: $header The header for trimming
# RETURNS: The trimmed header
# DESCRIPTION: Certain Gmane headers like "Message-Id" are enclosed in "<>"
# marks. We want to strip these and return the result
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub trimHeader {
my ($header) = @_;
return $header unless defined($header);
$header =~ s/(^<|>$)//g;
return $header;
}
#=== FUNCTION ================================================================
# NAME: coalesce
# PURPOSE: To find the first defined argument and return it
# PARAMETERS: Arbitrary number of arguments
# RETURNS: The first defined argument or undef if there are none
# DESCRIPTION: Modelled on the SQL function of the same name. It takes a list
# of arguments, scans it for the first one that is not undefined
# and returns it. If an argument is defined and it's an arrayref
# then the referenced array is returned comma-delimited. This
# allows calls such as "coalesce($var,'undef')" which returns
# the value of $var if it's defined, and 'undef' if not and
# doesn't break anything along the way.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub coalesce {
foreach (@_) {
if ( defined($_) ) {
if ( ref($_) eq 'ARRAY' ) {
return join( ',', @{$_} );
}
else {
return $_;
}
}
}
return; # implicit undef
}
#=== FUNCTION ================================================================
# NAME: empty
# PURPOSE: Determine whether the argument contains data
# PARAMETERS: $arg Argument
# RETURNS: 0 if data found, otherwise 1
# DESCRIPTION: If an argument is defined and it's an arrayref then the array
# must contain elements otherwise it's regarded as empty. An
# undefined argument is empty of course.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub empty {
my ($arg) = @_;
if ( defined($arg) ) {
if ( ref($arg) eq 'ARRAY' ) {
return scalar(@$arg) == 0;
}
else {
return 0;
}
}
else {
return 1;
}
}
#=== FUNCTION ================================================================
# NAME: linkMessages
# PURPOSE: Link the collected messages using the 'In-Reply-To:' and
# 'References' headers
# PARAMETERS: $msg Hashref containing the messages
# $id Hashref containing message-id to message key
# links
# $silent Boolean determining whether to emit error
# messages
# RETURNS: Nothing
# DESCRIPTION: We want to build a structure where messages contain forward
# "child" links to other messages. So, given a message, we look
# for backward links in the 'In-Reply-To:' and 'References'
# headers, go to the referenced message(s) and make the current
# one a child. We don't do anything with multiple 'References',
# though we could.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub linkMessages {
my ( $msg, $id, $silent ) = @_;
my $irt;
foreach my $key ( sort( keys( %{$msg} ) ) ) {
$irt = $msg->{$key}->{'in-reply-to'};
if ( defined($irt) ) {
#
# There's an In-Reply-To: header
#
if ( defined( $id->{$irt} ) ) {
#
# The parent of this message is the key of the back reference and
# we add this key to the children of the parent of this message
#
$msg->{$key}->{parent} = $id->{$irt};
push(
@{ $msg->{ $msg->{$key}->{parent} }->{children} },
$id->{ $msg->{$key}->{'message-id'} }
);
}
else {
emit( $silent,
"Unable to find Message-ID: $irt (in message $key)\n" );
}
}
elsif ( defined( $msg->{$key}->{'references'} ) ) {
#
# There's no In-Reply-To: but we have References:, where the last one
# is the equivalent. We update the parent message
#
$ref = $msg->{$key}->{'references'}->[-1];
if ( defined($ref) && defined( $id->{$ref} ) ) {
#
# The parent of this message is the key of the back reference and
# we add this key to the children of the parent of this message
#
$msg->{$key}->{parent} = $id->{$ref};
push(
@{ $msg->{ $msg->{$key}->{parent} }->{children} },
$id->{ $msg->{$key}->{'message-id'} }
);
}
}
}
return;
}
#=== FUNCTION ================================================================
# NAME: repairThreads
# PURPOSE: Repair broken threads where possible
# PARAMETERS: $mhash Hashref of all messages
# $silent Boolean determining whether to emit error
# messages
# RETURNS: Nothing
# DESCRIPTION: We look for messages with a subject beginning with 'Re:' which
# don't have any back links. These look like broken threads
# where the sender has not used their mail client's ability to
# generate an 'In-Reply-To:' and/or 'References' header. We try
# to find the thread root by looking for the subject in all of
# the messages and then make the linkages.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub repairThreads {
my ( $mhash, $silent ) = @_;
foreach my $key ( sort( keys( %{$mhash} ) ) ) {
if ( $mhash->{$key}->{subject} =~ /^Re:/ ) {
if (!( defined( $mhash->{$key}->{'in-reply-to'} )
|| defined( $mhash->{$key}->{'references'} )
)
)
{
#
# We have a message with a subject beginning with 'Re:' which
# doesn't have any back links. Looks like a broken thread.
# Find the thread root by looking for the subject.
#
my $parent = findParent( $mhash, $key );
#
# The parent has to be defined to be meaningful
#
if ( defined($parent) ) {
#
# Make this message's parent the message we found in two
# ways, via the 'parent' link and the 'in-reply-to'
# header.
#
$mhash->{$key}->{parent} = $parent;
$mhash->{$key}->{'in-reply-to'}
= $mhash->{$parent}->{'message-id'};
#
# Add this message to the parents children array, making
# sure it's sorted.
#
push( @{ $mhash->{$parent}->{children} }, $key );
$mhash->{$parent}->{children}
= [ sort( @{ $mhash->{$parent}->{children} } ) ];
#print "Parent:\n";
#dumpMessage( $mhash->{$parent} );
#print "Child:\n";
#dumpMessage( $mhash->{$key} );
}
else {
emit( $silent, "Couldn't find parent of message $key\n" );
}
}
}
}
return;
}
#=== FUNCTION ================================================================
# NAME: findParent
# PURPOSE: Given a particular element in the message hash find its parent
# by processing the subject
# PARAMETERS: $mhash Hashref of all messages
# $key Key into the message hash
# RETURNS: The key of the parent
# DESCRIPTION: Tries to join an "orphaned" message with its parent by looking
# for another instance of the subject. First the subject of this
# message must begin with "Re:". This string is removed in order
# to perform a search for the message this one is in reference
# to. The search is conducted through all of the stored messages
# and the matching message must not have a parent or any back
# references itself to be acceptable. This is a fairly vague set
# of criteria but it does work for a number of unlinked
# messages. [Note: The regex used in the search has been modified
# to be case insensitive having encountered an instance where
# someone reconstructed the subject line in different case.]
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub findParent {
my ( $mhash, $key ) = @_;
my $start_subj = $mhash->{$key}->{subject};
my $end_subj;
my $parent;
if ( $start_subj =~ /^Re:/ ) {
( $end_subj = $start_subj ) =~ s/^Re:\s*//;
foreach my $k ( keys( %{$mhash} ) ) {
#if ( $msg{$k}->{subject} eq $end_subj ) {
if ( $k ne $key
&& $msg{$k}->{subject} =~ /(?:^Re:\s*)?(?i)$end_subj/ )
{
if (!( defined( $msg{$k}->{'in-reply-to'} )
|| defined( $msg{$k}->{'references'} )
)
)
{
$parent = $k;
last;
}
}
}
return $parent;
}
else {
#
# Doesn't look like a message with a parent
#
return; # implicit undef
}
}
#=== FUNCTION ================================================================
# NAME: buildThreadSummary
# PURPOSE: Build a summary of the thread roots
# PARAMETERS: $dt_from The from date as a DateTime object
# $dt_to The to date as a DateTime object
# $config Hashref of the configuration data
# $mhash Hashref of all messages
# $threads Hashref to contain the resulting thread data
# $silent Boolean determining whether to emit error
# messages
# RETURNS: Total number of messages in all eligible threads
# DESCRIPTION: We have built a structure containing all of the message
# threads we've seen. However, we're only interested in the
# threads for a certain period, so we need to work them out
# here. A thread root is a message which has no parents, and
# we're interested in those which were written in our time
# period. We collect the details we want for the final display
# and build a weblink to the thread on Gmane. We also count the
# number of messages per thread and the grand total. The
# reulting structure is what we'll pass to the template we use
# to generate the insertion for the Community News notes (and
# other stuff).
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub buildThreadSummary {
my ( $dt_from, $dt_to, $config, $mhash, $threads, $silent ) = @_;
my ( $count, $total );
#
# Use the UTC seconds for comparison
#
my $from = $dt_from->utc_rd_as_seconds();
my $to = $dt_to->utc_rd_as_seconds();
#
# Adjust the regex from the config file for parsing the stored links.
# Remove the '<>' enclosing it.
#
my $re = $config->{cache}->{regex};
$re =~ s/^<|>$//g;
$re = qr{$re};
#
# Find each thread root which has messages in the date range
#
$total = 0;
foreach my $key ( sort( keys( %{$mhash} ) ) ) {
if ( !defined( $mhash->{$key}->{parent} )
&& eligibleThread( $from, $to, $mhash, $key ) )
# && ( $key ge $from && $key lt $to ) )
{
#
# Stash the headers we want
#
foreach my $k (qw{ from to cc date subject archived-at }) {
$threads{$key}->{$k} = $mhash->{$key}->{$k};
}
#
# Make a weblink to the Gmane thread
# TODO What if the regex doesn't match?
#
if ( $mhash->{$key}->{'archived-at'} =~ /$re/ ) {
$threads{$key}->{thread}
= sprintf( $config->{gmane}->{template2}, $1 );
}
else {
emit( $silent,
"Unable to match the Gmane thread (in message $key)\n" );
}
#
# Count the messages in the thread and keep a grand total
#
$count = 0;
threadLength( $from, $to, $mhash, $key, \$count );
$threads{$key}->{count} = $count;
$total += $count;
}
}
return $total;
}
#=== FUNCTION ================================================================
# NAME: eligibleThread
# PURPOSE: Determines whether a given thread is eligible to be included
# in a date range
# PARAMETERS: $from The from date as UTC seconds
# $to The to date as UTC seconds
# $mhash Hashref of all messages
# $key Key into the message hash for this particular
# thread
# RETURNS: True (1) if the thread is eligible, otherwise false (0)
# DESCRIPTION: Checks the top level message (thread root) for elegibility
# (date in range) then recursively descends the thread checking
# each message in the same manner as it goes. The results are
# ORed together so that if any message is in range the entire
# thread is.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub eligibleThread {
my ( $from, $to, $mhash, $key ) = @_;
my $res = ( $key ge $from && $key lt $to );
foreach my $k ( @{ $mhash->{$key}->{children} } ) {
$res ||= ( $k ge $from && $k lt $to )
|| eligibleThread( $from, $to, $mhash, $k );
}
return $res;
}
#=== FUNCTION ================================================================
# NAME: threadLength
# PURPOSE: Count the number of eligible messages in a thread
# PARAMETERS: $from The from date as UTC seconds
# $to The to date as UTC seconds
# $mhash Hashref of all messages
# $key Key into the message hash
# $len Scalar ref containing accumulated length
# RETURNS: Nothing
# DESCRIPTION: Recursively descends through a thread (a tree of linked
# messages) counting the elements but only if they are in range.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub threadLength {
my ( $from, $to, $mhash, $key, $len ) = @_;
$$len++ if ( $key ge $from && $key lt $to );
foreach my $k ( @{ $mhash->{$key}->{children} } ) {
threadLength( $from, $to, $mhash, $k, $len );
}
return;
}
#=== FUNCTION ================================================================
# NAME: printThread
# PURPOSE: To walk a message thread and print it
# PARAMETERS: $mhash Hashref of all messages
# $key Key into the message hash
# $level Integer denoting recursion level
# $html 1 for HTML output, 0 for text
# RETURNS: Nothing
# DESCRIPTION: Recursively descend through a thread (a tree of linked
# messages) printing each one. If $html is true the message is
# printed in HTML, otherwise it's plain text with indentation.
# The level of indentation is controlled by $level which
# increments as the tree is recursed.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub printThread {
my ( $mhash, $key, $level, $html ) = @_;
$level = 0 unless $level;
$html = 0 unless $html;
printMessage( $mhash->{$key}, $level, $html );
foreach my $k ( @{ $mhash->{$key}->{children} } ) {
printThread( $mhash, $k, $level + 1, $html );
}
return;
}
#=== FUNCTION ================================================================
# NAME: printMessage
# PURPOSE: To print a message for debug purposes
# PARAMETERS: $msg Hashref containing message attributes
# $level Integer for indenting
# $html 1 for HTML output, 0 for text
# RETURNS: Nothing
# DESCRIPTION: Print a message. The details held consist only of a set of
# headers, and these are printed as a label and a value. If
# $html is true the output is HTML (an unnumbered list) and only
# headers with values are reported. If $html is false then plain
# text is used, empty values are reported and indentation is
# used, controlled by the value of $level. In plain text mode
# each item ends with a line of hyphens.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub printMessage {
my ( $msg, $level, $html ) = @_;
$level = 0 unless $level;
$html = 0 unless $html;
# TODO Don't do it this way, use the global definition since it's used
# elsewhere
state %tags;
%tags = (
'from' => 'From',
'to' => 'To',
'cc' => 'CC',
'date' => 'Date',
'subject' => 'Subject',
'archived-at' => 'Link',
);
print "<ul>" if $html;
foreach my $k (qw{ from to cc date subject archived-at }) {
if ($html) {
if ( !empty( $msg->{$k} ) ) {
printf "<li>%-8s %s</li>\n", "$tags{$k}:",
coalesce( $msg->{$k}, '' );
}
}
else {
printf "%s%-8s %s\n", ' ' x $level, "$tags{$k}:",
coalesce( $msg->{$k}, '' );
}
}
if ($html) {
print "</ul>\n";
}
else {
print '-' x 80, "\n";
}
return;
}
#=== FUNCTION ================================================================
# NAME: dumpMessage
# PURPOSE: To print all parts of a message hash for debugging
# PARAMETERS: $msg Hashref containing message attributes
# RETURNS: Nothing
# DESCRIPTION: Produces a dump of a message for debugging purposes.
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub dumpMessage {
my ($msg) = @_;
foreach my $k ( sort( keys( %{$msg} ) ) ) {
printf "%12s: %s\n", $k, coalesce( $msg->{$k}, '' );
}
print '-' x 80, "\n";
return;
}
#=== FUNCTION ================================================================
# NAME: emit
# PURPOSE: Print text on STDERR unless silent mode has been selected
# PARAMETERS: - Boolean indicating whether to be silent or not
# - list of arguments to 'print'
# RETURNS: Nothing
# DESCRIPTION: This is a wrapper around 'print' to determine whether to send
# a message to STDERR depending on a boolean. We need this to be
# able to make the script silent when the -silent option is
# selected
# THROWS: No exceptions
# COMMENTS: None
# SEE ALSO: N/A
#===============================================================================
sub emit {
unless (shift) {
print STDERR @_;
}
return;
}
#=== FUNCTION ================================================================
# NAME: Options
# PURPOSE: Processes command-line options
# PARAMETERS: $optref Hash reference to hold the options
# RETURNS: Undef
# DESCRIPTION:
# THROWS: no exceptions
# COMMENTS: none
# SEE ALSO: n/a
#===============================================================================
sub Options {
my ($optref) = @_;
my @options = (
"help", "debug=i", "out=s", "template=s",
"silent!", "checknew!", "initialise=i", "from=s",
"to=s",
);
if ( !GetOptions( $optref, @options ) ) {
pod2usage( -msg => "$PROG version $VERSION\n", -exitval => 1 );
}
return;
}
__END__
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#{{{
=head1 NAME
summarise_mail - Generate a summary of messages on the HPR mailing list
=head1 VERSION
This documentation refers to summarise_mail version 0.0.5
=head1 USAGE
./summarise_mail [-help] [-debug=N] [-from=DATE] [-to=DATE] [-out=FILE]
[-template=FILE] [-[no]silent] [-[no]checknew] [-initialise=N]
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message describing the usage of the program, and then exits.
=item B<-initialise=N>
This option builds a new mailbox cache. The value B<N> defines the starting
message to be downloaded from the Gmane site. The number defaults to 1, though
this is not advisable since it potentially overloads the Gmane server. The
number of messages downloaded is limited by the B<lookahead> value in the
configuration file (see below).
If a cache file already exists the script will not run. It is necessary to
delete this file before initialising a new one.
After the cache is initialised the script will process the messages it
contains.
The B<-initialise=N> and the B<-[no]checknew> options are mutually exclusive.
=item B<-[no]checknew>
This option defines whether the script will check Gmane for updates to the
mailing list. If updates are found then they will be incorporated into the
cached mailbox file.
If omitted no check is made (B<-nochecknew>).
The B<-initialise=N> and the B<-[no]checknew> options are mutually exclusive.
=item B<-from=DATE>
Specifies the starting date for the mail summary.
The date format should be B<DD-Mon-YYYY> (e.g. 12-Jun-2014), B<DD-MM-YYYY>
(e.g. 12-06-2014) or B<YYYY-MM-DD> (e.g. 2014-06-12).
If this option is omitted the current date is used.
=item B<-to=DATE>
Specifies the ending date for the mail summary.
The date format should be B<DD-Mon-YYYY> (e.g. 12-Jun-2014), B<DD-MM-YYYY>
(e.g. 12-06-2014) or B<YYYY-MM-DD> (e.g. 2014-06-12).
If this option is omitted the last day of the month defined by the B<-from>
date is used.
=item B<-template=FILE>
This option defines the template that will be used to format the report
generated by the script.
If the option is not provided, the script will use a template called
B<mailnote_template.tpl> in the directory which holds the script. This
template generates HTML suitable for incorporation into the Community News
show notes.
=item B<-out=FILE>
This option defines the output file for the report. If the option is omitted
the report is written to STDOUT, allowing it to be redirected if required.
The output file name may contain the characters 'B<%s>'. This denotes the point
at which the year and month in the format B<YYYY-MM> are inserted. For example
if the script is being run for July 2014 the option:
-out=mailreport_%s.html
will cause the generation of the file:
mailreport_2014-07.html
=item B<-[no]silent>
This option controls whether the script reports minimal details of what it is
doing to STDERR. If the option is omitted the report is generated (B<-nosilent>).
The script reports: the starting and ending dates it is working on and the name of the output file
(if appropriate).
=item B<-debug=N>
Runs the script in debug mode if the value of B<N> is greater than 0. The
default setting is zero.
In debug mode the script reports various pieces of information about the
parsing of the mail messages and building of threads.
=back
=head1 DESCRIPTION
=head2 Overview
This script generates a summary of the mail sent to the HPR mailing list over
a period.
Messages to the mailing list are copied to the free B<Gmane> service and the
script uses this to prepare its report. The Gmane service offers
a downloadable interface at
http://download.gmane.org/gmane.network.syndication.podcast.hacker-public-radio
where selected messages can be collected in B<MBOX> format.
=head2 Methodology
The script operates in three phases:
=over 4
=item -
Messages are collected from Gmane and stored in a local cache (a file in MBOX
format). This is expected to be done once, and thereafter new messages are
appended to this file.
=item -
New messages are appended to the cache file. This is done by determining the
last message in the cache then requesting the next block of messages from
Gmane. The downloaded messages are then added to the file.
=item -
The cache is processed to generate the report. Usually the script is given
a start date (and optional end date, though this defaults to the end of the
month in the start date).
The algorithm is as follows:
=over 4
=item -
The script parses out the important headers of all messages in the cache.
=item -
The script walks the header structure in time order joining messages based on
their 'In-reference-to:' and 'References:' headers. Each message gets its
parent recorded as a pointer, and its children as an array of pointers. If
a reference is to a message that is not in the cache then it is not recorded.
=item -
At this point all messages without a parent are either the thread roots or
cases where someone has not replied properly. The script performs a further
join when an "orphaned" message has the same subject as another with a 'Re:'
on the front.
=item -
Now all messages without parents are the best that can be done to detect
thread roots. These are used to make the report.
=item -
The messages that cannot be threaded are those where the sender does not use
'Reply' or where they reply to a digest. Digests are not threaded because they
are blobs of multiple messages.
=item -
Some messages may be threaded erroneously. This happens when a sender, rather
than generating a new message, replies to an existing message but changes the
'Subject:' line. There is not much that can be done to correct this.
=back
=back
=head2 Report Template
The script uses the B<Template> module to format the report. See the default
template B<mailnote_template.tpl> for an example.
The template is passed two variables:
=over 4
=item B<total>
This is a scalar variable containing a number: the count of messages in the
specified range.
=item B<threads>
This is a hash indexed by the message date converted to UTC date then to the
number of seconds since the Unix epoch. Each hash value is a hashref
containing the following fields:
Field Contents
----- --------
from The contents of the 'From:' header
to The contents of the 'To:' header
cc The contents of the 'Cc:' header
date The contents of the 'Date:' header
subject The contents of the 'Subject:' header
archived-at The contents of the 'Archived-at:' header (the Gmane
permalink URL)
count The number of messages in the thread
These messages are the roots of all of the detectable threads in the period.
Sorting them by the main hash key results in the threads being ordered by
their timestamp.
=back
=head1 DIAGNOSTICS
A list of every error and warning message that the application can generate
(even the ones that will "never happen"), with a full explanation of each
problem, one or more likely causes, and any suggested remedies. If the
application generates exit status codes (e.g. under Unix) then list the exit
status associated with each error.
=head1 CONFIGURATION AND ENVIRONMENT
The script obtains the information it requires to find the mail details from
B<Gmane> from a configuration file. The name of the file it expects is
B<.summarise_mail.cfg> in the
directory holding the script. To change this will require changing the script.
The configuration file format is as follows:
<gmane>
url = http://download.gmane.org/gmane.network.syndication.podcast.hacker-public-radio
template = "$url/%d/%d"
lookahead = 100
</gmane>
<cache>
directory = /home/dave/Community_News/mail_cache
filename = gmane.mbox
regex = "<http://permalink\.gmane\.org/gmane\.network\.syndication\.podcast\.hacker-public-radio/(\d+)>"
</cache>
The elements of this configuration file are explained below. First the
B<gmane> section:
=over 4
=item B<url>
This is the base URL on the Gmane site. It is used to construct URLs for
collecting mail messages. This should not require any changes.
=item B<template>
This is a template which is used to generate the actual URL used to download
messages. The script replaces each '%d' with a number. The first is the
starting message number, and the second the ending number plus one.
This should not require any changes.
=item B<lookahead>
This defines the maximum number of messages to be collected. This number is
added to the first number used when contructing the URL from the B<template>
above. Normally just a few messages are downloaded at a time, but when the
script is used to initialise the mailbox cache file this number of messages may
be downloaded.
This value can be changed if desired, but it is set at 100 in order to avoid
overloading the Gmane servers.
=back
The B<cache> section contains the following:
=over 4
=item B<directory>
This defines the full path to the directory holding the mail cache file.
This may be changed to reflect a change of location.
=item B<filename>
This defines the name of the mail cache. It is expected to be found in the
directory defined above.
This may be changed to reflect a change of name.
=item B<regex>
Individual messages in the cache each hold an URL which provides a permanent
link to the message on the Gmane site. The script uses the regular expression
defined here to collect the message number from this URL. This is how the
script determines the number of the oldest message in the cache when looking
for new messages.
This expression should only be changed with great caution.
=back
=head1 DEPENDENCIES
Getopt::Long
Pod::Usage
Config::General
LWP::UserAgent
Mail::Box::Manager
Date::Parse
Date::Calc
DateTime
DateTime::TimeZone
Template
Template::Filters
Data::Dumper
=head1 BUGS AND LIMITATIONS
There are no known bugs in this module.
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
Patches are welcome.
=head1 AUTHOR
Dave Morriss (Dave.Morriss@gmail.com)
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2014, 2015 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See perldoc perlartistic.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
#}}}
# [zo to open fold, zc to close]
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker