b7cae1cb90
Show_Submission/copy_shownotes: Changed the location of the function library Show_Submission/do_brave: Updates to the way local stand-alone HTML is generated for review purposes. Show_Submission/do_index: Changed the location of the function library Show_Submission/do_pandoc: Changed the location of the function library; now uses 'author_title.pl' to generate YAML for Pandoc Show_Submission/do_parse: Trivial change Show_Submission/do_pictures: Changed the location of the function library; better handling of the show specification Show_Submission/do_report: Changed the location of the function library Show_Submission/do_update_reservations: Changed the location of the function library Show_Submission/fix_relative_links: Added features 'say' and 'state' Show_Submission/parse_JSON: New checks: notes too short, trailing spaces on title, summary and tags (needing JSON changes). Check for Markdown in the assets (see 'do_pandoc_assets'). New 'trim' function.
2426 lines
77 KiB
Perl
Executable File
2426 lines
77 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
#===============================================================================
|
|
#
|
|
# FILE: parse_JSON
|
|
#
|
|
# USAGE: ./parse_JSON [-help] [-manpage] -episode=N -infile=FILE
|
|
# [-shownotes[=FILE]] [-format=FILE] [-release=FILE]
|
|
# [-assets=FILE] [-debug=N] [-[no]silent] [-[no]test]
|
|
#
|
|
# DESCRIPTION: Script to parse the JSON now being generated for HPR
|
|
# shows. Replaces the original 'parse_shownotes' and is a fair
|
|
# bit simpler because the parsing of the original PHP dump isn't
|
|
# needed. However, it has grown in complexity as features for
|
|
# handling assets have been added. Downstream scripts now need
|
|
# information about pictures and assets, and all archive files
|
|
# need to be unpacked and placed in a directory for uploading to
|
|
# the HPR server.
|
|
#
|
|
# OPTIONS: ---
|
|
# REQUIREMENTS: ---
|
|
# BUGS: ---
|
|
# NOTES: ---
|
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
|
# VERSION: 0.0.17
|
|
# CREATED: 2020-11-28 10:52:02
|
|
# REVISION: 2024-10-04 18:37:29
|
|
#
|
|
#===============================================================================
|
|
|
|
use v5.36;
|
|
use utf8;
|
|
use feature qw{ postderef say signatures state try };
|
|
no warnings
|
|
qw{ experimental::postderef experimental::signatures experimental::try };
|
|
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
|
|
use Term::ANSIColor;
|
|
|
|
use HTML::Parser ();
|
|
use HTML::LinkExtor;
|
|
use Text::CSV;
|
|
use Text::SpellChecker;
|
|
|
|
use JSON;
|
|
|
|
use List::MoreUtils qw{any};
|
|
use Archive::Any;
|
|
use MIME::Types;
|
|
use IO::Compress::Zip qw(:all);
|
|
use File::Copy;
|
|
use File::Spec::Functions;
|
|
use Path::Class;
|
|
|
|
use Log::Handler;
|
|
|
|
use Data::Dumper;
|
|
|
|
#
|
|
# Version number (manually incremented)
|
|
#
|
|
our $VERSION = '0.0.17';
|
|
|
|
#
|
|
# Script and directory names
|
|
#
|
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
|
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
|
$DIR = '.' unless $DIR;
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Declarations
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# Constants and other declarations
|
|
#
|
|
my $basedir = "$ENV{HOME}/HPR/Show_Submission";
|
|
my $logdir = "$basedir/logs";
|
|
my $logfile = "$logdir/${PROG}.log";
|
|
|
|
my ( $snlevel, $showno, $summarylength, $notelength );
|
|
|
|
our $MARKUP_DETECTED = 0;
|
|
|
|
#
|
|
# Maximum and minimum number of characters (bytes)
|
|
#
|
|
my $MAXNOTELEN = 4000;
|
|
my $MINNOTELEN = 10;
|
|
|
|
#
|
|
# Printing: general output format
|
|
#
|
|
my $ofmt = "%-17s %s\n";
|
|
|
|
#
|
|
# URL base for relative URLs pointing at assets
|
|
#
|
|
my $base_url = 'https://hackerpublicradio.org/';
|
|
|
|
#
|
|
# Text colours
|
|
#
|
|
my $red = color('red');
|
|
my $green = color('green');
|
|
my $yellow = color('yellow');
|
|
my $magenta = color('magenta');
|
|
my $bold = color('bold');
|
|
my $reset = color('reset');
|
|
|
|
my %snatts = (
|
|
html_percent => 0,
|
|
html_tags => 0,
|
|
is_markup => 0,
|
|
);
|
|
|
|
#
|
|
# A list of signature expressions used to try and guess what markup format was
|
|
# used.
|
|
#
|
|
# {{{
|
|
#
|
|
# Note that the regexes are applied to a string which is a concatenation of
|
|
# the notes, so care needs to be taken with anchors. After processing the JSON
|
|
# the lines of the notes will be separated by newline characters. Use an 'm'
|
|
# qualifier to make a '^' anchor treat it as multi-line.
|
|
#
|
|
my @markup_types = (
|
|
# Has [...](...) and ![...](...) entities
|
|
{ regex => qr{!?\[[^\]]+\]\(https?:[^)]+\)},
|
|
type => ['markdown'],
|
|
comment => "Markdown link or image"
|
|
},
|
|
|
|
# Has [...][nnn] entities
|
|
{ regex => qr{\[[^\]]+\]\[[^\]]+\]},
|
|
type => ['markdown'],
|
|
comment => "Markdown reference"
|
|
},
|
|
|
|
# Has lines beginning with 1-6 hashes (Atx-style header)
|
|
{ regex => qr{^#{1,6} \w}m,
|
|
type => ['markdown'],
|
|
comment => "Markdown Atx-style header"
|
|
},
|
|
|
|
# A line of stuff followed by a line of = or - (Setex-style header)
|
|
{ regex => qr{^(\s|\w)+.^[=-]+$}sm,
|
|
type => ['markdown'],
|
|
comment => "Markdown Setex-style header"
|
|
},
|
|
|
|
# Has lines starting with 0+ spaces hyphen 1+ spaces
|
|
{ regex => qr{^\s*[-*]\s+\w}m,
|
|
type => [ 'markdown', 'reStructuredText', 'org-mode' ],
|
|
comment => "Markdown or reStructuredText list element"
|
|
},
|
|
|
|
# Has lines beginning with '\d.\s' (number(s) dot space(s))
|
|
{ regex => qr{^\s*\d+\.\s+\w}m,
|
|
type => [ 'markdown', 'reStructuredText' ],
|
|
comment =>
|
|
"Markdown, reStructuredText or org-mode ordered list element"
|
|
},
|
|
|
|
# Has lines which begin with 3+ ` or ~
|
|
{ regex => qr{^[`~]{3,}$}m,
|
|
type => ['markdown'],
|
|
comment => "Markdown fenced block delimiter"
|
|
},
|
|
|
|
# Has words enclosed in 1 or 2 *
|
|
{ regex => qr{\*{1,2}\w+\*{1,2}},
|
|
type => [ 'markdown', 'reStructuredText' ],
|
|
comment => "Markdown or reStructuredText emphasis or bold"
|
|
},
|
|
|
|
# Has words enclosed in `
|
|
{ regex => qr{`\w+`},
|
|
type => ['markdown'],
|
|
comment => "Markdown verbatim text"
|
|
},
|
|
|
|
# Has a line of 3 or more *, - or _ possibly with intervening spaces
|
|
{ regex => qr{^[*_-]\s?[*_-]\s?[*_-][*_ -]*$}m,
|
|
type => ['markdown'],
|
|
comment => "Markdown horizontal rule"
|
|
},
|
|
|
|
# Has bare http/https URLs
|
|
{ regex => qr{[^(]\bhttps?:},
|
|
type => ['txt2tags'],
|
|
comment => "Bare http(s) url"
|
|
},
|
|
|
|
# Has [...]_ entities (footnote, citation reference)
|
|
{ regex => qr{\[(\w+|\d+|#|\*)\]_},
|
|
type => ['reStructuredText'],
|
|
comment => "reStructuredText footnote, citation or reference"
|
|
},
|
|
|
|
# Has `Text <URL>`_ entities (hyperlinks)
|
|
{ regex => qr{`\w+<https?:[^>]+>`_},
|
|
type => ['reStructuredText'],
|
|
comment => "reStructuredText hyperlink"
|
|
},
|
|
|
|
# Has superscript or subscript markup
|
|
{ regex => qr{(\^\w+\^|\~\w+\~)},
|
|
type => ['markdown'],
|
|
comment => "Markdown superscript or subscript"
|
|
},
|
|
|
|
# STARS KEYWORD PRIORITY TITLE TAGS is a header definition in Org mode
|
|
{ regex =>
|
|
qr{^\*+\s(TODO|FEEDBACK|VERIFY|\||DONE|DELEGATED)?\s?(\[[#]\w\])?\s?\w+}m,
|
|
type => ['org-mode'],
|
|
comment => "Org mode header"
|
|
},
|
|
|
|
# { regex => qr{},
|
|
# type => ['markdown'],
|
|
# comment => ""
|
|
# },
|
|
|
|
);
|
|
|
|
# }}}
|
|
|
|
my @markup_found;
|
|
my $markup_choice;
|
|
|
|
my ( $showdir, $assetdir, $statusfile );
|
|
my ( %media_files, $media_files );
|
|
my ( %assets, @assets, @audio, $sanitised );
|
|
my ( $has_audio, $has_extra, $has_archives );
|
|
my ( @archives, @extracted );
|
|
my ( $astate, $has_pictures, @pictures, $pictures );
|
|
my ( $has_markup, @markup, $markup );
|
|
my ( $json_change, @json_changes, %spellchecks );
|
|
my @pstates = (
|
|
'No pictures found', # 0
|
|
'Pictures that need management', # 1
|
|
'Managed pictures', # 2
|
|
);
|
|
|
|
#
|
|
# Enable Unicode mode
|
|
#
|
|
binmode STDOUT, ":encoding(UTF-8)";
|
|
binmode STDERR, ":encoding(UTF-8)";
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Options and arguments
|
|
#-------------------------------------------------------------------------------
|
|
my $DEF_DEBUG = 0;
|
|
|
|
#
|
|
# Process options
|
|
#
|
|
my %options;
|
|
Options( \%options );
|
|
|
|
#
|
|
# Default help
|
|
#
|
|
pod2usage(
|
|
-msg => "$PROG version $VERSION\n",
|
|
-verbose => 0,
|
|
-exitval => 1
|
|
) if ( $options{'help'} );
|
|
|
|
#
|
|
# Detailed help
|
|
#
|
|
pod2usage(
|
|
-msg => "$PROG version $VERSION\n",
|
|
-verbose => 2,
|
|
-exitval => 1,
|
|
-noperldoc => 0
|
|
) if ( $options{'documentation'} );
|
|
|
|
#
|
|
# Collect options
|
|
#
|
|
my $DEBUG = ( defined( $options{debug} ) ? $options{debug} : $DEF_DEBUG );
|
|
my $silent = ( defined( $options{silent} ) ? $options{silent} : 0 );
|
|
my $test = ( defined( $options{test} ) ? $options{test} : 0 );
|
|
|
|
$showno = $options{episode};
|
|
pod2usage(
|
|
-msg => "$PROG version $VERSION\nMissing mandatory option -episode\n",
|
|
-verbose => 0,
|
|
-exitval => 1
|
|
) unless $showno;
|
|
|
|
my $infile = $options{infile};
|
|
pod2usage(
|
|
-msg => "$PROG version $VERSION\nMissing mandatory option -infile\n",
|
|
-verbose => 0,
|
|
-exitval => 1
|
|
) unless $infile;
|
|
die "Unable to access input file $infile: $!\n" unless -r $infile;
|
|
die "Input file $infile is empty\n" if -z $infile;
|
|
|
|
#
|
|
# Where to put various files. The second group of options must all have
|
|
# filenames to be created, but the first can define its own name if not given.
|
|
#
|
|
my $shownotes = $options{shownotes};
|
|
|
|
my $formatfile = $options{format};
|
|
my $releasefile = $options{release};
|
|
my $assetfile = $options{assets};
|
|
my $picturefile = $options{pictures};
|
|
my $zipfile = $options{zip};
|
|
|
|
#
|
|
# Work on the show notes output file, allowing defaults and substitution points
|
|
# for convenience.
|
|
#
|
|
if ( defined($shownotes) ) {
|
|
$shownotes = output_file_name( $shownotes, $showno, 'hpr%d.out' );
|
|
}
|
|
|
|
#
|
|
# Directories and file specific to this show
|
|
#
|
|
$showdir = "$basedir/shownotes/hpr$showno";
|
|
$assetdir = "$showdir/uploads";
|
|
$statusfile = "$showdir/.status";
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Set up logging keeping the default log layout except for the date. The format
|
|
# is "%T [%L] %m" where '%T' is the timestamp, '%L' is the log level and '%m is
|
|
# the message.
|
|
#-------------------------------------------------------------------------------
|
|
my $log = Log::Handler->new();
|
|
|
|
$log->add(
|
|
file => {
|
|
timeformat => "%Y/%m/%d %H:%M:%S",
|
|
message_layout => ($test ? 'T ' : '') . "%T [%L] %m",
|
|
filename => $logfile,
|
|
minlevel => 0, # emergency, emerg
|
|
maxlevel => 7, # debug
|
|
}
|
|
);
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Read the input file and parse it into $content (hashref)
|
|
#-------------------------------------------------------------------------------
|
|
my $json = JSON->new->utf8;
|
|
open( my $fh, '<:encoding(UTF-8)', $infile );
|
|
my $json_text = <$fh>;
|
|
close($fh);
|
|
|
|
my $content;
|
|
try {
|
|
$content = decode_json($json_text);
|
|
}
|
|
catch ($e) {
|
|
die colored( "Failed to decode the JSON in $infile", 'red' ) . "\n"
|
|
}
|
|
|
|
$log->info( $showno, "[$VERSION] Processing $infile" );
|
|
|
|
_debug( $DEBUG > 2, Dumper($content) );
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Validate the JSON structure in case it's changed for some reason
|
|
#-------------------------------------------------------------------------------
|
|
die "The JSON doesn't have the right structure\n"
|
|
unless ( exists( $content->{episode} )
|
|
&& exists( $content->{metadata} )
|
|
&& exists( $content->{host} ) );
|
|
|
|
print STDERR '-' x 80, "\n";
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Report details from the parsed data. The percentages of HTML in the Host
|
|
# Profile and the notes themselves are computed using an HTML parser that counts
|
|
# the tags.
|
|
#-------------------------------------------------------------------------------
|
|
printf STDERR $ofmt, "Show:", $content->{metadata}{Episode_Number};
|
|
printf STDERR $ofmt, "Date:", $content->{metadata}{Episode_Date};
|
|
|
|
#
|
|
# Trim off leading and trailing spaces in these fields
|
|
#
|
|
$json_change = 0;
|
|
for my $key ( 'Title', 'Summary', 'Tags' ) {
|
|
my $str = trim($content->{episode}{$key});
|
|
if ($str ne $content->{episode}{$key}) {
|
|
$content->{episode}{$key} = $str;
|
|
$json_change = 1;
|
|
push(@json_changes,$key);
|
|
}
|
|
}
|
|
|
|
#
|
|
# Detect Unicode in the Title, Summary or Tags and flag their presence.
|
|
#
|
|
for my $key ( 'Title', 'Summary', 'Tags' ) {
|
|
my $uflag = ( $content->{episode}{$key} =~ /[^\x{00}-\x{7F}]/ );
|
|
printf STDERR $ofmt, "$key:",
|
|
colour_if(
|
|
$uflag,
|
|
# 'bold yellow on_magenta',
|
|
'black on_bright_yellow',
|
|
$content->{episode}{$key}
|
|
);
|
|
}
|
|
|
|
alert( $ofmt, "JSON to be updated; changes to: " .
|
|
join(',',@json_changes) ) if $json_change;
|
|
|
|
#
|
|
# Check summary length. The field might be filled and something might have
|
|
# been lost.
|
|
# TODO: Check on other lengths?
|
|
#
|
|
$summarylength = length($content->{episode}{Summary});
|
|
if ($summarylength eq 100) {
|
|
printf STDERR $ofmt, "- Summary: check length";
|
|
}
|
|
|
|
#
|
|
# Perform a spelling check on these fields. Collect any that are detected
|
|
# then, if any were found, report them.
|
|
#
|
|
for my $key ( 'Title', 'Summary', 'Tags' ) {
|
|
my @errors = spellcheck( $content->{episode}{$key} );
|
|
$spellchecks{$key} = \@errors if @errors;
|
|
}
|
|
_debug( $DEBUG > 2, '%spellchecks: ' . Dumper( \%spellchecks ) );
|
|
if (%spellchecks) {
|
|
print STDERR colored( "Spelling checks:\n", 'bold yellow' );
|
|
for my $key ( 'Title', 'Summary', 'Tags' ) {
|
|
printf STDERR $ofmt, "- $key:", join( ',', @{ $spellchecks{$key} } )
|
|
if ( exists( $spellchecks{$key} ) );
|
|
}
|
|
}
|
|
|
|
#
|
|
# Report on HTML in profile and notes
|
|
#
|
|
printf STDERR $ofmt, "Profile:",
|
|
count_tags( $content->{host}{Host_Profile} )
|
|
. " HTML start/end tags found";
|
|
$snlevel = count_tags( $content->{episode}{Show_Notes} );
|
|
printf STDERR $ofmt, "Notes:", $snlevel . " HTML start/end tags found";
|
|
|
|
#
|
|
# Check the note length because if they are too long we might want to split
|
|
# them
|
|
#
|
|
$notelength = length( $content->{episode}{Show_Notes} );
|
|
if ( $notelength > $MAXNOTELEN ) {
|
|
printf STDERR $ofmt, "Notes:",
|
|
colored( "Notes are longer than $MAXNOTELEN bytes ($notelength)",
|
|
'bold yellow on_magenta' );
|
|
}
|
|
elsif ( $notelength <= $MINNOTELEN ) {
|
|
printf STDERR $ofmt, "Notes:",
|
|
colored( "Notes are shorter than $MINNOTELEN bytes ($notelength)",
|
|
'bold yellow on_magenta' );
|
|
}
|
|
|
|
#
|
|
# Look for Unicode in the notes and flag it if found
|
|
#
|
|
if ( find_Unicode( $content->{episode}{Show_Notes} ) ) {
|
|
printf STDERR $ofmt, "Notes:",
|
|
colored( "Unicode characters found", 'bold yellow on_magenta' );
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Always check that the given show number matches what we parsed
|
|
#-------------------------------------------------------------------------------
|
|
die "The episode specified ($showno) doesn't match the one in the JSON "
|
|
. "($content->{metadata}{Episode_Number})\n"
|
|
unless ( $content->{metadata}{Episode_Number} == $showno );
|
|
|
|
#
|
|
# A zero host id means this is a new host. HOWEVER, it can also mean an
|
|
# existing host gave a mail address we don't have in the database. to take
|
|
# care!
|
|
#
|
|
if ( $content->{host}{Host_ID} eq 0 ) {
|
|
print STDERR colored(
|
|
sprintf( $ofmt, "New host:", $content->{host}{Host_Name} ),
|
|
'bold yellow' );
|
|
}
|
|
else {
|
|
printf STDERR $ofmt, "Host name:", $content->{host}{Host_Name};
|
|
}
|
|
|
|
#
|
|
# Save HTML tag count
|
|
#
|
|
$snatts{html_tags} = $snlevel;
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Have a go at the HTML tag percentage
|
|
#-------------------------------------------------------------------------------
|
|
$snatts{html_percent}
|
|
= html_level( $content->{episode}{Show_Notes}, $snlevel );
|
|
|
|
if ( $snlevel > 0 ) {
|
|
$log->info( $showno, "Notes contain HTML" );
|
|
push( @markup_found, 'HTML' );
|
|
if ( $snatts{html_percent} > 5 ) {
|
|
$log->info( $showno, "Notes probably are HTML" );
|
|
push( @markup_found, 'HTML' );
|
|
}
|
|
else {
|
|
push(
|
|
@markup_found,
|
|
@{ detect_markup(
|
|
$content->{episode}{Show_Notes}, \@markup_types,
|
|
$ofmt
|
|
)
|
|
}
|
|
);
|
|
}
|
|
}
|
|
else {
|
|
$log->info( $showno, "Notes contain no HTML" );
|
|
push(
|
|
@markup_found,
|
|
@{ detect_markup( $content->{episode}{Show_Notes},
|
|
\@markup_types, $ofmt )
|
|
}
|
|
);
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Report what markup was used (not really necessary any more but left in
|
|
# because it's interesting).
|
|
#-------------------------------------------------------------------------------
|
|
# 2022-02-18 Now we have added 'org-mode' which doesn't match any formats sent
|
|
# from the web form, so we highlight this to help with the editing process.
|
|
# It's possible to use Pandoc to generate Markdown for editing or just convert
|
|
# straight to HTML if the result looks good.
|
|
#
|
|
$markup_choice = choose_markup( \@markup_found );
|
|
|
|
printf STDERR $ofmt, "Declared format:",
|
|
colour_if(
|
|
$content->{metadata}{Shownotes_Format} ne 'plain_text',
|
|
'bold yellow on_magenta',
|
|
$content->{metadata}{Shownotes_Format}
|
|
);
|
|
printf STDERR $ofmt, "Found:", summarise_markup( \@markup_found );
|
|
#printf STDERR $ofmt, "Recommendation:",
|
|
# colour_if(
|
|
# $markup_choice eq 'org-mode',
|
|
# 'bold yellow on_magenta',
|
|
# $markup_choice
|
|
# );
|
|
|
|
printf STDERR $ofmt, "Recommendation:",
|
|
colour_switch(
|
|
[
|
|
$markup_choice eq 'org-mode',
|
|
$markup_choice eq 'markdown',
|
|
$markup_choice eq 'txt2tags',
|
|
$markup_choice eq 'none',
|
|
$markup_choice =~ / or /,
|
|
],
|
|
[
|
|
'bold yellow on_magenta',
|
|
'yellow',
|
|
'yellow',
|
|
'red',
|
|
'yellow',
|
|
],
|
|
$markup_choice
|
|
);
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# What about the media?
|
|
#-------------------------------------------------------------------------------
|
|
# Examine the delivered media details. Start by finding the maximum index
|
|
# (media_files is a JSON hash containing keys like 'name' where the values
|
|
# are arrays). The elements are linked by their positions.
|
|
#
|
|
my $maxind = $#{ $content->{metadata}{FILES}->{media_files}->{name} };
|
|
|
|
#
|
|
# Step through the media file names and their sizes building a Perl hash keyed by the
|
|
# name containing the file size of each
|
|
#
|
|
for ( my $i = 0; $i <= $maxind; $i++ ) {
|
|
$media_files{ $content->{metadata}{FILES}->{media_files}->{name}->[$i] }
|
|
= $content->{metadata}{FILES}->{media_files}->{size}->[$i];
|
|
}
|
|
|
|
#
|
|
# If a media file is zero size, remove it from the list
|
|
#
|
|
for my $key ( keys(%media_files) ) {
|
|
if ( $media_files{$key} eq 0 ) {
|
|
delete( $media_files{$key} );
|
|
$maxind--;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Generate a string containing the names of the (non-empty) media files
|
|
#
|
|
if (%media_files) {
|
|
$media_files = join( ", ", keys(%media_files) );
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Determine the 'asset' state of the show
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# (The information is in %media_files although strictly there could be other
|
|
# files like scripts, and pictures.) The audio is in %media_files unless the
|
|
# host has sent in an URL for downloading.
|
|
#
|
|
if (%media_files) {
|
|
#
|
|
# Build a hash (%assets) keyed by filename with the value being a hashref
|
|
# with keys 'type' and 'size'.
|
|
#
|
|
for ( my $i = 0; $i <= $maxind; $i++ ) {
|
|
my $name = $content->{metadata}{FILES}->{media_files}->{name}->[$i];
|
|
$assets{$name} = {
|
|
type => $content->{metadata}{FILES}->{media_files}->{type}->[$i],
|
|
size => $content->{metadata}{FILES}->{media_files}->{size}->[$i],
|
|
};
|
|
}
|
|
|
|
#
|
|
# Process the assets
|
|
#
|
|
if (%assets) {
|
|
#
|
|
# Save just the file names in an array (@assets) for convenience.
|
|
#
|
|
@assets = keys(%assets);
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Check that there's audio in the media.
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# First look for any assets marked as audio or video (this one because
|
|
# some audio has been seen to be marked this way). We haven't expanded
|
|
# archive files yet because we don't want to eliminate those files
|
|
# from @assets.
|
|
#
|
|
$has_audio = @audio
|
|
= grep { $assets{$_}->{type} =~ /^(audio|video)/i } keys(%assets);
|
|
|
|
#
|
|
# Now, if no audio has yet been found, look for audio file extensions
|
|
#
|
|
unless ($has_audio) {
|
|
$has_audio = @audio
|
|
= grep { $_ =~ /\.(aac|flac|mp3|ogg|spx|wav)$/i }
|
|
keys(%assets);
|
|
}
|
|
|
|
#
|
|
# Check there is not more than one audio file
|
|
#
|
|
if ($#audio > 1) {
|
|
print colored(
|
|
"**Warning** More than one audio file uploaded", 'red'), "\n";
|
|
$log->warning( $showno, "More than one audio file uploaded" );
|
|
}
|
|
|
|
#
|
|
# We have audio. Use the @audio array to remove the matches from
|
|
# @assets.
|
|
#
|
|
if ($has_audio) {
|
|
@assets = array_difference( \@assets, \@audio );
|
|
}
|
|
if (@assets) {
|
|
_debug( $DEBUG > 2, '@assets: ' . Dumper( \@assets ) );
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Look for un-processed markup in the assets hash. For the moment we
|
|
# only look for Markdown.
|
|
#-------------------------------------------------------------------------------
|
|
$has_markup = @markup
|
|
= grep { $assets{$_}->{type} =~ m{^text/markdown$} } keys(%assets);
|
|
$markup = join( ', ', @markup );
|
|
if (@markup) {
|
|
_debug( $DEBUG > 2, '@markup ' . Dumper( \@markup ) );
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Look for archive files in the assets
|
|
#-------------------------------------------------------------------------------
|
|
if ( find_archive( \%assets, \@archives ) > 0 ) {
|
|
_debug( $DEBUG > 1, '@archives: ' . Dumper( \@archives ) );
|
|
|
|
#
|
|
# Extract the contents of any archives into the show directory and
|
|
# write details into %assets and @extracted.
|
|
#
|
|
$has_archives
|
|
= ( extract_archives( \%assets, \@archives, \@extracted, $showdir ) > 0 );
|
|
}
|
|
else {
|
|
#
|
|
# No archives, but we may have assets. Keep copies of the assets
|
|
# in case we need to make destructive changes.
|
|
#
|
|
if (@assets && $zipfile) {
|
|
my @input = map {"$showdir/$_"} @assets;
|
|
_debug( $DEBUG > 2, 'Zip creation - @input: ' . Dumper(
|
|
\@input ) );
|
|
|
|
zip \@input => $zipfile
|
|
or die "Zip creation failed: $ZipError\n";
|
|
}
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Refresh assets after expanding archives
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# Remove archive files and add extracted files to the assets. We may
|
|
# have a different number than we started with because any archives found
|
|
# will have been expanded and the archive names removed.
|
|
#
|
|
@assets = array_difference(\@assets,\@archives);
|
|
push(@assets, @extracted);
|
|
|
|
#
|
|
# Remove markup files from the assets so we don't upload them
|
|
#
|
|
@assets = array_difference(\@assets,\@markup);
|
|
|
|
#
|
|
# Remove directory stuff from @assets elements and %assets keys
|
|
#
|
|
# foreach my $file (@assets) {
|
|
# ( my $key = $file ) =~ s{^.*/}{};
|
|
# $assets{$key} = delete( $assets{$file} );
|
|
# $file = $key;
|
|
# }
|
|
prepare_assets( \%assets, \@assets, $showdir, $assetdir );
|
|
if (@assets) {
|
|
_debug( $DEBUG > 1, 'After prepare_assets, %assets: ' . Dumper( \%assets ) );
|
|
_debug( $DEBUG > 1, 'After prepare_assets, @assets: ' . Dumper( \@assets ) );
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Sanitise any filenames that need it
|
|
#-------------------------------------------------------------------------------
|
|
$sanitised = sanitise_filenames( \%assets, \@assets, $assetdir );
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Look for any image files in the assets hash
|
|
#-------------------------------------------------------------------------------
|
|
$has_pictures = @pictures
|
|
= grep { $assets{$_}->{type} =~ /^image/ } keys(%assets);
|
|
$pictures = join( ', ', @pictures );
|
|
|
|
# if (@archives =
|
|
# grep { $assets{$_}->{type} =~ q{^application/zip} } keys(%assets)) {
|
|
# my $archive = Archive::Any->new( "$showdir/$archives[0]" );
|
|
# @archived = $archive->files;
|
|
# $has_pictures += scalar(@archived);
|
|
# }
|
|
|
|
# print "D> Pictures found\n" if $has_pictures;
|
|
# if ($has_pictures && @archived && $DEBUG > 2) {
|
|
# if ($has_pictures && $DEBUG > 2) {
|
|
# _debug (1, '@archives: ' . Dumper(\@archives));
|
|
# _debug (1, '@archived: ' . Dumper(\@archived));
|
|
# }
|
|
|
|
if ( $has_pictures && $DEBUG > 2 ) {
|
|
_debug( 1, '@pictures: ' . Dumper( \@pictures ) );
|
|
}
|
|
|
|
#
|
|
# Look for files not matching audio or image
|
|
#
|
|
$has_extra = grep { $assets{$_}->{type} !~ /^(image|audio|video)/ }
|
|
keys(%assets);
|
|
|
|
_debug( $DEBUG > 2, '%assets: ' . Dumper( \%assets ) ) if (%assets);
|
|
|
|
$log->info( $showno, "Media files: $media_files" ) if ($media_files);
|
|
$log->info( $showno, "Pictures: " . join( ', ', @pictures ) )
|
|
if (@pictures);
|
|
$log->info( $showno, "Markup " . join( ', ', @markup ) )
|
|
if (@markup);
|
|
$log->info( $showno, "Assets: " . join( ', ', @assets ) )
|
|
if (@assets);
|
|
|
|
}
|
|
}
|
|
else {
|
|
#
|
|
# No media files at all!
|
|
#
|
|
$has_archives = $has_pictures = $has_audio = $has_extra = 0;
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Sanity checks
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# 1. Check for included media or a download URL. Alert if neither
|
|
#
|
|
if ($media_files) {
|
|
#
|
|
# Have media, but do we have audio?
|
|
#
|
|
# printf STDERR $ofmt, "Media files:", $media_files;
|
|
printf STDERR "%s\n",
|
|
textFormat( $media_files, 'Media files:', 'L', 18, 80 );
|
|
unless ($has_audio) {
|
|
alert( $ofmt, "No audio!" );
|
|
$log->error( $showno, "No audio!" );
|
|
}
|
|
}
|
|
elsif ( length( $content->{metadata}{url} ) > 0 ) {
|
|
#
|
|
# No media, but do we have a download URL?
|
|
#
|
|
printf STDERR $ofmt, "Download from:", $content->{metadata}{url};
|
|
}
|
|
else {
|
|
#
|
|
# No media and no download URL, so no audio
|
|
#
|
|
alert( $ofmt, "No audio!" );
|
|
$log->error( $showno, "No audio!" );
|
|
}
|
|
|
|
#
|
|
# 2. Alert if no tags
|
|
#
|
|
if ( length( $content->{episode}{Tags} ) == 0 ) {
|
|
alert( $ofmt, "No tags!" );
|
|
$log->error( $showno, "No tags!" );
|
|
}
|
|
|
|
#
|
|
# 3. Declared html5 but no HTML found
|
|
#
|
|
if ( $snlevel == 0 && $content->{metadata}{Shownotes_Format} =~ /html5/i ) {
|
|
alert( $ofmt, "Declared format 'html5' with no HTML in the notes!" );
|
|
$log->error( $showno,
|
|
"Declared format 'html5' with no HTML in the notes!" );
|
|
# printf STDERR $ofmt, "",
|
|
# "${yellow}Forced format to 'plain_text'.${reset}";
|
|
printf STDERR colored(
|
|
sprintf( $ofmt, "", "Forced format to 'plain_text'." ),
|
|
'bold yellow' );
|
|
$content->{metadata}{Shownotes_Format} = 'plain_text';
|
|
# TODO: propagate the format change into the JSON
|
|
}
|
|
|
|
#
|
|
# 4. Tags aren't CSV. Not much of a check since so many formats are acceptable
|
|
# here but not in the database.
|
|
# If there's only one tag write an alert reporting the length in case the
|
|
# sender forgot the commas.
|
|
# TODO: Reconsider after using for a while. May be no point in doing this.
|
|
#
|
|
if ( length( $content->{episode}{Tags} ) > 0 ) {
|
|
my $csv = Text::CSV->new( { sep_char => ',', allow_whitespace => 1 } )
|
|
or warn "Problem with Text::CSV: $@";
|
|
my $status = $csv->parse( $content->{episode}{'Tags'} );
|
|
unless ($status) {
|
|
alert( $ofmt, $csv->error_input() . "\n" . $csv->error_diag() );
|
|
$log->error( $showno, "Tag format is invalid" );
|
|
}
|
|
my @cols = $csv->fields();
|
|
if ( scalar(@cols) eq 1 ) {
|
|
alert( $ofmt,
|
|
"Only one tag " . length( $cols[0] ) . " characters long!" );
|
|
}
|
|
}
|
|
|
|
#
|
|
# 5. Declared HTML, but does it have assets and if so are they properly
|
|
# pointed to?
|
|
#
|
|
if ( $snlevel > 0 && $content->{metadata}{Shownotes_Format} =~ /html5/i ) {
|
|
#
|
|
# Perform a check on the notes in memory, looking for asset links and
|
|
# checking they are kosher
|
|
#
|
|
unless ( check_html( $content->{episode}{Show_Notes}, $base_url ) ) {
|
|
print colored(
|
|
"Apparently incorrect URLs detected in the notes", 'red'
|
|
), "\n";
|
|
}
|
|
}
|
|
|
|
#
|
|
# 6. Declared plain text, but analysis recommends HTML, so highlight this to
|
|
# allow changes to be made.
|
|
#
|
|
#if ( $snlevel > 5 && $content->{metadata}{Shownotes_Format} =~ /plain_text/i ) {
|
|
if ( $markup_choice eq 'html5'
|
|
&& $content->{metadata}{Shownotes_Format} =~ /plain_text/i )
|
|
{
|
|
alert( $ofmt,
|
|
"Declared format 'plain_text' but notes seem to be HTML5!" );
|
|
$log->error( $showno,
|
|
"Declared format 'plain_text' but notes seem to be HTML5!" );
|
|
}
|
|
|
|
#
|
|
# 7. The host has sent in markup version(s) of their external notes, so we
|
|
# need to take special action.
|
|
#
|
|
if ($markup) {
|
|
printf STDERR "%s\n",
|
|
textFormat( $markup, 'Markup files:', 'L', 18, 80 );
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Determine the picture asset state
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# If there are pictures and the format is plain text then the state is 'Pictures
|
|
# that need management' (1), otherwise 'Managed pictures' (2). If no pictures
|
|
# then state is 'No pictures found' (0).
|
|
#
|
|
if ($has_pictures) {
|
|
$astate
|
|
= ( $content->{metadata}{Shownotes_Format} eq 'plain_text' ? 1 : 2 );
|
|
# printf STDERR $ofmt, "Pictures:", $pictures;
|
|
printf STDERR "%s\n", textFormat( $pictures, 'Pictures:', 'L', 18, 80 );
|
|
}
|
|
else {
|
|
$astate = 0;
|
|
}
|
|
printf STDERR $ofmt, "Picture state:",
|
|
colour_if( $astate > 0, 'bold yellow on_magenta', $pstates[$astate] );
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Flag other assets so they aren't missed
|
|
#-------------------------------------------------------------------------------
|
|
if ($has_extra) {
|
|
printf STDERR $ofmt, "Asset state:",
|
|
colored( ['bold yellow on_magenta'], 'Other files uploaded' );
|
|
}
|
|
|
|
if (@assets) {
|
|
printf STDERR $ofmt, "Archives: ", join( ', ', @archives )
|
|
if $has_archives;
|
|
# printf STDERR $ofmt, "All assets:", join( ', ', @assets );
|
|
printf STDERR "%s\n",
|
|
textFormat( join( ', ', @assets ), 'All assets:', 'L', 18, 80 );
|
|
if ( $sanitised > 0 ) {
|
|
printf STDERR $ofmt, 'Assets sanitised:', "$sanitised";
|
|
}
|
|
}
|
|
|
|
#
|
|
# End of content report
|
|
#
|
|
print STDERR '-' x 80, "\n";
|
|
|
|
#
|
|
# Log choices and problems
|
|
#
|
|
$log->info( $showno,
|
|
"Declared format: " . $content->{metadata}{Shownotes_Format} );
|
|
$log->info( $showno, "Recommended markup processing: $markup_choice" );
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Output to relevant files
|
|
#-------------------------------------------------------------------------------
|
|
#
|
|
# Write the declared format to a file if requested
|
|
#
|
|
if ( defined($formatfile) ) {
|
|
open( my $fmtfile, '>:encoding(UTF-8)', $formatfile )
|
|
or die "Unable to open output file $formatfile $!\n";
|
|
print $fmtfile $content->{metadata}{Shownotes_Format}, "\n";
|
|
close($fmtfile);
|
|
print colored( "Format details written to $formatfile", 'green' ), "\n"
|
|
unless $silent;
|
|
$log->info( $showno, "Format details written to $formatfile" );
|
|
}
|
|
|
|
#
|
|
# Write the release date to a file if requested
|
|
#
|
|
if ( defined($releasefile) ) {
|
|
open( my $relfile, '>:encoding(UTF-8)', $releasefile )
|
|
or die "Unable to open output file $releasefile $!\n";
|
|
print $relfile $content->{metadata}{Episode_Date}, "\n";
|
|
close($relfile);
|
|
print colored( "Release date written to $releasefile", 'green' ), "\n"
|
|
unless $silent;
|
|
$log->info( $showno, "Release date written to $releasefile" );
|
|
}
|
|
|
|
#
|
|
# Write the picture file names to a file if requested and there are some
|
|
#
|
|
if ( @pictures && defined($picturefile) ) {
|
|
open( my $picfile, '>:encoding(UTF-8)', $picturefile )
|
|
or die "Unable to open output file $picturefile $!\n";
|
|
|
|
#
|
|
# Sort the pictures alphabetically in case the host didn't order them
|
|
# sensibly.
|
|
#
|
|
foreach my $picture (sort(@pictures)) {
|
|
print $picfile "$picture\n";
|
|
}
|
|
close($picfile);
|
|
|
|
print colored( "Picture names written to $picturefile", 'green' ), "\n"
|
|
unless $silent;
|
|
$log->info( $showno, "Picture names written to $picturefile" );
|
|
}
|
|
|
|
#
|
|
# Write the asset file names to a file if requested and there are some
|
|
#
|
|
if ( @assets && defined($assetfile) ) {
|
|
open( my $assfile, '>:encoding(UTF-8)', $assetfile )
|
|
or die "Unable to open output file $assetfile $!\n";
|
|
foreach my $asset (@assets) {
|
|
print $assfile "$asset\n";
|
|
}
|
|
close($assfile);
|
|
print colored( "Asset names written to $assetfile", 'green' ), "\n"
|
|
unless $silent;
|
|
$log->info( $showno, "Asset names written to $assetfile" );
|
|
}
|
|
|
|
#-------------------------------------------------------------------------------
|
|
# Write the shownotes component to a file if requested
|
|
#-------------------------------------------------------------------------------
|
|
if ($shownotes) {
|
|
#
|
|
# Remove carriage returns
|
|
#
|
|
$content->{episode}{Show_Notes} =~ s/\r//g;
|
|
_debug( $DEBUG > 2, quote_crlf( $content->{episode}{Show_Notes} ) );
|
|
|
|
#
|
|
# Save the notes
|
|
#
|
|
open( my $out, '>:encoding(UTF-8)', $shownotes )
|
|
or die "Unable to open output file $shownotes: $!\n";
|
|
print $out $content->{episode}{Show_Notes}, "\n";
|
|
close($out);
|
|
print colored( "Shownotes written to $shownotes", 'green' ), "\n"
|
|
unless $silent;
|
|
$log->info( $showno, "Shownotes written to $shownotes" );
|
|
|
|
#
|
|
# We're generating shownotes and we have declared HTML (or we're pretty
|
|
# sure after parsing that they are HTML), so save the notes as HTML as
|
|
# well. Now achieved simply by symlinking the two files if they are
|
|
# different. This allows the 'do_edit` script to edit the *.out file and
|
|
# thereby affect the *.html file.
|
|
#
|
|
# 2018-09-13: Removed the check for $markup_choice because it's adding
|
|
# links inappropriately.
|
|
#
|
|
# 2020-05-31: Turned the symlink to a hard link
|
|
#
|
|
if ( $content->{metadata}{Shownotes_Format} =~ /html5/i ) {
|
|
( my $htmlfile = $shownotes ) =~ s/\.[^.]+$/.html/;
|
|
|
|
unless ( $htmlfile eq $shownotes ) {
|
|
if ( link( $shownotes, $htmlfile ) ) {
|
|
print colored(
|
|
"HTML notes - linked $htmlfile to $shownotes", 'green'
|
|
),
|
|
"\n";
|
|
$log->info( $showno,
|
|
"HTML notes - linked $htmlfile to $shownotes" );
|
|
}
|
|
else {
|
|
print colored(
|
|
"**Warning** Failed to link $htmlfile to $shownotes",
|
|
'red'
|
|
),
|
|
"\n";
|
|
print colored( "Equivalent CLI command:", 'yellow' ),
|
|
"\n",
|
|
colored( "ln $shownotes $htmlfile", 'yellow' ), "\n";
|
|
$log->warning( $showno,
|
|
"Failed to link $htmlfile to $shownotes" );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
print colored( "No shownotes written", 'green' ), "\n" unless $silent;
|
|
$log->info( $showno, "No shownotes written" );
|
|
}
|
|
|
|
#
|
|
# Save the status
|
|
#
|
|
update_status($statusfile,'parsed');
|
|
|
|
exit;
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: find_archive
|
|
# PURPOSE: Look though the assets for files that are some kind of archive
|
|
# PARAMETERS: $assets hashref pointing to the asset filename list
|
|
# (contains size and type information)
|
|
# $archfiles arrayref to hold the archive files found
|
|
# RETURNS: Number of archive files found
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: Uses 'any' from 'List::MoreUtils'
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub find_archive {
|
|
my ( $assets, $archfiles ) = @_;
|
|
|
|
my ( $count, $type );
|
|
|
|
#
|
|
# Return if no assets given
|
|
#
|
|
return 0 unless %$assets;
|
|
|
|
#
|
|
# Known archive types (from what's been seen so far). Declared as
|
|
# a 'state' variable so it's only initialised once.
|
|
#
|
|
state @mime_types = (
|
|
#{{{
|
|
'application/x-tar',
|
|
'application/x-bzip-compressed-tar',
|
|
'application/x-compressed-tar',
|
|
'application/x-gtar',
|
|
'application/zip',
|
|
'application/x-zip-compressed',
|
|
'application/x-bzip2',
|
|
'application/gzip',
|
|
#}}}
|
|
);
|
|
|
|
$count = 0;
|
|
foreach my $file ( keys(%$assets) ) {
|
|
$type = $assets->{$file}->{type};
|
|
if ( any { $_ eq $type } @mime_types ) {
|
|
push( @$archfiles, $file );
|
|
$count++;
|
|
}
|
|
}
|
|
|
|
return $count;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: extract_archives
|
|
# PURPOSE: Given a list of archive files, extract their contents and save
|
|
# the file names
|
|
# PARAMETERS: $assets hashref containing the filename list and some
|
|
# attributes. Read and written.
|
|
# $archfiles arrayref holding the archive files found (by
|
|
# find_archive). Read only.
|
|
# $extracted arrayref to hold the files extracted from the
|
|
# archives.
|
|
# $showdir path to find the archive file(s)
|
|
# RETURNS: Number of files extracted. Alters %$assets to reflect the new
|
|
# files and remove the extracted archive files.
|
|
# DESCRIPTION: Just return with zero if there are no archives.
|
|
# Process the archives one by one. Open the current one with
|
|
# Archive::Any and save the file contents to return to the
|
|
# caller. Extract the archive contents to the 'uploads'
|
|
# directory in the directory for the show.
|
|
# Remove all the archive file references from %$assets. Add in
|
|
# the extracted files using MIME::Types to get the mime details
|
|
# and 'stat' to get the sizes. Keep a cound of the extracted
|
|
# files to return to the caller.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub extract_archives {
|
|
my ( $assets, $archfiles, $extracted, $showdir ) = @_;
|
|
|
|
my ( $aa, $mt, $typename, $filecount );
|
|
|
|
#
|
|
# Return if no archives given
|
|
#
|
|
return 0 unless @$archfiles;
|
|
|
|
#
|
|
# Loop through the archives we were given. There could be more than one,
|
|
# though it's likely to be rare.
|
|
#
|
|
foreach my $arch (@$archfiles) {
|
|
#print "D> Archive file: $arch\n";
|
|
|
|
#
|
|
# Open the archive file
|
|
#
|
|
$aa = Archive::Any->new("$showdir/$arch");
|
|
|
|
#
|
|
# Get and store a list of the contents of the archive
|
|
#
|
|
push( @$extracted, $aa->files );
|
|
|
|
#
|
|
# Extract everything to the show directory
|
|
#
|
|
#print "About to extract to $assetdir\n";
|
|
$aa->extract($showdir);
|
|
}
|
|
|
|
#print "D> Extracted files: ",join("\nD> ", @$extracted),"\n";
|
|
|
|
#
|
|
# Now update the %$assets hash, first deleting the archive files we have
|
|
# extracted the contents of.
|
|
#
|
|
foreach my $arch (@$archfiles) {
|
|
delete( $assets->{$arch} );
|
|
}
|
|
# _debug(1,'%$assets after deletions: ' . Dumper($assets));
|
|
|
|
#
|
|
# Now add in the extracted files with their details
|
|
#
|
|
foreach my $asset (@$extracted) {
|
|
#
|
|
# Rationalise the filename
|
|
#
|
|
$asset = canonpath($asset);
|
|
|
|
#
|
|
# Get the MIME type of the file (stringifying the result of mimeTypeOf
|
|
# otherwise it's a MIME::Type object)
|
|
#
|
|
$mt = MIME::Types->new;
|
|
$typename = "" . $mt->mimeTypeOf("$showdir/$asset");
|
|
$assets->{$asset} = {
|
|
type => $typename,
|
|
size => ( stat("$showdir/$asset") )[7],
|
|
};
|
|
|
|
$filecount++;
|
|
}
|
|
# _debug(1,'%$assets after additions: ' . Dumper($assets));
|
|
|
|
return $filecount;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: array_difference
|
|
# PURPOSE: Determine the members of the left array not in the right array
|
|
# PARAMETERS: $left arrayref to the left-hand array
|
|
# $right arrayref to the right-hand array
|
|
# RETURNS: The difference between the two arrays (@$left - @$right in
|
|
# concept). All the elements that are in @$left but not in
|
|
# @$right are returned.
|
|
# DESCRIPTION: Loads a hash with keys from the $left array. This will remove
|
|
# duplicates as a by-product. Using the $right array to provide
|
|
# keys we delete all hash elements that match. This leaves the
|
|
# hash containing the difference between the two arrays by
|
|
# "subtracting" the right from the left. Another consequence of
|
|
# the method used is that the resulting list is not in the same
|
|
# order as the $left array.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub array_difference {
|
|
my ( $left, $right ) = @_;
|
|
|
|
my %files;
|
|
|
|
unless (ref($left) eq 'ARRAY' && ref($right) eq 'ARRAY') {
|
|
warn "Arguments to array_difference must both be arrayrefs\n";
|
|
return;
|
|
}
|
|
|
|
@files{@$left} = (); # All files are the keys.
|
|
delete( @files{@$right} ); # Remove the links.
|
|
|
|
return keys(%files);
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: prepare_assets
|
|
# PURPOSE: Tidy assets, removing directories if necessary, and place the
|
|
# files in the upload directory
|
|
# PARAMETERS: $assets_h hashref containing asset details
|
|
# $assets_a arrayref containing asset names
|
|
# $showdir string containing the show directory
|
|
# $assetdir string containing the asset upload directory
|
|
# path
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub prepare_assets {
|
|
my ( $assets_h, $assets_a, $showdir, $assetdir ) = @_;
|
|
|
|
my ( $file, $dir, $base, @dels, %dirs, %moves );
|
|
|
|
#
|
|
# Create the asset directory
|
|
#
|
|
unless ( -e $assetdir ) {
|
|
mkdir($assetdir) or die "Failed to create $assetdir\n";
|
|
}
|
|
|
|
#
|
|
# Remove directory stuff from @assets elements and %assets keys saving
|
|
# directories, and moving files as we go.
|
|
#
|
|
# foreach my $file (@$assets_a) {
|
|
for ( my $i = 0; $i <= $#{$assets_a}; $i++ ) {
|
|
#
|
|
# Get and parse an asset filename (using Path::Class). Force the
|
|
# directory and base part to become strings ("stringify")
|
|
#
|
|
$file = file( $assets_a->[$i] );
|
|
$dir = "" . $file->dir;
|
|
$base = "" . $file->basename;
|
|
|
|
#
|
|
# Does the asset have a directory part and a basename part? If so we
|
|
# want to move the file to the asset directory and eventually delete
|
|
# the directory the asset came from.
|
|
#
|
|
# if ( ( $dir, $base ) = ( $file =~ qr{^(.*/)(.+)$} ) ) {
|
|
if ( $dir !~ /^\.?$/ && $base ne '' ) {
|
|
#
|
|
# The asset filename starts with a directory.
|
|
# Save it.
|
|
#
|
|
$dirs{$dir} = 1 unless exists( $dirs{$dir} );
|
|
# push( @dirs, $dir );
|
|
|
|
#
|
|
# Update the hash key
|
|
#
|
|
$assets_h->{$base} = delete( $assets_h->{$assets_a->[$i]} );
|
|
|
|
#
|
|
# Save the move
|
|
#
|
|
$moves{"$showdir/$file"} = "$assetdir/$base";
|
|
|
|
#
|
|
# Alter the asset name in @assets
|
|
#
|
|
# $file = $base;
|
|
$assets_a->[$i] = $base;
|
|
}
|
|
elsif ( $base eq '' ) {
|
|
#
|
|
# We just have a directory, no basename, so queue to delete it
|
|
#
|
|
push(@dels,$i);
|
|
}
|
|
else {
|
|
#
|
|
# No directory (or just '.'), just a basename, so queue the move
|
|
#
|
|
$moves{"$showdir/$base"} = "$assetdir/$base";
|
|
}
|
|
|
|
}
|
|
|
|
#
|
|
# Perform any moves (using File::Copy)
|
|
#
|
|
foreach my $move (keys(%moves)) {
|
|
# TODO: Is the 'unless' needed?
|
|
move ($move,$moves{$move}) unless ($move =~ qr{/$});
|
|
}
|
|
|
|
#
|
|
# Clean out unwanted directories from @$assets_a
|
|
#
|
|
foreach my $del (@dels) {
|
|
splice(@$assets_a,$del,1);
|
|
}
|
|
|
|
#
|
|
#
|
|
# Remove any real directories created by unpacking archives
|
|
#
|
|
foreach my $d (keys(%dirs)) {
|
|
rmdir("$showdir/$d") or
|
|
warn "Problem deleting unwanted directory: $d\n";
|
|
}
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: sanitise_filenames
|
|
# PURPOSE: Deal with filenames with spaces to make life easier
|
|
# PARAMETERS: $assets_h hashref containing asset details
|
|
# $assets_a arrayref containing asset names
|
|
# $assetdir string containing the asset upload directory
|
|
# path
|
|
# RETURNS: Number of fixes
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub sanitise_filenames {
|
|
my ( $assets_h, $assets_a, $assetdir ) = @_;
|
|
|
|
my ( $file, $newfile, %moves, $changes );
|
|
|
|
#
|
|
# Look at all asset filenames
|
|
#
|
|
for ( my $i = 0; $i <= $#{$assets_a}; $i++ ) {
|
|
$file = $assets_a->[$i];
|
|
|
|
#
|
|
# Does the name have whitespace?
|
|
#
|
|
if ($file =~ /\s/) {
|
|
#
|
|
# Sanitise and save the change
|
|
#
|
|
($newfile = $file) =~ s/ /_/g;
|
|
|
|
#
|
|
# Adjust asset array and hash
|
|
#
|
|
$assets_a->[$i] = $newfile;
|
|
$assets_h->{$newfile} = delete( $assets_h->{$file} );
|
|
|
|
#
|
|
# Save the file move
|
|
#
|
|
$moves{"$assetdir/$file"} = "$assetdir/$newfile";
|
|
}
|
|
}
|
|
|
|
if (%moves) {
|
|
#
|
|
# Perform any moves (using File::Copy)
|
|
#
|
|
$changes = 0;
|
|
foreach my $move (keys(%moves)) {
|
|
move ($move,$moves{$move});
|
|
$changes++;
|
|
}
|
|
|
|
return $changes;
|
|
}
|
|
else {
|
|
return 0;
|
|
}
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: choose_markup
|
|
# PURPOSE: Given a list of possible markup types, choose which one to
|
|
# recommend
|
|
# PARAMETERS: $markup_found Arrayref containing markup names
|
|
# RETURNS: The markup name to use
|
|
# DESCRIPTION: The array referenced by $markup_found should have one or more
|
|
# instances of a markup type. We want to decide which one to
|
|
# recommend for processing.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub choose_markup {
|
|
my ($markup_found) = @_;
|
|
|
|
my %freq;
|
|
my @res;
|
|
|
|
return 'none' unless $markup_found && scalar(@$markup_found) > 0;
|
|
return 'none' if $markup_found->[0] eq 'none';
|
|
|
|
#
|
|
# Collect frequencies of each markup type
|
|
#
|
|
foreach my $mu (@$markup_found) {
|
|
if ( exists( $freq{$mu} ) ) {
|
|
$freq{$mu}++;
|
|
}
|
|
else {
|
|
$freq{$mu} = 1;
|
|
}
|
|
}
|
|
|
|
#
|
|
# If there's only one key then that's the recommendation
|
|
#
|
|
return ( keys(%freq) )[0] if scalar( keys(%freq) ) == 1;
|
|
|
|
#
|
|
# Find the highest frequency
|
|
#
|
|
my $max = 0;
|
|
foreach my $key ( keys(%freq) ) {
|
|
if ( $freq{$key} > $max ) {
|
|
$max = $freq{$key};
|
|
}
|
|
}
|
|
|
|
#
|
|
# Collect all the keys with the maximum frequency
|
|
#
|
|
foreach my $key ( keys(%freq) ) {
|
|
if ( $freq{$key} == $max ) {
|
|
push( @res, $key );
|
|
}
|
|
}
|
|
|
|
#
|
|
# Return the matching markup types
|
|
#
|
|
return join( ' or ', @res );
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: summarise_markup
|
|
# PURPOSE: Summarise the list of matches to markup types
|
|
# PARAMETERS: $markup_found Arrayref containing markup names
|
|
# RETURNS: A string containing markup types and their frequencies
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub summarise_markup {
|
|
my ($markup_found) = @_;
|
|
|
|
my %freq;
|
|
my @res;
|
|
|
|
return 'none' unless $markup_found && scalar(@$markup_found) > 0;
|
|
return 'none' if $markup_found->[0] eq 'none';
|
|
|
|
#
|
|
# Collect frequencies of each markup type
|
|
#
|
|
foreach my $mu (@$markup_found) {
|
|
if ( exists( $freq{$mu} ) ) {
|
|
$freq{$mu}++;
|
|
}
|
|
else {
|
|
$freq{$mu} = 1;
|
|
}
|
|
}
|
|
|
|
foreach my $key ( sort( keys(%freq) ) ) {
|
|
push( @res, "$key($freq{$key})" );
|
|
}
|
|
|
|
return join( ", ", @res );
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: detect_markup
|
|
# PURPOSE: Tries to detect whether there is any markup in the string and
|
|
# if so what it is
|
|
# PARAMETERS: $string String to examine
|
|
# $markup_types Arrayref pointing to regular expression
|
|
# structures to be used in the detection process
|
|
# $ofmt Output format for 'printf'
|
|
# RETURNS: A string containing the markup type(s) or 'none'
|
|
# DESCRIPTION: Applies regular expressions to the string (notes) looking for
|
|
# markup signatures. Array @markup_types contains one hash per
|
|
# regex, along with an array of markdown types the regex relates
|
|
# to and a comment explaining what's being detected. The regex
|
|
# is matched against the string and a count of matches returned.
|
|
# If there are matches then an analysis is printed on STDERR.
|
|
# The variable MARKUP_DETECTED is a global variable accessed
|
|
# through 'our' which controls whether the analyses have
|
|
# a single header.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub detect_markup {
|
|
my ( $string, $markup_types, $ofmt ) = @_;
|
|
|
|
my ( @type, $count );
|
|
our $MARKUP_DETECTED;
|
|
|
|
return ['none'] unless $string;
|
|
|
|
foreach my $test (@$markup_types) {
|
|
if ( $count = () = ( $string =~ /$test->{regex}/g ) ) {
|
|
print STDERR "Markup check:\n" if ( $MARKUP_DETECTED++ == 0 );
|
|
printf STDERR $ofmt, "- Matched:",
|
|
$test->{regex} . " -> " . join( ",", @{ $test->{type} } );
|
|
printf STDERR $ofmt, " Comment:", $test->{comment};
|
|
printf STDERR $ofmt, " Matches:", $count;
|
|
push( @type, @{ $test->{type} } ) for 1 .. $count;
|
|
}
|
|
}
|
|
|
|
return ( @type ? \@type : ['none'] );
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: count_tags
|
|
# PURPOSE: Counts the number of start tags in what might be HTML
|
|
# PARAMETERS: $string String to examine
|
|
# RETURNS: An integer count of the number of start tags found
|
|
# DESCRIPTION: Uses HTML::Parser to parse the input string. A handler is
|
|
# defined which accumulates start and end tags in an array. The
|
|
# number of start tags found is returned to the caller.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub count_tags {
|
|
my ($string) = @_;
|
|
|
|
my @accum;
|
|
chomp($string);
|
|
return 0 unless $string;
|
|
|
|
my $p = HTML::Parser->new(
|
|
api_version => 3,
|
|
handlers => {
|
|
start => [ \@accum, "event,text" ],
|
|
end => [ \@accum, "event,text" ]
|
|
}
|
|
);
|
|
$p->parse($string);
|
|
$p->eof;
|
|
|
|
return scalar(@accum);
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: html_level
|
|
# PURPOSE: Returns the proportion of HTML in a string
|
|
# PARAMETERS: $string String to examine
|
|
# $tag_count Number of start & end tags
|
|
# RETURNS: A floating point number representing the (very) rough
|
|
# percentage of HTML tags in the string
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub html_level {
|
|
my ( $string, $tag_count ) = @_;
|
|
|
|
return '0.0' unless $string;
|
|
|
|
chomp($string);
|
|
my @words = split( /\s+/, $string );
|
|
my $wc = scalar(@words);
|
|
|
|
return sprintf( "%.2f", ( $tag_count / $wc ) * 100 );
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: check_html
|
|
# PURPOSE: Checks an HTML string to ensure HPR links are valid
|
|
# PARAMETERS: $html HTML string
|
|
# RETURNS: True if valid, otherwise false
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub check_html {
|
|
my ( $html, $base_url ) = @_;
|
|
|
|
my ( $parser, @links, $result, $etype, $aname, $avalue );
|
|
my ( $base_re, $show_re, $host_re, $anchor_re, $asset_re );
|
|
|
|
chomp($html);
|
|
return 1 unless $html;
|
|
|
|
$parser = HTML::LinkExtor->new( undef, $base_url );
|
|
$parser->parse($html);
|
|
@links = $parser->links;
|
|
|
|
# print Dumper( \@links ), "\n";
|
|
|
|
#
|
|
# Make some regular expressions. Note that the $base_url is expected to
|
|
# end with a '/'
|
|
#
|
|
$base_re = qr{^${base_url}};
|
|
$host_re = qr{${base_re}correspondents.php\?hostid=\d{1,4}$};
|
|
$show_re = qr{${base_re}eps.php\?id=\d{1,4}$};
|
|
$anchor_re = qr{${base_re}\#.+$};
|
|
$asset_re = qr{${base_re}eps/hpr\d{1,4}/.+$};
|
|
|
|
#
|
|
# Walk the collected links
|
|
#
|
|
$result = 0;
|
|
foreach my $element (@links) {
|
|
#
|
|
# Collect the parsed links
|
|
#
|
|
( $etype, $aname, $avalue ) = @$element;
|
|
|
|
#
|
|
# Only check HPR links
|
|
#
|
|
if ( $avalue =~ /$base_re/ ) {
|
|
|
|
#
|
|
# Not interested in correctly formed asset or show links
|
|
#
|
|
return 1
|
|
if ( $avalue =~ /$show_re|$host_re|$anchor_re|$asset_re/ );
|
|
|
|
#
|
|
# Something seems to be wrong with a link
|
|
#
|
|
if ( $etype eq 'a' ) {
|
|
$result++;
|
|
print "<A> tag $avalue\n" unless $silent;
|
|
}
|
|
elsif ( $etype eq 'img' ) {
|
|
$result++;
|
|
print "<A> tag $avalue\n" unless $silent;
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
return $result == 0;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: find_Unicode
|
|
# PURPOSE: Scans a text string for Unicode in case we want to remove it
|
|
# PARAMETERS: $string String to examine
|
|
# RETURNS: True (1) if Unicode is found, otherwise False (0)
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub find_Unicode {
|
|
my ($string) = @_;
|
|
|
|
return ( $string =~ /[^\x{00}-\x{7F}]/ );
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: trim
|
|
# PURPOSE: Trims leading and trailing spaces from a string
|
|
# PARAMETERS: string string to trim
|
|
# RETURNS: Trimmed string
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub trim {
|
|
my ($str) = @_;
|
|
|
|
$str =~ s/^\s+|\s+$//g;
|
|
return $str;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: output_file_name
|
|
# PURPOSE: Generate an output file name with three choices
|
|
# PARAMETERS: $optarg the argument to the option choosing the filename
|
|
# $showno the show number to add to certain name types
|
|
# $template a default 'sprintf' template for the name
|
|
# RETURNS: The name of the output file
|
|
# DESCRIPTION: If there's a defined output filename then there are three
|
|
# options: a null string, a plain filename and a substitution
|
|
# string with '%d' sequences. The null string means the user used
|
|
# '-option' without a value, so we want to generate a substitution
|
|
# string. A string with '%d' requires a check to ensure there's
|
|
# the right number, just one. The plain filename needs no more
|
|
# work.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub output_file_name {
|
|
my ( $optarg, $showno, $template ) = @_;
|
|
|
|
my ( $filename, $count );
|
|
|
|
#
|
|
# We shouldn't be called with a null option argument
|
|
#
|
|
return unless defined($optarg);
|
|
|
|
#
|
|
# Does the option have an argument?
|
|
#
|
|
if ( $optarg =~ /^$/ ) {
|
|
#
|
|
# No argument; use the show number from the -episode=N option
|
|
#
|
|
$filename = sprintf( $template, $showno );
|
|
}
|
|
elsif ( $optarg =~ /%d/ ) {
|
|
#
|
|
# There's an argument, does it have a '%d' in it?
|
|
#
|
|
$count = () = $optarg =~ /%d/g;
|
|
die "Invalid - too many '%d' sequences in '$optarg'\n"
|
|
if ( $count > 1 );
|
|
$filename = sprintf( $optarg, $showno );
|
|
}
|
|
else {
|
|
#
|
|
# It's a plain filename, just return it
|
|
#
|
|
$filename = $optarg;
|
|
}
|
|
|
|
return $filename;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: update_status
|
|
# PURPOSE: Updates the status file
|
|
# PARAMETERS: $sfile Name and path of the status file
|
|
# $state State to set, such as 'parsed'
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Appends the state string to the named file.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub update_status {
|
|
my ($sfile, $state) = @_;
|
|
|
|
open( my $sfh, '>>:encoding(UTF-8)', $sfile )
|
|
or die "Unable to open output file $sfile $!\n";
|
|
print $sfh "$state\n";
|
|
close($sfh);
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: spellcheck
|
|
# PURPOSE: Perform simple spell checks on strings
|
|
# PARAMETERS: $string Input string to check
|
|
# RETURNS: List containing any problem words or undef if none
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub spellcheck {
|
|
my ($string) = @_;
|
|
|
|
my @errors;
|
|
|
|
my $checker = Text::SpellChecker->new(
|
|
text => $string,
|
|
lang => 'en_GB.UTF-8'
|
|
);
|
|
|
|
while ( my $word = $checker->next_word ) {
|
|
push( @errors, $word );
|
|
}
|
|
|
|
return @errors;
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: textFormat
|
|
# PURPOSE: Formats a block of text in an indented, wrapped style with
|
|
# a label in the left column
|
|
# PARAMETERS: $text The text to be formatted, as a scalar string
|
|
# $tag The label to be added to the left of the top
|
|
# line
|
|
# $align Tag alignment, 'L' for left, otherwise right
|
|
# $lmargin Width of the left margin (assumed to be big
|
|
# enough for the tag)
|
|
# $textwidth The width of all of the text plus left margin
|
|
# (i.e. the right margin)
|
|
# RETURNS: The formatted result as a string
|
|
# DESCRIPTION: Chops the incoming text into words (thereby removing any
|
|
# formatting). Removes any leading spaces. Loops through the
|
|
# wordlist building them into lines of the right length to fit
|
|
# between the left and right margins. Saves the lines in an
|
|
# array. Adds the tag to the first line with the alignment
|
|
# requested then returns the array joined into a string.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: Inspired by Text::Format but *much* simpler. In fact T::F is
|
|
# a nasty thing to have to use; I couldn't get it to do what
|
|
# this routine does.
|
|
# TODO Make the routine more resilient to silly input values.
|
|
# SEE ALSO:
|
|
#===============================================================================
|
|
sub textFormat {
|
|
my ( $text, $tag, $align, $lmargin, $textwidth ) = @_;
|
|
|
|
my ( $width, $word );
|
|
my ( @words, @buff, @wrap );
|
|
|
|
#
|
|
# Build the tag early. If there's no text we'll just return the tag.
|
|
#
|
|
$tag = sprintf( "%*s",
|
|
( $align =~ /L/i ? ( $lmargin - 1 ) * -1 : $lmargin - 1 ), $tag );
|
|
|
|
return $tag unless $text;
|
|
|
|
$text =~ s/(^\s+|\s+$)//g;
|
|
return $tag unless $text;
|
|
|
|
#
|
|
# Chop up the incoming text removing leading spaces
|
|
#
|
|
@words = split( /\s+/, $text );
|
|
shift(@words) if ( @words && $words[0] eq '' );
|
|
|
|
#
|
|
# Compute the width of the active text
|
|
#
|
|
$width = $textwidth - $lmargin;
|
|
|
|
#
|
|
# Format the words into lines with a blank left margin
|
|
#
|
|
while ( defined( $word = shift(@words) ) ) {
|
|
if ( length( join( ' ', @buff, $word ) ) < $width ) {
|
|
push( @buff, $word );
|
|
}
|
|
else {
|
|
push( @wrap, ' ' x $lmargin . join( ' ', @buff ) );
|
|
@buff = ($word);
|
|
}
|
|
}
|
|
|
|
#
|
|
# Append any remainder
|
|
#
|
|
push( @wrap, ' ' x $lmargin . join( ' ', @buff ) ) if @buff;
|
|
|
|
#
|
|
# Insert the tag into the first line
|
|
#
|
|
substr( $wrap[0], 0, $lmargin - 1 ) = $tag;
|
|
|
|
#
|
|
# Return the formatted array as a string
|
|
#
|
|
return join( "\n", @wrap );
|
|
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: colour_if
|
|
# PURPOSE: Colours a string if a condition is true
|
|
# PARAMETERS: $test criterion to determine colouring
|
|
# $colour colour code to use
|
|
# $string string to colour
|
|
# RETURNS: The (non-)coloured string
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub colour_if {
|
|
my ( $test, $colour, $string ) = @_;
|
|
|
|
if ($test) {
|
|
return colored( $string, $colour );
|
|
}
|
|
else {
|
|
return $string;
|
|
}
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: colour_switch
|
|
# PURPOSE: Chooses colours depending on multiple tests
|
|
# PARAMETERS: $tests arrayref containing a list of tests
|
|
# $colours arrayref containing a list of colour codes
|
|
# $string string to colour
|
|
# RETURNS: The string with the desired colour (or none)
|
|
# DESCRIPTION: The first test in @$tests which returns True causes the
|
|
# corresponding colour to be chosen (unless it's missing)
|
|
# otherwise, if no tests are True or the colour is missing,
|
|
# there's no colour applied.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub colour_switch {
|
|
my ( $tests, $colours, $string ) = @_;
|
|
|
|
for ( my $i = 0; $i <= $#{$tests}; $i++ ) {
|
|
if ( $tests->[$i] ) {
|
|
if ( exists( $colours->[$i] ) ) {
|
|
return colored( $string, $colours->[$i] );
|
|
}
|
|
else {
|
|
return $string;
|
|
}
|
|
}
|
|
}
|
|
|
|
return $string;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: alert
|
|
# PURPOSE: Print an alert message using a format and colours
|
|
# PARAMETERS: $fmt the format to use. The first field is where
|
|
# the string 'ALERT' goes
|
|
# $message the alert message
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub alert {
|
|
my ( $fmt, $message ) = @_;
|
|
|
|
$fmt = "%-16s %s\n" unless $fmt;
|
|
print STDERR colored( sprintf( $fmt, "** ALERT **:", $message ),
|
|
'bold red' );
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: coalesce
|
|
# PURPOSE: To find the first defined argument and return it
|
|
# PARAMETERS: Arbitrary number of arguments
|
|
# RETURNS: The first defined argument or undef if there are none
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub coalesce {
|
|
foreach (@_) {
|
|
return $_ if defined($_);
|
|
}
|
|
return undef; ## no critic
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: quote_crlf
|
|
# PURPOSE: Display TAB (\t) and CRLF (\r\n) sequences for debugging
|
|
# PARAMETERS: $string String to process
|
|
# RETURNS: The string with TAB, CR and LF characters made visible for
|
|
# printing
|
|
# DESCRIPTION:
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub quote_crlf {
|
|
my ($string) = @_;
|
|
|
|
my $quoted = $string;
|
|
$quoted =~ s/\t/\\t/g;
|
|
$quoted =~ s/\n/\\n/g;
|
|
$quoted =~ s/\r/\\r/g;
|
|
|
|
return "$quoted\n";
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: _debug
|
|
# PURPOSE: Prints debug reports
|
|
# PARAMETERS: $active Boolean: 1 for print, 0 for no print
|
|
# $message Message to print
|
|
# RETURNS: Nothing
|
|
# DESCRIPTION: Outputs a message if $active is true. It removes any trailing
|
|
# newline and then adds one in the 'print' to the caller doesn't
|
|
# have to bother. Prepends the message with 'D> ' to show it's
|
|
# a debug message.
|
|
# THROWS: No exceptions
|
|
# COMMENTS: None
|
|
# SEE ALSO: N/A
|
|
#===============================================================================
|
|
sub _debug {
|
|
my ( $active, $message ) = @_;
|
|
|
|
chomp($message);
|
|
print STDERR "D> $message\n" if $active;
|
|
}
|
|
|
|
#=== FUNCTION ================================================================
|
|
# NAME: Options
|
|
# PURPOSE: Processes command-line options
|
|
# PARAMETERS: $optref Hash reference to hold the options
|
|
# RETURNS: Undef
|
|
# DESCRIPTION:
|
|
# THROWS: no exceptions
|
|
# COMMENTS: none
|
|
# SEE ALSO: n/a
|
|
#===============================================================================
|
|
sub Options {
|
|
my ($optref) = @_;
|
|
|
|
my @options = (
|
|
"help", "documentation|manpage",
|
|
"debug=i", "silent!",
|
|
"episode=i", "infile=s",
|
|
"shownotes:s", "format=s",
|
|
"release=s", "assets=s",
|
|
"pictures=s", "zip=s",
|
|
"test!",
|
|
);
|
|
|
|
if ( !GetOptions( $optref, @options ) ) {
|
|
pod2usage(
|
|
-msg => "$PROG version $VERSION\n",
|
|
-verbose => 0,
|
|
-exitval => 1
|
|
);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
__END__
|
|
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
# Application Documentation
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
#{{{
|
|
|
|
=head1 NAME
|
|
|
|
parse_JSON - parse the JSON output file from the HPR show submission form
|
|
|
|
=head1 VERSION
|
|
|
|
This documentation refers to parse_JSON version 0.0.17
|
|
|
|
|
|
=head1 USAGE
|
|
|
|
./parse_JSON [-help] [-documentation|manpage] -episode=N -infile=FILE
|
|
[-shownotes[=FILE]] [-format=FILE] [-release=FILE] [-assets=FILE]
|
|
[-pictures=FILE] [-zip=FILE] [-debug=N] [-[no]silent]
|
|
|
|
|
|
./parse_JSON -episode=2384 -infile=shownotes/hpr2384/shownotes.json
|
|
./parse_JSON -episode=2379 -infile=shownotes/hpr2379/shownotes.json -shownotes
|
|
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 8
|
|
|
|
=item B<-help>
|
|
|
|
Prints a brief help message describing the usage of the program, and then exits.
|
|
|
|
=item B<-manpage>
|
|
|
|
Prints the entire documentation for the script. Note that this mode should use
|
|
I<perldoc> but there seems to be a bug in this tool at the moment; it does not
|
|
render properly. Currently we are using a simpler type of formatter.
|
|
|
|
=item B<-episode=N>
|
|
|
|
This mandatory option specifies the number of the HPR episode that is being
|
|
parsed.
|
|
|
|
=item B<-infile=FILE>
|
|
|
|
This mandatory option provides the file containing the JSON-format output from
|
|
the submission form on the HPR website. It is expected that the file has been
|
|
collected by external scripts and saved in a local file.
|
|
|
|
=item B<-shownotes[=FILE]>
|
|
|
|
This option, which may be omitted, defines the location where the parsed show
|
|
notes are to be written. If omitted no show notes are written, the script
|
|
simply parses the input file and reports on what it finds.
|
|
|
|
If the option is given as B<-shownotes=FILE> the notes are written to the
|
|
nominated file.
|
|
|
|
The B<FILE> portion may be given as a template string containing a I<%d>
|
|
sequence. This sequence is replaced by the show number provided by the
|
|
B<-episode=N> option.
|
|
|
|
If the B<=FILE> portion is omitted a default template of 'hpr%d.out' is used
|
|
and again the I<%d> sequence is replaced by the show number provided by the
|
|
B<-episode=N> option.
|
|
|
|
If the notes are declared as HTML5 then a symbolic link is made from the
|
|
parsed shownote file to a file where the extension is 'html'. This is to allow
|
|
the 'do_vim' script to edit the output file as in other cases but to ensure
|
|
that the file to be uploaded to the server is also edited.
|
|
|
|
=item B<-format=FILE>
|
|
|
|
This option, which may be omitted, causes the declared format to be written to
|
|
a nominated file. This is to allow other scripts to take action depending on
|
|
the format of the notes without having to determine this information
|
|
themselves.
|
|
|
|
If the option is omitted no format information is written.
|
|
|
|
=item B<-release=FILE>
|
|
|
|
This option, which may be omitted, causes the show's release date to be
|
|
written to a nominated file. This is to allow other scripts to take action
|
|
depending on the release date of the show without having to determine this
|
|
information themselves.
|
|
|
|
If the option is omitted no release date is written.
|
|
|
|
=item B<-assets=FILE>
|
|
|
|
This option, which may be omitted, causes a list of the show's assets to be
|
|
written to a nominated file. This is to allow other scripts to take action
|
|
depending on the assets of the show without having to determine this
|
|
information themselves.
|
|
|
|
Note that the assets do not include the main audio file.
|
|
|
|
If no assets exist or the option is omitted no asset list is written.
|
|
|
|
=item B<-pictures=FILE>
|
|
|
|
This option, which may be omitted, causes a list of any picture assets to be
|
|
written to a nominated file. This is to allow other scripts to take action
|
|
depending on pictures sent with the show without having to determine this
|
|
information themselves.
|
|
|
|
If no pictures exist or the option is omitted no picture list is written.
|
|
|
|
=item B<-zip=FILE>
|
|
|
|
This option, which may be omitted, causes assets to written to a Zip file
|
|
with the name given. This only happens if the assets were sent separately, and
|
|
not in an archive file of some kind. This is because the processing of some
|
|
assets (only pictures at the moment) can cause changes to be made to the
|
|
originals. This step can be taken to preserve the original files.
|
|
|
|
If no assets exist or an archive file already exists, or the option is omitted
|
|
no zip file is written.
|
|
|
|
=item B<-debug=N>
|
|
|
|
Causes certain debugging information to be displayed.
|
|
|
|
0 (the default) no debug output
|
|
1 N/A
|
|
2 reports lines being parsed from the input file
|
|
3 dumps two data structures:
|
|
- @content: an array containing lines from the input file
|
|
- %showdetails: a hash containing the parsed data
|
|
|
|
=item B<-[no]silent>
|
|
|
|
Controls the amount of information generated. If B<-silent> is selected the
|
|
output is reduced, otherwise B<-nosilent> is assumed or may be provided when
|
|
information about the generation of output files is reported.
|
|
|
|
=item B<-[no]test>
|
|
|
|
Used when running tests on the script. Controls the log output, adding 'T ' at
|
|
the start of the log record. If B<-test> is selected the
|
|
prefix is added, otherwise B<-notest> is assumed and no prefix is added.
|
|
|
|
=back
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This script is meant to be a component in a workflow dealing with the notes
|
|
submitted by HPR hosts with the shows they are uploading.
|
|
|
|
The script reads a JSON file generated by the show upload form on the HPR
|
|
website. It decodes this into a hash structure containing all of the elements
|
|
of the file.
|
|
|
|
Regardless of settings a check is made to ensure that the episode number
|
|
provided to the script matches that found in the input file.
|
|
|
|
An analysis of the show notes is carried out to determine how much HTML they
|
|
contain. The HTML level is determined as a percentage. If it is above 5% then
|
|
the notes are deemed to be HTML. If below 5% then an attempt is made to
|
|
determine what type of markup has been used.
|
|
|
|
An analysis of the host details is also made to determine the composition of
|
|
the text therein.
|
|
|
|
Reports are generated for all of these components. A recommendation of the
|
|
type of processing the notes will require is also made. This was intended to
|
|
assist with automatic processing, but has not proved to be reliable enough.
|
|
|
|
The author of the notes declares what format they are. This is not currently
|
|
compared with what the analysis reports, but this may be added in a later
|
|
version.
|
|
|
|
The notes are written to the nominated file (if there is one) for further
|
|
processing, database insertion, etc.
|
|
|
|
If the notes are declared to be HTML then a copy is written to a file with the
|
|
same name as the output file but with an 'html' suffix. This can be rendered
|
|
to see if the notes really are HTML and look good enough for uploading.
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
|
|
|
Error and warning messages generated by the script.
|
|
|
|
=over 4
|
|
|
|
=item B<Unable to access input file ...: ...>
|
|
|
|
Type: fatal
|
|
|
|
The script attempted to open the input file but failed. The message will
|
|
contain details of the failure.
|
|
|
|
=item B<Input file ... is empty>
|
|
|
|
Type: fatal
|
|
|
|
The input file exists but is empty
|
|
|
|
=item B<Invalid - too many '%d' sequences in '...'>
|
|
|
|
Type: fatal
|
|
|
|
The template provided to the B<-shownotes=FILE> option contained too many '%d'
|
|
sequences. Ensure that there is only one of these. See above for details.
|
|
|
|
=item B<The JSON doesn't have the right structure>
|
|
|
|
Type: fatal
|
|
|
|
After a few basic checks on the components of the JSON file it does not seem
|
|
to be valid. The script expects there at least to be top-level sections with
|
|
the keys: I<episode>, I<metadata> and I<host>.
|
|
|
|
=item B<The episode specified (...) doesn't match the one in the JSON (...)>
|
|
|
|
Type: fatal
|
|
|
|
The episode number in the B<-episode=N> option does not match the episode
|
|
number found in the JSON. Both numbers are reported in the error message.
|
|
|
|
=item B<Unable to open output file ...: ...>
|
|
|
|
Type: fatal
|
|
|
|
The script was unable to open one or both of the output files or the file of
|
|
format information. These will be named according to the B<-shownotes=FILE>
|
|
option with a suffix of '.out' or '.html' or specified by the B<-format=FILE>
|
|
option.
|
|
|
|
=back
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
Data::Dumper
|
|
Getopt::Long
|
|
HTML::Parser
|
|
JSON
|
|
Link::Extor
|
|
Log::Handler
|
|
Pod::Usage
|
|
Term::ANSIColor
|
|
Text::CSV
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
|
|
|
There are no known bugs in this module.
|
|
Please report problems to Dave Morriss (Dave.Morriss@gmail.com)
|
|
Patches are welcome.
|
|
|
|
=head1 AUTHOR
|
|
|
|
Dave Morriss (Dave.Morriss@gmail.com)
|
|
|
|
=head1 LICENCE AND COPYRIGHT
|
|
|
|
Copyright (c) 2021 Dave Morriss (Dave.Morriss@gmail.com). All rights reserved.
|
|
|
|
This module is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself. See perldoc perlartistic.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
=cut
|
|
|
|
#}}}
|
|
|
|
# [zo to open fold, zc to close]
|
|
|
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|
|
|