394 lines
12 KiB
Plaintext
394 lines
12 KiB
Plaintext
|
#!/usr/bin/env perl
|
||
|
#===============================================================================
|
||
|
#
|
||
|
# FILE: make_markdown
|
||
|
#
|
||
|
# USAGE: ./make_markdown [-[no]simple] [-[no]entities] [-[no]stamp] file
|
||
|
#
|
||
|
# DESCRIPTION: Turn plain text to Markdown. Designed to be used as a filter
|
||
|
# in vim.
|
||
|
# Finds all bare URLs, ignoring lines with HTML or Markdown
|
||
|
# links. It checks the path part of the URL to see if it's an
|
||
|
# image. If it is it uses an image link, otherwise it uses
|
||
|
# a plain link. Links are all to HPR standard where the text
|
||
|
# part and the URL part are the same.
|
||
|
# Unless the '-simple' option is present all HTML URLs are
|
||
|
# accumulated, and if the host hasn't provided a 'Links' section
|
||
|
# they are added to one. This happens even if the overall text
|
||
|
# is short, but it's easy to remove if not wanted.
|
||
|
# By default, in -nosimple mode an HTML comment with details of
|
||
|
# this script is added to the output. This can be turned off
|
||
|
# with the -nostamp option.
|
||
|
#
|
||
|
# OPTIONS: ---
|
||
|
# REQUIREMENTS: ---
|
||
|
# BUGS: ---
|
||
|
# NOTES: ---
|
||
|
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
|
||
|
# VERSION: 0.0.9
|
||
|
# CREATED: 2015-10-07 16:05:21
|
||
|
# REVISION: 2024-01-14 15:59:34
|
||
|
#
|
||
|
#===============================================================================
|
||
|
|
||
|
use 5.010;
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use utf8;
|
||
|
|
||
|
use version 0.77; # Planning to experiment with this new feature
|
||
|
|
||
|
use open ':encoding(UTF-8)'; # Make all IO UTF-8
|
||
|
|
||
|
use Getopt::Long;
|
||
|
use Regexp::Common qw{URI pattern};
|
||
|
use HTML::Entities;
|
||
|
#use Encoding::FixLatin qw(fix_latin);
|
||
|
use URI::Find;
|
||
|
use URI;
|
||
|
use List::MoreUtils qw{uniq};
|
||
|
|
||
|
#
|
||
|
# Version number (manually incremented)
|
||
|
#
|
||
|
our $VERSION = 'v0.0.9';
|
||
|
|
||
|
#
|
||
|
# Script and directory names
|
||
|
#
|
||
|
( my $PROG = $0 ) =~ s|.*/||mx;
|
||
|
( my $DIR = $0 ) =~ s|/?[^/]*$||mx;
|
||
|
$DIR = '.' unless $DIR;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Declarations
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Constants and other declarations
|
||
|
#
|
||
|
my $basedir = "$ENV{HOME}/HPR/Show_Submission";
|
||
|
|
||
|
my @urls;
|
||
|
|
||
|
my $have_links = 0;
|
||
|
my $uri_count;
|
||
|
my $new_url;
|
||
|
|
||
|
#
|
||
|
# Enable Unicode mode
|
||
|
#
|
||
|
#binmode STDIN, ":encoding(UTF-8)";
|
||
|
#binmode STDOUT, ":encoding(UTF-8)";
|
||
|
#binmode STDERR, ":encoding(UTF-8)";
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Patterns
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# 1. Look for a line beginning with 'Links:'
|
||
|
#
|
||
|
pattern
|
||
|
name => ['links'],
|
||
|
create => '^\s*Links:?\s*$',
|
||
|
;
|
||
|
|
||
|
#
|
||
|
# 2. Look for an HTML link (some people embed these in their notes)
|
||
|
#
|
||
|
pattern
|
||
|
name => ['html_link'],
|
||
|
create => '<a[^>]*>',
|
||
|
;
|
||
|
|
||
|
#
|
||
|
# 3. Look for existing Markdown links
|
||
|
#
|
||
|
pattern
|
||
|
name => ['md_link'],
|
||
|
create => '\[([^]]+)\]\(\1\)',
|
||
|
;
|
||
|
|
||
|
#
|
||
|
# 4. Look for mail addresses
|
||
|
#
|
||
|
pattern
|
||
|
name => ['mail_link'],
|
||
|
create => '(?i)[a-z0-9_+.](?:[a-z0-9_+.]+[a-z0-9_+])?\@[a-z0-9.-]+',
|
||
|
;
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Options and arguments
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Process options
|
||
|
#
|
||
|
my %options;
|
||
|
Options( \%options );
|
||
|
|
||
|
#
|
||
|
# Collect options
|
||
|
#
|
||
|
Usage() if ( $options{'help'} );
|
||
|
|
||
|
my $simple = ( defined( $options{simple} ) ? $options{simple} : 0 );
|
||
|
my $entities = ( defined( $options{entities} ) ? $options{entities} : 0 );
|
||
|
my $stamp = ( defined( $options{stamp} ) ? $options{stamp} : 0 );
|
||
|
|
||
|
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# Process the data on STDIN
|
||
|
#-------------------------------------------------------------------------------
|
||
|
#
|
||
|
# Set up the URI finder to call back to the 'cb_general' routine. We use
|
||
|
# a closure to allow the passing of the $simple variable
|
||
|
#
|
||
|
my $finder = URI::Find->new( sub { cb_general( shift, shift, $simple ) } );
|
||
|
|
||
|
#
|
||
|
# Slurp everything on STDIN
|
||
|
#
|
||
|
my @text = <>;
|
||
|
|
||
|
if ($stamp) {
|
||
|
printf "<!-- Processed by %s %s -->\n", $PROG, $VERSION unless $simple;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Process the stuff we got
|
||
|
#
|
||
|
foreach my $line (@text) {
|
||
|
chomp($line);
|
||
|
|
||
|
#
|
||
|
# Look for a user-provided 'Links' section
|
||
|
#
|
||
|
if ( $line =~ /($RE{links})/ ) {
|
||
|
$have_links = 1;
|
||
|
$line = "### $1";
|
||
|
print "$line\n";
|
||
|
next;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Check whether the line contains some HTML. If it does we don't want to
|
||
|
# touch it.
|
||
|
#
|
||
|
if ( $line =~ /$RE{html_link}/ ) {
|
||
|
print "$line\n";
|
||
|
next;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# An existing Markdown link means we'll skip the line
|
||
|
#
|
||
|
if ( $line =~ /$RE{md_link}/ ) {
|
||
|
print "$line\n";
|
||
|
next;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Special action for mail addresses.
|
||
|
#
|
||
|
# TODO: Loop through the line extracting each mail address and checking it
|
||
|
# with Email::Valid. If valid turn into a Markdown linkgg of the form
|
||
|
# <mailaddr>, otherwise ignore it.
|
||
|
#
|
||
|
if ( $line =~ /$RE{mail_link}/ ) {
|
||
|
$line =~ s/($RE{mail_link})/<$1>/g;
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Look for URLs of any sort and process them in the callback 'cb_general'
|
||
|
# defined earlier
|
||
|
#
|
||
|
$uri_count = $finder->find( \$line );
|
||
|
|
||
|
#
|
||
|
# Print the line. It might have been edited by the URI::Find callback or
|
||
|
# might just be as it was.
|
||
|
#
|
||
|
encode_entities($line) if ($entities);
|
||
|
print "$line\n";
|
||
|
}
|
||
|
|
||
|
#
|
||
|
# Unless we're in 'simple' mode generate links if there's data and the host
|
||
|
# hasn't provided them. Make sure they are unique (since there may have been
|
||
|
# multiple references in the notes)
|
||
|
#
|
||
|
unless ($simple) {
|
||
|
if ( @urls && !$have_links ) {
|
||
|
@urls = uniq @urls;
|
||
|
print "\n### Links\n\n";
|
||
|
|
||
|
foreach my $url (@urls) {
|
||
|
#$new_url = uri_reformat($url); # New format is not accepted
|
||
|
print "- [$url]($url)\n";
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
exit;
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: cb_general
|
||
|
# PURPOSE: Process a URI found by URI::Find containing a link
|
||
|
# PARAMETERS: $uri_obj a URI object
|
||
|
# $uri_text the URI text
|
||
|
# $simple Boolean that controls what type of Markdown
|
||
|
# link is generated. If true (1) we use <URI>
|
||
|
# form, otherwise we use [URI](URI) form
|
||
|
# RETURNS: The modified URI text
|
||
|
# DESCRIPTION: The URI::Find 'find' method calls this callback for every URI
|
||
|
# it finds in the text it discovers. We only care about
|
||
|
# http/https here (for now anyway). If the text is changed and
|
||
|
# returned then the changed item is placed in the original text.
|
||
|
# We differentiate between what look like images and plain HTML
|
||
|
# URLs and generate different markup. We also save the HTML URL
|
||
|
# in a global array for building a list of links later.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub cb_general {
|
||
|
my ( $uri_obj, $uri_text, $simple ) = @_;
|
||
|
|
||
|
my $path;
|
||
|
#my $new_uri = uri_reformat($uri_text);
|
||
|
|
||
|
#
|
||
|
# For http: or https:
|
||
|
#
|
||
|
if ( $uri_obj->scheme =~ /^https?/ ) {
|
||
|
$path = $uri_obj->path;
|
||
|
if ( $path =~ /(?i:\.(jpg|png|gif)$)/ ) {
|
||
|
$uri_text = "![$uri_text]($uri_text)";
|
||
|
}
|
||
|
else {
|
||
|
push( @urls, $uri_text );
|
||
|
if ($simple) {
|
||
|
$uri_text = "<$uri_text>";
|
||
|
}
|
||
|
else {
|
||
|
$uri_text = "[$uri_text]($uri_text)";
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return $uri_text;
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: uri_reformat
|
||
|
# PURPOSE: Turns a URI into a form which is better for Markdown
|
||
|
# PARAMETERS: $uri_str - the URI string
|
||
|
# RETURNS: The URI string reformatted
|
||
|
# DESCRIPTION: Places the URI string in an URI object to parse it. Copies
|
||
|
# each of the elements to another empty URI object but takes
|
||
|
# advantage of the setting of URI::DEFAULT_QUERY_FORM_DELIMITER
|
||
|
# which forces the '&' delimiters to ';'. Returns the string
|
||
|
# version of this new URI.
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub uri_reformat {
|
||
|
my ($uri_str) = @_;
|
||
|
|
||
|
local $URI::DEFAULT_QUERY_FORM_DELIMITER = ';';
|
||
|
|
||
|
my $u1 = URI->new($uri_str);
|
||
|
my $u2 = URI->new;
|
||
|
|
||
|
$u2->scheme($u1->scheme);
|
||
|
$u2->authority($u1->authority);
|
||
|
$u2->path($u1->path);
|
||
|
$u2->query_form($u1->query_form);
|
||
|
$u2->fragment($u1->fragment);
|
||
|
|
||
|
return $u2->as_string;
|
||
|
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: coalesce
|
||
|
# PURPOSE: To find the first defined argument and return it
|
||
|
# PARAMETERS: Arbitrary number of arguments
|
||
|
# RETURNS: The first defined argument or undef if there are none
|
||
|
# DESCRIPTION:
|
||
|
# THROWS: No exceptions
|
||
|
# COMMENTS: None
|
||
|
# SEE ALSO: N/A
|
||
|
#===============================================================================
|
||
|
sub coalesce {
|
||
|
foreach (@_) {
|
||
|
return $_ if defined($_);
|
||
|
}
|
||
|
return undef; ## no critic
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: Usage
|
||
|
# PURPOSE: Display a usage message and exit
|
||
|
# PARAMETERS: None
|
||
|
# RETURNS: To command line level with exit value 1
|
||
|
# DESCRIPTION: Builds the usage message using global values
|
||
|
# THROWS: no exceptions
|
||
|
# COMMENTS: none
|
||
|
# SEE ALSO: n/a
|
||
|
#===============================================================================
|
||
|
sub Usage {
|
||
|
print STDERR <<EOD;
|
||
|
$PROG v$VERSION
|
||
|
|
||
|
Usage: $PROG [options]
|
||
|
|
||
|
Scans STDIN in the assumption that it is plain text or some kind of hybrid
|
||
|
Markdown looking for URLs, mail links, HTML and Markdown elements.
|
||
|
|
||
|
Options:
|
||
|
|
||
|
-help Display this information
|
||
|
-[no]simple Turns 'simple' mode on or off; it is off by default.
|
||
|
In 'simple' mode links are not accumulated and written
|
||
|
at the end of the data in a 'Links' section. URLs are
|
||
|
converted to a Markdown form though.
|
||
|
-[no]entities Turns 'entities' mode on and off; it is off by
|
||
|
default. When on this mode causes each line to be
|
||
|
scanned for characters best represented as HTML
|
||
|
entities and to change them appropriately. For example
|
||
|
the '<' character becomes '<'.
|
||
|
-[no]stamp Turns 'stamp' mode on and off; it is off by default.
|
||
|
When this mode is on an HTML comment is written at the
|
||
|
start of the output containing the name of the script
|
||
|
and its version number.
|
||
|
|
||
|
|
||
|
EOD
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
#=== FUNCTION ================================================================
|
||
|
# NAME: Options
|
||
|
# PURPOSE: Processes command-line options
|
||
|
# PARAMETERS: $optref Hash reference to hold the options
|
||
|
# RETURNS: Undef
|
||
|
# DESCRIPTION:
|
||
|
# THROWS: no exceptions
|
||
|
# COMMENTS: none
|
||
|
# SEE ALSO: n/a
|
||
|
#===============================================================================
|
||
|
sub Options {
|
||
|
my ($optref) = @_;
|
||
|
|
||
|
my @options = ( "help", "simple!", "entities!", "stamp!", );
|
||
|
|
||
|
if ( !GetOptions( $optref, @options ) ) {
|
||
|
die "Failed to process options\n";
|
||
|
}
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
|