#!/usr/bin/env perl
#===============================================================================
#
#         FILE: make_markdown
#
#        USAGE: ./make_markdown [-[no]simple] [-[no]entities]  [-[no]stamp] file
#
#  DESCRIPTION: Turn plain text to Markdown. Designed to be used as a filter
#               in vim.
#               Finds all bare URLs, ignoring lines with HTML or Markdown
#               links. It checks the path part of the URL to see if it's an
#               image. If it is it uses an image link, otherwise it uses
#               a plain link. Links are all to HPR standard where the text
#               part and the URL part are the same.
#               Unless the '-simple' option is present all HTML URLs are
#               accumulated, and if the host hasn't provided a 'Links' section
#               they are added to one. This happens even if the overall text
#               is short, but it's easy to remove if not wanted.
#               By default, in -nosimple mode an HTML comment with details of
#               this script is added to the output. This can be turned off
#               with the -nostamp option.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.9
#      CREATED: 2015-10-07 16:05:21
#     REVISION: 2024-01-14 15:59:34
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use utf8;

use version 0.77; # Planning to experiment with this new feature

use open ':encoding(UTF-8)'; # Make all IO UTF-8

use Getopt::Long;
use Regexp::Common qw{URI pattern};
use HTML::Entities;
#use Encoding::FixLatin qw(fix_latin);
use URI::Find;
use URI;
use List::MoreUtils qw{uniq};

#
# Version number (manually incremented)
#
our $VERSION = 'v0.0.9';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Show_Submission";

my @urls;

my $have_links = 0;
my $uri_count;
my $new_url;

#
# Enable Unicode mode
#
#binmode STDIN, ":encoding(UTF-8)";
#binmode STDOUT, ":encoding(UTF-8)";
#binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Patterns
#-------------------------------------------------------------------------------
#
# 1. Look for a line beginning with 'Links:'
#
pattern
    name   => ['links'],
    create => '^\s*Links:?\s*$',
    ;

#
# 2. Look for an HTML link (some people embed these in their notes)
#
pattern
    name   => ['html_link'],
    create => '<a[^>]*>',
    ;

#
# 3. Look for existing Markdown links
#
pattern
    name   => ['md_link'],
    create => '\[([^]]+)\]\(\1\)',
    ;

#
# 4. Look for mail addresses
#
pattern
    name   => ['mail_link'],
    create => '(?i)[a-z0-9_+.](?:[a-z0-9_+.]+[a-z0-9_+])?\@[a-z0-9.-]+',
    ;

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

#
# Collect options
#
Usage() if ( $options{'help'} );

my $simple   = ( defined( $options{simple} )   ? $options{simple}   : 0 );
my $entities = ( defined( $options{entities} ) ? $options{entities} : 0 );
my $stamp    = ( defined( $options{stamp} )    ? $options{stamp}    : 0 );


#-------------------------------------------------------------------------------
# Process the data on STDIN
#-------------------------------------------------------------------------------
#
# Set up the URI finder to call back to the 'cb_general' routine. We use
# a closure to allow the passing of the $simple variable
#
my $finder = URI::Find->new( sub { cb_general( shift, shift, $simple ) } );

#
# Slurp everything on STDIN
#
my @text = <>;

if ($stamp) {
    printf "<!-- Processed by %s %s -->\n", $PROG, $VERSION unless $simple;
}

#
# Process the stuff we got
#
foreach my $line (@text) {
    chomp($line);

    #
    # Look for a user-provided 'Links' section
    #
    if ( $line =~ /($RE{links})/ ) {
        $have_links = 1;
        $line       = "### $1";
        print "$line\n";
        next;
    }

    #
    # Check whether the line contains some HTML. If it does we don't want to
    # touch it.
    #
    if ( $line =~ /$RE{html_link}/ ) {
        print "$line\n";
        next;
    }

    #
    # An existing Markdown link means we'll skip the line
    #
    if ( $line =~ /$RE{md_link}/ ) {
        print "$line\n";
        next;
    }

    #
    # Special action for mail addresses.
    #
    # TODO: Loop through the line extracting each mail address and checking it
    # with Email::Valid. If valid turn into a Markdown linkgg of the form
    # <mailaddr>, otherwise ignore it.
    #
    if ( $line =~ /$RE{mail_link}/ ) {
        $line =~ s/($RE{mail_link})/<$1>/g;
    }

    #
    # Look for URLs of any sort and process them in the callback 'cb_general'
    # defined earlier
    #
    $uri_count = $finder->find( \$line );

    #
    # Print the line. It might have been edited by the URI::Find callback or
    # might just be as it was.
    #
    encode_entities($line) if ($entities);
    print "$line\n";
}

#
# Unless we're in 'simple' mode generate links if there's data and the host
# hasn't provided them. Make sure they are unique (since there may have been
# multiple references in the notes)
#
unless ($simple) {
    if ( @urls && !$have_links ) {
        @urls = uniq @urls;
        print "\n### Links\n\n";

        foreach my $url (@urls) {
            #$new_url = uri_reformat($url); # New format is not accepted
            print "- [$url]($url)\n";
        }
    }
}

exit;

#===  FUNCTION  ================================================================
#         NAME: cb_general
#      PURPOSE: Process a URI found by URI::Find containing a link
#   PARAMETERS: $uri_obj        a URI object
#               $uri_text       the URI text
#               $simple         Boolean that controls what type of Markdown
#                               link is generated. If true (1) we use <URI>
#                               form, otherwise we use [URI](URI) form
#      RETURNS: The modified URI text
#  DESCRIPTION: The URI::Find 'find' method calls this callback for every URI
#               it finds in the text it discovers. We only care about
#               http/https here (for now anyway). If the text is changed and
#               returned then the changed item is placed in the original text.
#               We differentiate between what look like images and plain HTML
#               URLs and generate different markup. We also save the HTML URL
#               in a global array for building a list of links later.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub cb_general {
    my ( $uri_obj, $uri_text, $simple ) = @_;

    my $path;
    #my $new_uri = uri_reformat($uri_text);

    #
    # For http: or https:
    #
    if ( $uri_obj->scheme =~ /^https?/ ) {
        $path = $uri_obj->path;
        if ( $path =~ /(?i:\.(jpg|png|gif)$)/ ) {
            $uri_text = "![$uri_text]($uri_text)";
        }
        else {
            push( @urls, $uri_text );
            if ($simple) {
                $uri_text = "<$uri_text>";
            }
            else {
                $uri_text = "[$uri_text]($uri_text)";
            }
        }
    }

    return $uri_text;
}

#===  FUNCTION  ================================================================
#         NAME: uri_reformat
#      PURPOSE: Turns a URI into a form which is better for Markdown
#   PARAMETERS: $uri_str - the URI string
#      RETURNS: The URI string reformatted
#  DESCRIPTION: Places the URI string in an URI object to parse it. Copies
#               each of the elements to another empty URI object but takes
#               advantage of the setting of URI::DEFAULT_QUERY_FORM_DELIMITER
#               which forces the '&' delimiters to ';'. Returns the string
#               version of this new URI.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub uri_reformat {
    my ($uri_str) = @_;

    local $URI::DEFAULT_QUERY_FORM_DELIMITER = ';';

    my $u1 = URI->new($uri_str);
    my $u2 = URI->new;

    $u2->scheme($u1->scheme);
    $u2->authority($u1->authority);
    $u2->path($u1->path);
    $u2->query_form($u1->query_form);
    $u2->fragment($u1->fragment);

    return $u2->as_string;

}

#===  FUNCTION  ================================================================
#         NAME: coalesce
#      PURPOSE: To find the first defined argument and return it
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument or undef if there are none
#  DESCRIPTION:
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    foreach (@_) {
        return $_ if defined($_);
    }
    return undef;    ## no critic
}

#===  FUNCTION  ================================================================
#         NAME: Usage
#      PURPOSE: Display a usage message and exit
#   PARAMETERS: None
#      RETURNS: To command line level with exit value 1
#  DESCRIPTION: Builds the usage message using global values
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Usage {
    print STDERR <<EOD;
$PROG v$VERSION

Usage: $PROG [options]

Scans STDIN in the assumption that it is plain text or some kind of hybrid
Markdown looking for URLs, mail links, HTML and Markdown elements.

Options:

    -help               Display this information
    -[no]simple         Turns 'simple' mode on or off; it is off by default.
                        In 'simple' mode links are not accumulated and written
                        at the end of the data in a 'Links' section. URLs are
                        converted to a Markdown form though.
    -[no]entities       Turns 'entities' mode on and off; it is off by
                        default. When on this mode causes each line to be
                        scanned for characters best represented as HTML
                        entities and to change them appropriately. For example
                        the '<' character becomes '&lt;'.
    -[no]stamp          Turns 'stamp' mode on and off; it is off by default.
                        When this mode is on an HTML comment is written at the
                        start of the output containing the name of the script
                        and its version number.


EOD
    exit(1);
}

#===  FUNCTION  ================================================================
#         NAME: Options
#      PURPOSE: Processes command-line options
#   PARAMETERS: $optref     Hash reference to hold the options
#      RETURNS: Undef
#  DESCRIPTION:
#       THROWS: no exceptions
#     COMMENTS: none
#     SEE ALSO: n/a
#===============================================================================
sub Options {
    my ($optref) = @_;

    my @options = ( "help", "simple!", "entities!", "stamp!", );

    if ( !GetOptions( $optref, @options ) ) {
        die "Failed to process options\n";
    }

    return;
}

# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker