#!/usr/bin/env perl
#===============================================================================
#
#         FILE: make_markdown
#
#        USAGE: ./make_markdown [-[no]simple] [-[no]entities] [-[no]stamp] file
#
#  DESCRIPTION: Turn plain text to Markdown. Designed to be used as a filter
#               in vim.
#               Finds all bare URLs, ignoring lines with HTML or Markdown
#               links. It checks the path part of the URL to see if it's an
#               image. If it is it uses an image link, otherwise it uses
#               a plain link. Links are all to HPR standard where the text
#               part and the URL part are the same.
#               Unless the '-simple' option is present all HTML URLs are
#               accumulated, and if the host hasn't provided a 'Links' section
#               they are added to one. This happens even if the overall text
#               is short, but it's easy to remove if not wanted.
#               By default, in -nosimple mode an HTML comment with details of
#               this script is added to the output. This can be turned off
#               with the -nostamp option.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.0.9
#      CREATED: 2015-10-07 16:05:21
#     REVISION: 2024-01-14 15:59:34
#
#===============================================================================

use 5.010;
use strict;
use warnings;
use utf8;
use version 0.77;    # Planning to experiment with this new feature

#
# Make all IO UTF-8. The ':std' subpragma is what extends the layer to STDIN,
# STDOUT and STDERR; without it only handles opened inside this scope are
# affected, and this script works on the standard handles.
#
use open qw(:std :encoding(UTF-8));

use Getopt::Long;
use Regexp::Common qw{URI pattern};
use HTML::Entities;

#use Encoding::FixLatin qw(fix_latin);
use URI::Find;
use URI;
use List::MoreUtils qw{uniq};

#
# Version number (manually incremented)
#
our $VERSION = 'v0.0.9';

#
# Script and directory names
#
( my $PROG = $0 ) =~ s|.*/||mx;
( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
$DIR = '.' unless $DIR;

#-------------------------------------------------------------------------------
# Declarations
#-------------------------------------------------------------------------------
#
# Constants and other declarations
#
my $basedir = "$ENV{HOME}/HPR/Show_Submission";

my @urls;              # http/https URLs collected by the URI::Find callback
my $have_links = 0;    # set once a user-provided 'Links' heading is seen
my $uri_count;
my $new_url;

#
# Enable Unicode mode
#
#binmode STDIN, ":encoding(UTF-8)";
#binmode STDOUT, ":encoding(UTF-8)";
#binmode STDERR, ":encoding(UTF-8)";

#-------------------------------------------------------------------------------
# Patterns
#-------------------------------------------------------------------------------
#
# 1. Look for a line beginning with 'Links:'
#
pattern
    name   => ['links'],
    create => '^\s*Links:?\s*$',
    ;

#
# 2. Look for an HTML link (some people embed these in their notes).
#    The pattern had been reduced to ']*>', which matches any line containing
#    a '>' character; it is restored here to match an opening anchor tag.
#
pattern
    name   => ['html_link'],
    create => '<a[^>]*>',
    ;

#
# 3. Look for existing Markdown links
#
pattern
    name   => ['md_link'],
    create => '\[([^]]+)\]\(\1\)',
    ;

#
# 4. Look for mail addresses
#
pattern
    name   => ['mail_link'],
    create => '(?i)[a-z0-9_+.](?:[a-z0-9_+.]+[a-z0-9_+])?\@[a-z0-9.-]+',
    ;

#-------------------------------------------------------------------------------
# Options and arguments
#-------------------------------------------------------------------------------
#
# Process options
#
my %options;
Options( \%options );

#
# Collect options. Anything not given on the command line falls back to 0.
#
Usage() if ( $options{'help'} );

my $simple   = $options{simple}   // 0;
my $entities = $options{entities} // 0;

# NOTE(review): the file header says the stamp is added by default, but the
# fallback here is 0 - confirm whether Options() pre-sets this value.
my $stamp = $options{stamp} // 0;

#-------------------------------------------------------------------------------
# Process the data on STDIN
#-------------------------------------------------------------------------------
#
# Set up the URI finder to call back to the 'cb_general' routine.
# We use a closure to allow the passing of the $simple variable
#
my $finder = URI::Find->new( sub { cb_general( shift, shift, $simple ) } );

#
# Slurp everything on STDIN
#
my @text = <>;

#
# Identify this script in an HTML comment, but never in 'simple' mode. The
# format string had lost its text and its two conversions, leaving a bare
# newline with redundant printf arguments; the wording here is a
# reconstruction.
#
if ( $stamp && !$simple ) {
    printf "<!-- Generated by %s %s -->\n", $PROG, $VERSION;
}

#
# Process the stuff we got
#
foreach my $line (@text) {
    chomp($line);

    #
    # Look for a user-provided 'Links' section and turn it into a heading
    #
    if ( $line =~ /($RE{links})/ ) {
        $have_links = 1;
        $line       = "### $1";
        print "$line\n";
        next;
    }

    #
    # Check whether the line contains some HTML or an existing Markdown link.
    # If it does we don't want to touch it.
    #
    if ( $line =~ /$RE{html_link}/ || $line =~ /$RE{md_link}/ ) {
        print "$line\n";
        next;
    }

    #
    # Special action for mail addresses: wrap each one in angle brackets.
    #
    # TODO: Loop through the line extracting each mail address and checking it
    # with Email::Valid. If valid turn into a Markdown link of the form
    # <address>, otherwise ignore it.
    #
    $line =~ s/($RE{mail_link})/<$1>/g;

    #
    # Look for URLs of any sort and process them in the callback 'cb_general'
    # defined earlier
    #
    $uri_count = $finder->find( \$line );

    #
    # Print the line. It might have been edited by the URI::Find callback or
    # might just be as it was.
    #
    # NOTE(review): entity encoding runs after the links are built, so the
    # '<', '>' and '&' of generated links are presumably encoded too - confirm
    # this is intended.
    #
    encode_entities($line) if ($entities);
    print "$line\n";
}

#
# Unless we're in 'simple' mode generate links if there's data and the host
# hasn't provided them. Make sure they are unique (since there may have been
# multiple references in the notes)
#
if ( !$simple && @urls && !$have_links ) {
    @urls = uniq @urls;
    print "\n### Links\n\n";
    foreach my $url (@urls) {

        #$new_url = uri_reformat($url); # New format is not accepted
        print "- [$url]($url)\n";
    }
}

exit;

#===  FUNCTION  ================================================================
#         NAME: cb_general
#      PURPOSE: Process a URI found by URI::Find containing a link
#   PARAMETERS: $uri_obj        a URI object
#               $uri_text       the URI text
#               $simple         Boolean that controls what type of Markdown
#                               link is generated. If true (1) we use <URI>
#                               form, otherwise we use [URI](URI) form
#      RETURNS: The modified URI text
#  DESCRIPTION: The URI::Find 'find' method calls this callback for every URI
#               it finds in the text it discovers. We only care about
#               http/https here (for now anyway). If the text is changed and
#               returned then the changed item is placed in the original text.
#               We differentiate between what look like images and plain HTML
#               URLs and generate different markup. We also save the HTML URL
#               in a global array for building a list of links later.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub cb_general {
    my ( $uri_obj, $uri_text, $simple ) = @_;

    #
    # Callback for URI::Find: $uri_obj supplies the scheme and path, $uri_text
    # is the text as found, and $simple selects <URI> rather than [URI](URI)
    # markup. The returned string replaces the URI in the text being scanned.
    #

    #
    # Only http: and https: are rewritten; anything else is handed back as is
    #
    return $uri_text unless $uri_obj->scheme =~ /\Ahttps?\z/;

    #
    # A path ending in a known image extension becomes an image link. Images
    # are not added to the list of collected links.
    #
    if ( $uri_obj->path =~ /\.(?:jpe?g|png|gif)\z/i ) {
        return "![$uri_text]($uri_text)";
    }

    #
    # Everything else is remembered for the 'Links' section and turned into
    # a plain link
    #
    push( @urls, $uri_text );

    return $simple ? "<$uri_text>" : "[$uri_text]($uri_text)";
}

#===  FUNCTION  ================================================================
#         NAME: uri_reformat
#      PURPOSE: Turns a URI into a form which is better for Markdown
#   PARAMETERS: $uri_str        - the URI string
#      RETURNS: The URI string reformatted
#  DESCRIPTION: Places the URI string in an URI object to parse it. Copies
#               each of the elements to another empty URI object but takes
#               advantage of the setting of URI::DEFAULT_QUERY_FORM_DELIMITER
#               which forces the '&' delimiters to ';'. Returns the string
#               version of this new URI.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub uri_reformat {
    my ($uri_str) = @_;

    #
    # Takes a URI string, rebuilds it part by part in a fresh URI object and
    # returns the new object's string form.
    #

    #
    # Applies only for the duration of this call; it is in effect while the
    # query is rebuilt with 'query_form' below.
    #
    local $URI::DEFAULT_QUERY_FORM_DELIMITER = ';';

    my $u1 = URI->new($uri_str);
    my $u2 = URI->new;

    #
    # Copy the parsed components one at a time into the empty URI object
    #
    $u2->scheme( $u1->scheme );
    $u2->authority( $u1->authority );
    $u2->path( $u1->path );
    $u2->query_form( $u1->query_form );
    $u2->fragment( $u1->fragment );

    return $u2->as_string;
}

#===  FUNCTION  ================================================================
#         NAME: coalesce
#      PURPOSE: To find the first defined argument and return it
#   PARAMETERS: Arbitrary number of arguments
#      RETURNS: The first defined argument or undef if there are none
#  DESCRIPTION: Walks the argument list in order and returns as soon as
#               a defined value is met. Values which are defined but false
#               (0 or the empty string) count as defined.
#       THROWS: No exceptions
#     COMMENTS: None
#     SEE ALSO: N/A
#===============================================================================
sub coalesce {
    foreach my $candidate (@_) {
        return $candidate if defined($candidate);
    }

    # Explicit undef is kept so that callers always receive a single value
    return undef;    ## no critic
}

#=== FUNCTION ================================================================ # NAME: Usage # PURPOSE: Display a usage message and exit # PARAMETERS: None # RETURNS: To command line level with exit value 1 # DESCRIPTION: Builds the usage message using global values # THROWS: no exceptions # COMMENTS: none # SEE ALSO: n/a #=============================================================================== sub Usage { print STDERR <