forked from HPR/hpr-tools
		
	
		
			
				
	
	
		
			394 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			394 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env perl
 | |
| #===============================================================================
 | |
| #
 | |
| #         FILE: make_markdown
 | |
| #
 | |
| #        USAGE: ./make_markdown [-[no]simple] [-[no]entities]  [-[no]stamp] file
 | |
| #
 | |
| #  DESCRIPTION: Turn plain text to Markdown. Designed to be used as a filter
 | |
| #               in vim.
 | |
| #               Finds all bare URLs, ignoring lines with HTML or Markdown
 | |
| #               links. It checks the path part of the URL to see if it's an
 | |
| #               image. If it is it uses an image link, otherwise it uses
 | |
| #               a plain link. Links are all to HPR standard where the text
 | |
| #               part and the URL part are the same.
 | |
| #               Unless the '-simple' option is present all HTML URLs are
 | |
| #               accumulated, and if the host hasn't provided a 'Links' section
 | |
| #               they are added to one. This happens even if the overall text
 | |
| #               is short, but it's easy to remove if not wanted.
 | |
| #               By default, in -nosimple mode an HTML comment with details of
 | |
| #               this script is added to the output. This can be turned off
 | |
| #               with the -nostamp option.
 | |
| #
 | |
| #      OPTIONS: ---
 | |
| # REQUIREMENTS: ---
 | |
| #         BUGS: ---
 | |
| #        NOTES: ---
 | |
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
 | |
| #      VERSION: 0.0.9
 | |
| #      CREATED: 2015-10-07 16:05:21
 | |
| #     REVISION: 2024-01-14 15:59:34
 | |
| #
 | |
| #===============================================================================
 | |
| 
 | |
| use 5.010;
 | |
| use strict;
 | |
| use warnings;
 | |
| use utf8;
 | |
| 
 | |
| use version 0.77; # Planning to experiment with this new feature
 | |
| 
 | |
| use open ':encoding(UTF-8)'; # Make all IO UTF-8
 | |
| 
 | |
| use Getopt::Long;
 | |
| use Regexp::Common qw{URI pattern};
 | |
| use HTML::Entities;
 | |
| #use Encoding::FixLatin qw(fix_latin);
 | |
| use URI::Find;
 | |
| use URI;
 | |
| use List::MoreUtils qw{uniq};
 | |
| 
 | |
| #
 | |
| # Version number (manually incremented)
 | |
| #
 | |
| our $VERSION = 'v0.0.9';
 | |
| 
 | |
| #
 | |
| # Script and directory names
 | |
| #
 | |
| ( my $PROG = $0 ) =~ s|.*/||mx;
 | |
| ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx;
 | |
| $DIR = '.' unless $DIR;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Declarations
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Constants and other declarations
 | |
| #
 | |
| my $basedir = "$ENV{HOME}/HPR/Show_Submission";
 | |
| 
 | |
| my @urls;
 | |
| 
 | |
| my $have_links = 0;
 | |
| my $uri_count;
 | |
| my $new_url;
 | |
| 
 | |
| #
 | |
| # Enable Unicode mode
 | |
| #
 | |
| #binmode STDIN, ":encoding(UTF-8)";
 | |
| #binmode STDOUT, ":encoding(UTF-8)";
 | |
| #binmode STDERR, ":encoding(UTF-8)";
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Patterns
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # 1. Look for a line beginning with 'Links:'
 | |
| #
 | |
| pattern
 | |
|     name   => ['links'],
 | |
|     create => '^\s*Links:?\s*$',
 | |
|     ;
 | |
| 
 | |
| #
 | |
| # 2. Look for an HTML link (some people embed these in their notes)
 | |
| #
 | |
| pattern
 | |
|     name   => ['html_link'],
 | |
|     create => '<a[^>]*>',
 | |
|     ;
 | |
| 
 | |
| #
 | |
| # 3. Look for existing Markdown links
 | |
| #
 | |
| pattern
 | |
|     name   => ['md_link'],
 | |
|     create => '\[([^]]+)\]\(\1\)',
 | |
|     ;
 | |
| 
 | |
| #
 | |
| # 4. Look for mail addresses
 | |
| #
 | |
| pattern
 | |
|     name   => ['mail_link'],
 | |
|     create => '(?i)[a-z0-9_+.](?:[a-z0-9_+.]+[a-z0-9_+])?\@[a-z0-9.-]+',
 | |
|     ;
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Options and arguments
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Process options
 | |
| #
 | |
| my %options;
 | |
| Options( \%options );
 | |
| 
 | |
| #
 | |
| # Collect options
 | |
| #
 | |
| Usage() if ( $options{'help'} );
 | |
| 
 | |
| my $simple   = ( defined( $options{simple} )   ? $options{simple}   : 0 );
 | |
| my $entities = ( defined( $options{entities} ) ? $options{entities} : 0 );
 | |
| my $stamp    = ( defined( $options{stamp} )    ? $options{stamp}    : 0 );
 | |
| 
 | |
| 
 | |
| #-------------------------------------------------------------------------------
 | |
| # Process the data on STDIN
 | |
| #-------------------------------------------------------------------------------
 | |
| #
 | |
| # Set up the URI finder to call back to the 'cb_general' routine. We use
 | |
| # a closure to allow the passing of the $simple variable
 | |
| #
 | |
| my $finder = URI::Find->new( sub { cb_general( shift, shift, $simple ) } );
 | |
| 
 | |
| #
 | |
| # Slurp everything on STDIN
 | |
| #
 | |
| my @text = <>;
 | |
| 
 | |
| if ($stamp) {
 | |
|     printf "<!-- Processed by %s %s -->\n", $PROG, $VERSION unless $simple;
 | |
| }
 | |
| 
 | |
| #
 | |
| # Process the stuff we got
 | |
| #
 | |
| foreach my $line (@text) {
 | |
|     chomp($line);
 | |
| 
 | |
|     #
 | |
|     # Look for a user-provided 'Links' section
 | |
|     #
 | |
|     if ( $line =~ /($RE{links})/ ) {
 | |
|         $have_links = 1;
 | |
|         $line       = "### $1";
 | |
|         print "$line\n";
 | |
|         next;
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Check whether the line contains some HTML. If it does we don't want to
 | |
|     # touch it.
 | |
|     #
 | |
|     if ( $line =~ /$RE{html_link}/ ) {
 | |
|         print "$line\n";
 | |
|         next;
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # An existing Markdown link means we'll skip the line
 | |
|     #
 | |
|     if ( $line =~ /$RE{md_link}/ ) {
 | |
|         print "$line\n";
 | |
|         next;
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Special action for mail addresses.
 | |
|     #
 | |
|     # TODO: Loop through the line extracting each mail address and checking it
 | |
|     # with Email::Valid. If valid turn into a Markdown linkgg of the form
 | |
|     # <mailaddr>, otherwise ignore it.
 | |
|     #
 | |
|     if ( $line =~ /$RE{mail_link}/ ) {
 | |
|         $line =~ s/($RE{mail_link})/<$1>/g;
 | |
|     }
 | |
| 
 | |
|     #
 | |
|     # Look for URLs of any sort and process them in the callback 'cb_general'
 | |
|     # defined earlier
 | |
|     #
 | |
|     $uri_count = $finder->find( \$line );
 | |
| 
 | |
|     #
 | |
|     # Print the line. It might have been edited by the URI::Find callback or
 | |
|     # might just be as it was.
 | |
|     #
 | |
|     encode_entities($line) if ($entities);
 | |
|     print "$line\n";
 | |
| }
 | |
| 
 | |
| #
 | |
| # Unless we're in 'simple' mode generate links if there's data and the host
 | |
| # hasn't provided them. Make sure they are unique (since there may have been
 | |
| # multiple references in the notes)
 | |
| #
 | |
| unless ($simple) {
 | |
|     if ( @urls && !$have_links ) {
 | |
|         @urls = uniq @urls;
 | |
|         print "\n### Links\n\n";
 | |
| 
 | |
|         foreach my $url (@urls) {
 | |
|             #$new_url = uri_reformat($url); # New format is not accepted
 | |
|             print "- [$url]($url)\n";
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| exit;
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: cb_general
 | |
| #      PURPOSE: Process a URI found by URI::Find containing a link
 | |
| #   PARAMETERS: $uri_obj        a URI object
 | |
| #               $uri_text       the URI text
 | |
| #               $simple         Boolean that controls what type of Markdown
 | |
| #                               link is generated. If true (1) we use <URI>
 | |
| #                               form, otherwise we use [URI](URI) form
 | |
| #      RETURNS: The modified URI text
 | |
| #  DESCRIPTION: The URI::Find 'find' method calls this callback for every URI
 | |
| #               it finds in the text it discovers. We only care about
 | |
| #               http/https here (for now anyway). If the text is changed and
 | |
| #               returned then the changed item is placed in the original text.
 | |
| #               We differentiate between what look like images and plain HTML
 | |
| #               URLs and generate different markup. We also save the HTML URL
 | |
| #               in a global array for building a list of links later.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub cb_general {
 | |
|     my ( $uri_obj, $uri_text, $simple ) = @_;
 | |
| 
 | |
|     my $path;
 | |
|     #my $new_uri = uri_reformat($uri_text);
 | |
| 
 | |
|     #
 | |
|     # For http: or https:
 | |
|     #
 | |
|     if ( $uri_obj->scheme =~ /^https?/ ) {
 | |
|         $path = $uri_obj->path;
 | |
|         if ( $path =~ /(?i:\.(jpg|png|gif)$)/ ) {
 | |
|             $uri_text = "";
 | |
|         }
 | |
|         else {
 | |
|             push( @urls, $uri_text );
 | |
|             if ($simple) {
 | |
|                 $uri_text = "<$uri_text>";
 | |
|             }
 | |
|             else {
 | |
|                 $uri_text = "[$uri_text]($uri_text)";
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return $uri_text;
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: uri_reformat
 | |
| #      PURPOSE: Turns a URI into a form which is better for Markdown
 | |
| #   PARAMETERS: $uri_str - the URI string
 | |
| #      RETURNS: The URI string reformatted
 | |
| #  DESCRIPTION: Places the URI string in an URI object to parse it. Copies
 | |
| #               each of the elements to another empty URI object but takes
 | |
| #               advantage of the setting of URI::DEFAULT_QUERY_FORM_DELIMITER
 | |
| #               which forces the '&' delimiters to ';'. Returns the string
 | |
| #               version of this new URI.
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub uri_reformat {
 | |
|     my ($uri_str) = @_;
 | |
| 
 | |
|     local $URI::DEFAULT_QUERY_FORM_DELIMITER = ';';
 | |
| 
 | |
|     my $u1 = URI->new($uri_str);
 | |
|     my $u2 = URI->new;
 | |
| 
 | |
|     $u2->scheme($u1->scheme);
 | |
|     $u2->authority($u1->authority);
 | |
|     $u2->path($u1->path);
 | |
|     $u2->query_form($u1->query_form);
 | |
|     $u2->fragment($u1->fragment);
 | |
| 
 | |
|     return $u2->as_string;
 | |
| 
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: coalesce
 | |
| #      PURPOSE: To find the first defined argument and return it
 | |
| #   PARAMETERS: Arbitrary number of arguments
 | |
| #      RETURNS: The first defined argument or undef if there are none
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: No exceptions
 | |
| #     COMMENTS: None
 | |
| #     SEE ALSO: N/A
 | |
| #===============================================================================
 | |
| sub coalesce {
 | |
|     foreach (@_) {
 | |
|         return $_ if defined($_);
 | |
|     }
 | |
|     return undef;    ## no critic
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Usage
 | |
| #      PURPOSE: Display a usage message and exit
 | |
| #   PARAMETERS: None
 | |
| #      RETURNS: To command line level with exit value 1
 | |
| #  DESCRIPTION: Builds the usage message using global values
 | |
| #       THROWS: no exceptions
 | |
| #     COMMENTS: none
 | |
| #     SEE ALSO: n/a
 | |
| #===============================================================================
 | |
| sub Usage {
 | |
|     print STDERR <<EOD;
 | |
| $PROG v$VERSION
 | |
| 
 | |
| Usage: $PROG [options]
 | |
| 
 | |
| Scans STDIN in the assumption that it is plain text or some kind of hybrid
 | |
| Markdown looking for URLs, mail links, HTML and Markdown elements.
 | |
| 
 | |
| Options:
 | |
| 
 | |
|     -help               Display this information
 | |
|     -[no]simple         Turns 'simple' mode on or off; it is off by default.
 | |
|                         In 'simple' mode links are not accumulated and written
 | |
|                         at the end of the data in a 'Links' section. URLs are
 | |
|                         converted to a Markdown form though.
 | |
|     -[no]entities       Turns 'entities' mode on and off; it is off by
 | |
|                         default. When on this mode causes each line to be
 | |
|                         scanned for characters best represented as HTML
 | |
|                         entities and to change them appropriately. For example
 | |
|                         the '<' character becomes '<'.
 | |
|     -[no]stamp          Turns 'stamp' mode on and off; it is off by default.
 | |
|                         When this mode is on an HTML comment is written at the
 | |
|                         start of the output containing the name of the script
 | |
|                         and its version number.
 | |
| 
 | |
| 
 | |
| EOD
 | |
|     exit(1);
 | |
| }
 | |
| 
 | |
| #===  FUNCTION  ================================================================
 | |
| #         NAME: Options
 | |
| #      PURPOSE: Processes command-line options
 | |
| #   PARAMETERS: $optref     Hash reference to hold the options
 | |
| #      RETURNS: Undef
 | |
| #  DESCRIPTION:
 | |
| #       THROWS: no exceptions
 | |
| #     COMMENTS: none
 | |
| #     SEE ALSO: n/a
 | |
| #===============================================================================
 | |
| sub Options {
 | |
|     my ($optref) = @_;
 | |
| 
 | |
|     my @options = ( "help", "simple!", "entities!", "stamp!", );
 | |
| 
 | |
|     if ( !GetOptions( $optref, @options ) ) {
 | |
|         die "Failed to process options\n";
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
 |