forked from HPR/hpr-tools
		
	
		
			
	
	
		
			394 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			394 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | #!/usr/bin/env perl | ||
|  | #=============================================================================== | ||
|  | # | ||
|  | #         FILE: make_markdown | ||
|  | # | ||
|  | #        USAGE: ./make_markdown [-[no]simple] [-[no]entities]  [-[no]stamp] file | ||
|  | # | ||
|  | #  DESCRIPTION: Turn plain text to Markdown. Designed to be used as a filter | ||
|  | #               in vim. | ||
|  | #               Finds all bare URLs, ignoring lines with HTML or Markdown | ||
|  | #               links. It checks the path part of the URL to see if it's an | ||
|  | #               image. If it is it uses an image link, otherwise it uses | ||
|  | #               a plain link. Links are all to HPR standard where the text | ||
|  | #               part and the URL part are the same. | ||
|  | #               Unless the '-simple' option is present all HTML URLs are | ||
|  | #               accumulated, and if the host hasn't provided a 'Links' section | ||
|  | #               they are added to one. This happens even if the overall text | ||
|  | #               is short, but it's easy to remove if not wanted. | ||
|  | #               By default, in -nosimple mode an HTML comment with details of | ||
|  | #               this script is added to the output. This can be turned off | ||
|  | #               with the -nostamp option. | ||
|  | # | ||
|  | #      OPTIONS: --- | ||
|  | # REQUIREMENTS: --- | ||
|  | #         BUGS: --- | ||
|  | #        NOTES: --- | ||
|  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||
|  | #      VERSION: 0.0.9 | ||
|  | #      CREATED: 2015-10-07 16:05:21 | ||
|  | #     REVISION: 2024-01-14 15:59:34 | ||
|  | # | ||
|  | #=============================================================================== | ||
|  | 
 | ||
|  | use 5.010; | ||
|  | use strict; | ||
|  | use warnings; | ||
|  | use utf8; | ||
|  | 
 | ||
|  | use version 0.77; # Planning to experiment with this new feature | ||
|  | 
 | ||
|  | use open ':encoding(UTF-8)'; # Make all IO UTF-8 | ||
|  | 
 | ||
|  | use Getopt::Long; | ||
|  | use Regexp::Common qw{URI pattern}; | ||
|  | use HTML::Entities; | ||
|  | #use Encoding::FixLatin qw(fix_latin); | ||
|  | use URI::Find; | ||
|  | use URI; | ||
|  | use List::MoreUtils qw{uniq}; | ||
|  | 
 | ||
|  | # | ||
|  | # Version number (manually incremented) | ||
|  | # | ||
|  | our $VERSION = 'v0.0.9'; | ||
|  | 
 | ||
|  | # | ||
|  | # Script and directory names | ||
|  | # | ||
|  | ( my $PROG = $0 ) =~ s|.*/||mx; | ||
|  | ( my $DIR  = $0 ) =~ s|/?[^/]*$||mx; | ||
|  | $DIR = '.' unless $DIR; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Declarations | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Constants and other declarations | ||
|  | # | ||
|  | my $basedir = "$ENV{HOME}/HPR/Show_Submission"; | ||
|  | 
 | ||
|  | my @urls; | ||
|  | 
 | ||
|  | my $have_links = 0; | ||
|  | my $uri_count; | ||
|  | my $new_url; | ||
|  | 
 | ||
|  | # | ||
|  | # Enable Unicode mode | ||
|  | # | ||
|  | #binmode STDIN, ":encoding(UTF-8)"; | ||
|  | #binmode STDOUT, ":encoding(UTF-8)"; | ||
|  | #binmode STDERR, ":encoding(UTF-8)"; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Patterns | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # | ||
|  | # 1. Look for a line beginning with 'Links:' | ||
|  | # | ||
|  | pattern | ||
|  |     name   => ['links'], | ||
|  |     create => '^\s*Links:?\s*$', | ||
|  |     ; | ||
|  | 
 | ||
|  | # | ||
|  | # 2. Look for an HTML link (some people embed these in their notes) | ||
|  | # | ||
|  | pattern | ||
|  |     name   => ['html_link'], | ||
|  |     create => '<a[^>]*>', | ||
|  |     ; | ||
|  | 
 | ||
|  | # | ||
|  | # 3. Look for existing Markdown links | ||
|  | # | ||
|  | pattern | ||
|  |     name   => ['md_link'], | ||
|  |     create => '\[([^]]+)\]\(\1\)', | ||
|  |     ; | ||
|  | 
 | ||
|  | # | ||
|  | # 4. Look for mail addresses | ||
|  | # | ||
|  | pattern | ||
|  |     name   => ['mail_link'], | ||
|  |     create => '(?i)[a-z0-9_+.](?:[a-z0-9_+.]+[a-z0-9_+])?\@[a-z0-9.-]+', | ||
|  |     ; | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Options and arguments | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Process options | ||
|  | # | ||
|  | my %options; | ||
|  | Options( \%options ); | ||
|  | 
 | ||
|  | # | ||
|  | # Collect options | ||
|  | # | ||
|  | Usage() if ( $options{'help'} ); | ||
|  | 
 | ||
|  | my $simple   = ( defined( $options{simple} )   ? $options{simple}   : 0 ); | ||
|  | my $entities = ( defined( $options{entities} ) ? $options{entities} : 0 ); | ||
|  | my $stamp    = ( defined( $options{stamp} )    ? $options{stamp}    : 0 ); | ||
|  | 
 | ||
|  | 
 | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # Process the data on STDIN | ||
|  | #------------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Set up the URI finder to call back to the 'cb_general' routine. We use | ||
|  | # a closure to allow the passing of the $simple variable | ||
|  | # | ||
|  | my $finder = URI::Find->new( sub { cb_general( shift, shift, $simple ) } ); | ||
|  | 
 | ||
|  | # | ||
|  | # Slurp everything on STDIN | ||
|  | # | ||
|  | my @text = <>; | ||
|  | 
 | ||
|  | if ($stamp) { | ||
|  |     printf "<!-- Processed by %s %s -->\n", $PROG, $VERSION unless $simple; | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # Process the stuff we got | ||
|  | # | ||
|  | foreach my $line (@text) { | ||
|  |     chomp($line); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Look for a user-provided 'Links' section | ||
|  |     # | ||
|  |     if ( $line =~ /($RE{links})/ ) { | ||
|  |         $have_links = 1; | ||
|  |         $line       = "### $1"; | ||
|  |         print "$line\n"; | ||
|  |         next; | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Check whether the line contains some HTML. If it does we don't want to | ||
|  |     # touch it. | ||
|  |     # | ||
|  |     if ( $line =~ /$RE{html_link}/ ) { | ||
|  |         print "$line\n"; | ||
|  |         next; | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # An existing Markdown link means we'll skip the line | ||
|  |     # | ||
|  |     if ( $line =~ /$RE{md_link}/ ) { | ||
|  |         print "$line\n"; | ||
|  |         next; | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Special action for mail addresses. | ||
|  |     # | ||
|  |     # TODO: Loop through the line extracting each mail address and checking it | ||
|  |     # with Email::Valid. If valid turn into a Markdown linkgg of the form | ||
|  |     # <mailaddr>, otherwise ignore it. | ||
|  |     # | ||
|  |     if ( $line =~ /$RE{mail_link}/ ) { | ||
|  |         $line =~ s/($RE{mail_link})/<$1>/g; | ||
|  |     } | ||
|  | 
 | ||
|  |     # | ||
|  |     # Look for URLs of any sort and process them in the callback 'cb_general' | ||
|  |     # defined earlier | ||
|  |     # | ||
|  |     $uri_count = $finder->find( \$line ); | ||
|  | 
 | ||
|  |     # | ||
|  |     # Print the line. It might have been edited by the URI::Find callback or | ||
|  |     # might just be as it was. | ||
|  |     # | ||
|  |     encode_entities($line) if ($entities); | ||
|  |     print "$line\n"; | ||
|  | } | ||
|  | 
 | ||
|  | # | ||
|  | # Unless we're in 'simple' mode generate links if there's data and the host | ||
|  | # hasn't provided them. Make sure they are unique (since there may have been | ||
|  | # multiple references in the notes) | ||
|  | # | ||
|  | unless ($simple) { | ||
|  |     if ( @urls && !$have_links ) { | ||
|  |         @urls = uniq @urls; | ||
|  |         print "\n### Links\n\n"; | ||
|  | 
 | ||
|  |         foreach my $url (@urls) { | ||
|  |             #$new_url = uri_reformat($url); # New format is not accepted | ||
|  |             print "- [$url]($url)\n"; | ||
|  |         } | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | exit; | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: cb_general | ||
|  | #      PURPOSE: Process a URI found by URI::Find containing a link | ||
|  | #   PARAMETERS: $uri_obj        a URI object | ||
|  | #               $uri_text       the URI text | ||
|  | #               $simple         Boolean that controls what type of Markdown | ||
|  | #                               link is generated. If true (1) we use <URI> | ||
|  | #                               form, otherwise we use [URI](URI) form | ||
|  | #      RETURNS: The modified URI text | ||
|  | #  DESCRIPTION: The URI::Find 'find' method calls this callback for every URI | ||
|  | #               it finds in the text it discovers. We only care about | ||
|  | #               http/https here (for now anyway). If the text is changed and | ||
|  | #               returned then the changed item is placed in the original text. | ||
|  | #               We differentiate between what look like images and plain HTML | ||
|  | #               URLs and generate different markup. We also save the HTML URL | ||
|  | #               in a global array for building a list of links later. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub cb_general { | ||
|  |     my ( $uri_obj, $uri_text, $simple ) = @_; | ||
|  | 
 | ||
|  |     my $path; | ||
|  |     #my $new_uri = uri_reformat($uri_text); | ||
|  | 
 | ||
|  |     # | ||
|  |     # For http: or https: | ||
|  |     # | ||
|  |     if ( $uri_obj->scheme =~ /^https?/ ) { | ||
|  |         $path = $uri_obj->path; | ||
|  |         if ( $path =~ /(?i:\.(jpg|png|gif)$)/ ) { | ||
|  |             $uri_text = ""; | ||
|  |         } | ||
|  |         else { | ||
|  |             push( @urls, $uri_text ); | ||
|  |             if ($simple) { | ||
|  |                 $uri_text = "<$uri_text>"; | ||
|  |             } | ||
|  |             else { | ||
|  |                 $uri_text = "[$uri_text]($uri_text)"; | ||
|  |             } | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     return $uri_text; | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: uri_reformat | ||
|  | #      PURPOSE: Turns a URI into a form which is better for Markdown | ||
|  | #   PARAMETERS: $uri_str - the URI string | ||
|  | #      RETURNS: The URI string reformatted | ||
|  | #  DESCRIPTION: Places the URI string in an URI object to parse it. Copies | ||
|  | #               each of the elements to another empty URI object but takes | ||
|  | #               advantage of the setting of URI::DEFAULT_QUERY_FORM_DELIMITER | ||
|  | #               which forces the '&' delimiters to ';'. Returns the string | ||
|  | #               version of this new URI. | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub uri_reformat { | ||
|  |     my ($uri_str) = @_; | ||
|  | 
 | ||
|  |     local $URI::DEFAULT_QUERY_FORM_DELIMITER = ';'; | ||
|  | 
 | ||
|  |     my $u1 = URI->new($uri_str); | ||
|  |     my $u2 = URI->new; | ||
|  | 
 | ||
|  |     $u2->scheme($u1->scheme); | ||
|  |     $u2->authority($u1->authority); | ||
|  |     $u2->path($u1->path); | ||
|  |     $u2->query_form($u1->query_form); | ||
|  |     $u2->fragment($u1->fragment); | ||
|  | 
 | ||
|  |     return $u2->as_string; | ||
|  | 
 | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: coalesce | ||
|  | #      PURPOSE: To find the first defined argument and return it | ||
|  | #   PARAMETERS: Arbitrary number of arguments | ||
|  | #      RETURNS: The first defined argument or undef if there are none | ||
|  | #  DESCRIPTION: | ||
|  | #       THROWS: No exceptions | ||
|  | #     COMMENTS: None | ||
|  | #     SEE ALSO: N/A | ||
|  | #=============================================================================== | ||
|  | sub coalesce { | ||
|  |     foreach (@_) { | ||
|  |         return $_ if defined($_); | ||
|  |     } | ||
|  |     return undef;    ## no critic | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: Usage | ||
|  | #      PURPOSE: Display a usage message and exit | ||
|  | #   PARAMETERS: None | ||
|  | #      RETURNS: To command line level with exit value 1 | ||
|  | #  DESCRIPTION: Builds the usage message using global values | ||
|  | #       THROWS: no exceptions | ||
|  | #     COMMENTS: none | ||
|  | #     SEE ALSO: n/a | ||
|  | #=============================================================================== | ||
|  | sub Usage { | ||
|  |     print STDERR <<EOD; | ||
|  | $PROG v$VERSION | ||
|  | 
 | ||
|  | Usage: $PROG [options] | ||
|  | 
 | ||
|  | Scans STDIN in the assumption that it is plain text or some kind of hybrid | ||
|  | Markdown looking for URLs, mail links, HTML and Markdown elements. | ||
|  | 
 | ||
|  | Options: | ||
|  | 
 | ||
|  |     -help               Display this information | ||
|  |     -[no]simple         Turns 'simple' mode on or off; it is off by default. | ||
|  |                         In 'simple' mode links are not accumulated and written | ||
|  |                         at the end of the data in a 'Links' section. URLs are | ||
|  |                         converted to a Markdown form though. | ||
|  |     -[no]entities       Turns 'entities' mode on and off; it is off by | ||
|  |                         default. When on this mode causes each line to be | ||
|  |                         scanned for characters best represented as HTML | ||
|  |                         entities and to change them appropriately. For example | ||
|  |                         the '<' character becomes '<'. | ||
|  |     -[no]stamp          Turns 'stamp' mode on and off; it is off by default. | ||
|  |                         When this mode is on an HTML comment is written at the | ||
|  |                         start of the output containing the name of the script | ||
|  |                         and its version number. | ||
|  | 
 | ||
|  | 
 | ||
|  | EOD | ||
|  |     exit(1); | ||
|  | } | ||
|  | 
 | ||
|  | #===  FUNCTION  ================================================================ | ||
|  | #         NAME: Options | ||
|  | #      PURPOSE: Processes command-line options | ||
|  | #   PARAMETERS: $optref     Hash reference to hold the options | ||
|  | #      RETURNS: Undef | ||
|  | #  DESCRIPTION: | ||
|  | #       THROWS: no exceptions | ||
|  | #     COMMENTS: none | ||
|  | #     SEE ALSO: n/a | ||
|  | #=============================================================================== | ||
|  | sub Options { | ||
|  |     my ($optref) = @_; | ||
|  | 
 | ||
|  |     my @options = ( "help", "simple!", "entities!", "stamp!", ); | ||
|  | 
 | ||
|  |     if ( !GetOptions( $optref, @options ) ) { | ||
|  |         die "Failed to process options\n"; | ||
|  |     } | ||
|  | 
 | ||
|  |     return; | ||
|  | } | ||
|  | 
 | ||
|  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker |