#!/bin/bash -
#===============================================================================
#
#         FILE: do_pandoc
#
#        USAGE: ./do_pandoc [-h] [-d] [-D] shownumber
#
#  DESCRIPTION: Performs an "intelligent" Pandoc run on various types of show
#               notes. Converts various markup formats into HTML. Treats plain
#               text as Markdown, though this depends on the prior editing
#               step doing the right thing. Handles pictures and other assets
#               in plain text shows that have them - this is done by the edit
#               phase adding TT² macros and this script processing them with
#               'tpage'.
#               Version 0:2:5 (released 2022-12-04) has not yet been fully
#               tested. Seems reliable 2023-03-03.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: 2021-04-03: removed the TOC option
#               2021-11-07: Added --strip-comments to the HTML snippet
#               generation stage
#               2022-11-01: Big rewrite over the month. Refer to
#               do_pandoc_0.1.6 for the previous version since there have been
#               some big changes.
#               2022-12-17: Ending reliance on shownotes.txt, and using
#               shownotes.json instead into the future. Some massive tidying
#               and rationalisation are still required.
#               2023-03-03: If the title contained a quote then the previous
#               algorithm made bad YAML which caused the 'full' html to fail.
#               Fixed.
#               2023-11-15: The 'prefix' setting is wrong. It needs another
#               'hpr1234' directory level. This is needed because the
#               top-level stuff for the show is already in such a directory,
#               and while we have assets in a sub-directory we need to be
#               careful about collisions.
#
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.2.10
#      CREATED: 2016-08-16 15:34:30
#     REVISION: 2024-02-18 13:27:40
#
#===============================================================================

set -o nounset                              # Treat unset variables as an error

SCRIPT=${0##*/}
#DIR=${0%/*}

VERSION='0.2.10'

# Despite its name this is file descriptor 2 (stderr); the usage text is
# written there.
STDOUT="/dev/fd/2"

#
# Load library functions
#
LIB="$HOME/bin/function_lib.sh"
[ -e "$LIB" ] || { echo "$SCRIPT: Unable to source functions"; exit 1; }
# shellcheck source=/home/cendjm/bin/function_lib.sh
source "$LIB"

#
# Colour codes
# (presumably defines the 'red', 'green', 'yellow' and 'reset' variables used
# in the messages below; the function comes from the library - TODO confirm)
#
define_colours

# {{{ Functions: -- _usage -- _DEBUG --

#=== FUNCTION ================================================================
#         NAME: _usage
#  DESCRIPTION: Report usage on $STDOUT and terminate the script
#   PARAMETERS: $1 - optional exit status (default 0, which is what '-h'
#               needs); pass a non-zero value when reporting a usage error
#      RETURNS: Nothing (the script exits)
#===============================================================================
_usage () {
    local status=${1:-0}

    # '<<-' only strips leading tabs, so the text below starts in column 0
    cat >"$STDOUT" <<-endusage
Usage: ./${SCRIPT} [-h] [-d] [-D] shownumber

Version: $VERSION

Runs Pandoc against a particular show, choosing a format as defined by the
declared format (in the file '.format').

(In this version there is no method to force an explicit input format)

Options:
  -h                    Print this help
  -d                    Select dry run mode
  -D                    Turn on debug mode with lots of extra output

The default behaviour is now to access the '.format' file in the show
directory.

Arguments:
    shownumber

Examples
    ./${SCRIPT} -h
    ./${SCRIPT} -d 2240
    ./${SCRIPT} -D 2250

endusage
    exit "$status"
}

#=== FUNCTION ================================================================
#         NAME: _DEBUG
#  DESCRIPTION: Writes one or more message lines if in DEBUG mode. Each
#               argument is printed on its own line with a 'D> ' prefix.
#   PARAMETERS: List of messages
#      RETURNS: Nothing
#===============================================================================
_DEBUG () {
    [ "$DEBUG" == 0 ] && return

    # Keep the loop variable out of the global namespace
    local msg
    for msg in "$@"; do
        printf 'D> %s\n' "$msg"
    done
}

# }}}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Formats offered by the web form on the HPR site:
#
# Keyword              Explanation
# -------              -----------
# plain_text           Plain text
# html5                HTML5 (preferred)
# markdown_standard    Markdown (standard)
# Markdown_GitHub      Markdown (GitHub flavoured)
# Markdown_Pandoc      Markdown (Pandoc flavoured)
# restructured_text    RestructuredText
# txt2tags             txt2tags
#

#
# Hash to perform translation from declared format to Pandoc "-from" value
#
# {{{ -- 'lookup' hash --
declare -A lookup=(
    [plain_text]='markdown-implicit_figures'
    [html5]='html'
    [markdown_standard]='markdown_strict'
    [Markdown_GitHub]='gfm'                 # Extensions are limited
    [Markdown_Pandoc]='markdown-implicit_figures'
    [restructured_text]='rst'
    [txt2tags]='t2t'
)
# }}}

#
# Hash for options when generating standalone readable HTML. The value is
# appended directly to the "-from" format name, so it must be a (possibly
# empty) string of Pandoc extension modifiers. It has the same keys as
# 'lookup'.
#
# (The 'smart' extension is only applicable to markdown, commonmark, latex,
# mediawiki, org, rst, twiki; we want to turn it off to remove smart quotes)
#
# {{{ -- 'options' hash --
declare -A options=(
    [plain_text]='-smart'
    [html5]=''
    [markdown_standard]='+yaml_metadata_block'
    [Markdown_GitHub]=''
    [Markdown_Pandoc]='-smart+yaml_metadata_block'
    [restructured_text]='-smart'
    [txt2tags]=''
)
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#
# Sanity checks
#
JQ=$(command -v jq)
[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }
# YQ=$(command -v yq)
# [ -n "$YQ" ] || { echo "Program 'yq' was not found"; exit 1; }

#
# Process options first
#
while getopts :dDh opt
do
    case "${opt}" in
        d) DRYRUN=1;;
        D) DEBUG=1;;
        h) _usage;;
        *) echo "$SCRIPT: Invalid option; aborting"; exit 1;;
    esac
done
shift $((OPTIND - 1))

#
# Default options if not provided
#
DEBUG=${DEBUG:-0}
DRYRUN=${DRYRUN:-0}

#
# Check there's an argument after removing any options. Abort if not, asking
# the usage function for a failure exit status since this is an error.
#
if [[ $# -ne 1 ]]; then
    _usage 1
fi

#
# Declare variables for later
# TODO: Is this necessary?
#
declare SHOWID FROM POPTIONS

#
# Make the explicit show id, catering for leading zeroes (belt & braces).
# The argument must be all digits. It is forced to base 10 because printf
# would otherwise read a leading zero as an octal prefix ('0123' would become
# 83 and '0089' would be rejected).
#
if [[ ! $1 =~ ^[0-9]+$ ]]; then
    echo "$SCRIPT: ${red}Invalid show number '$1'; it must be numeric${reset}"
    exit 1
fi
printf -v SHOWID 'hpr%04d' "$((10#$1))"

#
# Make temporary files and set traps to delete them. The trap is only set
# once all three exist, so a later failure removes the earlier files itself.
#
TMP1=$(mktemp) || {
    echo "$SCRIPT: ${red}creation of temporary file failed!${reset}"
    exit 1
}
TMP2=$(mktemp) || {
    echo "$SCRIPT: ${red}creation of temporary file failed!${reset}"
    rm -f -- "$TMP1"
    exit 1
}
TMP3=$(mktemp) || {
    echo "$SCRIPT: ${red}creation of temporary file failed!${reset}"
    rm -f -- "$TMP1" "$TMP2"
    exit 1
}
trap 'cleanup_temp "$TMP1" "$TMP2" "$TMP3"' SIGHUP SIGINT SIGPIPE SIGTERM EXIT

#
# Paths to files
#
# ------------------------------------------------------------------------------
# Main directory
BASENAME="$HOME/HPR/Show_Submission"

# JSON to YAML Perl script
J2Y="$BASENAME/author_title.pl"
[ -e "$J2Y" ] || { echo "Program '$J2Y' was not found"; exit 1; }

# The notes for all shows are here
SHOWNOTES="$BASENAME/shownotes"

# Notes for this show are here
SHOWDIR="$SHOWNOTES/$SHOWID"

# Paths to all files already created or being created here
#RAWFILE="$SHOWDIR/shownotes.txt"
JSONFILE="$SHOWDIR/shownotes.json"
FMTFILE="$SHOWDIR/.format"
PICLIST="$SHOWDIR/.pictures"
STATUS="$SHOWDIR/.status"
EXTRACT="$SHOWDIR/${SHOWID}.out"
FULLHTML="$SHOWDIR/${SHOWID}_full.html"
HTML="$SHOWDIR/${SHOWID}.html"

# TT² macros and paths for adding pictures
PICTPL="$BASENAME/pic_definitions.tpl"
MANIFEST="$SHOWDIR/.pictures.mf"    # From do_pictures
# ------------------------------------------------------------------------------

#
# The partial URL for pictures on the HPR site
#
BASEURL='https://hackerpublicradio.org/eps/'

#{{{ --- Obsolete 2022-12-17 ---
#
# Make a metadata file by parsing the raw data file fields with awk. Save it
# in a temporary file.
#
# See the original do_pandoc_0.1.6 for the extended comments that led to this
# design. We were trying to make acceptable YAML, but ended up extracting
# metadata from the result.
# ----
# TODO: 2022-12-01 Rationalise all of this; it's full of debris from previous
# attempts to solve the problem of passing metadata to Pandoc.
# ----
#
# awk -f - "$RAWFILE" > "$TMP1" <<'ENDAWK'
# BEGIN {print "---"}
# /^Title:/ && got_title == 0 {
#     sub(/^Title:\s+/,"")
#     printf "#title: %s\n",$0
#     gsub(/'/,"''")
#     printf "title: '%s'\n",$0
#     got_title = 1
# }
# /^Host_Name:/ && got_author == 0 {
#     sub(/^Host_Name:\s+/,"")
#     printf "#author: %s\n",$0
#     gsub(/'/,"''")
#     printf "author: '%s'\n",$0
#     got_author = 1
# }
# END {print "---"}
# ENDAWK
#}}}

#
# Use 'jq' to parse the JSON and make the metadata (in the form of YAML)
# needed for Pandoc
#
# Non-YAML alternative - not chosen
#jqprog="@text \"author: \(.host.Host_Name)\ntitle: \(.episode.Title)\""
#
# Testing another formatter (Journal 2023-03-03)
#jqprog="@sh \"---\nauthor: \(.host.Host_Name)\ntitle: \(.episode.Title)\n---\""
# Added quotes around the generated strings (2023-03-31)
# jqprog="@text \"---\nauthor: \(.host.Host_Name)\ntitle: \(.episode.Title)\n---\""
# Moved to 'yq' 2023-04-01
# jqprog="@text \"---\nauthor: '\(.host.Host_Name)'\ntitle: '\(.episode.Title)'\n---\""
# jq -r "$jqprog" "$JSONFILE" > "$TMP1"

# On 2023-10-01 wrote a Perl JSON to YAML generator just for these two
# elements. It's called 'author_title.pl'
# yqprog='{author:.host.Host_Name,title:.episode.Title}'
# ( echo "---"; $YQ -y "$yqprog" "$JSONFILE"; echo "---"; ) > "$TMP1"

#
# Run the generator, which is given the JSON file and $TMP1 as its arguments;
# $TMP1 is later handed to Pandoc as the metadata file. A failure is reported
# but is not fatal, since only the proof-reading HTML uses the metadata.
#
if ! "$J2Y" "$JSONFILE" "$TMP1"; then
    echo "$SCRIPT: ${yellow}Warning: '${J2Y##*/}' reported a failure; metadata may be missing${reset}"
fi
_DEBUG "YAML:" "$(<"$TMP1")"

#
# Check the main output file from do_parse exists
#
if [[ ! -e $EXTRACT ]]; then
    echo "$SCRIPT: ${red}File not found: $EXTRACT${reset}"
    exit 1
fi

#
# Get the format or fail with an error
#
if [[ -e $FMTFILE ]]; then
    FORMAT=$(<"$FMTFILE")
else
    # TODO: Should we default to something rather than abort?
    echo "$SCRIPT: ${red}Could not find declared format (.format file)${reset}"
    echo "${yellow}Has do_parse been run? If so try and fix the .format file.${reset}"
    # This is a failure, so report it through the exit status as well
    exit 1
fi

#
# Need to match plain text and Markdown variants when deciding to use the
# manifest file and 'tpage'
#
FMTRE='^(plain_text|[Mm]arkdown_)'

#
# Determine if there are pictures
#
if [[ -e $PICLIST ]]; then
    hasPictures=1
else
    hasPictures=0
fi

#
# Here we use the declared format to determine what to do.
#
# This code now blocks HTML->HTML processing since it just confuses matters.
# The files hprNNNN.out and hprNNNN.html are linked to one another, so editing
# the former will edit the latter in preparation for uploading if there are
# changes that need to be made. See the journal discussion for 2018-05-24.
# ---
# TODO: 2022-12-01 If the notes are HTML but declared as something else then
# this check will not work. Trouble is, parse_JSON will have worked this out
# but not saved it so we can't avoid running Pandoc on HTML in this case.
# ---
# TODO: 2022-12-01 Look at resolving this in parse_JSON?
# ---
#
if [[ $FORMAT == 'html5' ]]; then
    echo "${red}Running Pandoc on HTML is not allowed${reset}"
    echo "${yellow}Run do_edit to edit as necessary and do_browser to view${reset}"
    # A bare 'exit' passes on the status of the preceding echo
    exit
elif [[ $FORMAT == 'plain_text' ]]; then
    echo "${yellow}Format chosen is plain text${reset}"
    if [[ $hasPictures -eq 0 ]]; then
        echo "${yellow}This will be treated as Markdown${reset}"
    else
        echo "${yellow}This will be treated as Markdown and will need special action${reset}"
        echo "${yellow}since there are pictures${reset}"
    fi
fi

#
# Refuse a declared format that has no entry in the 'lookup' hash. Without
# this check an unknown or empty format makes the hash lookup below stop the
# script under 'nounset' with only a cryptic shell message. The '+set' form of
# expansion is safe to use on a missing key.
#
if [[ -z $FORMAT || -z ${lookup[$FORMAT]+set} ]]; then
    echo "$SCRIPT: ${red}Unsupported format '$FORMAT' in the .format file${reset}"
    exit 1
fi

FROM=${lookup[$FORMAT]}
POPTIONS=${options[$FORMAT]}

echo "${green}Will process $FORMAT with 'pandoc -f ${FROM}'${reset}"
echo "${yellow}Options chosen for --standalone are '${POPTIONS}'${reset}"

#
# Only for plain text notes, process pictures for the HTML we'll be adding to
# the database.
#
# We need the following things:
# - The notes to be plain text format
# - Some pictures
# - Files written by do_parse and do_pictures:
#   - .assets (not used here)
#   - .pictures (were previously used here, but no longer)
#   - .pictures.mf (needed by the TT² macros)
#   - .pictures.tt2 (the TT² macro calls - already edited into the notes)
#   - pic_definitions.tpl (macro definitions, common to all shows)
# - Not to be in dry run mode; if we are we just report intentions
#
# Either way, when not in dry run mode, $TMP3 ends up holding the text that
# both Pandoc runs below read.
#
if [[ $DRYRUN -eq 0 ]]; then
    # if [[ $FORMAT == 'plain_text' && -e $MANIFEST ]]; then
    if [[ $FORMAT =~ $FMTRE && -e $MANIFEST ]]; then
        #
        # Deal with pictures using the TT² macros
        #
        _DEBUG "Processing TT² inclusions"

        # Make a picture manifest with a header. The manifest is read two
        # lines at a time and each pair is written as 'first : second'.
        awk 'BEGIN{print "file : thumb"}{p1=$0; getline p2; printf "%s : %s\n",p1,p2}' \
            "$MANIFEST" > "$TMP2"
        _DEBUG "Picture list:" "$(cat "$TMP2")" "---" \
            "BASEURL=${BASEURL}${SHOWID}/" \
            "EXTRACT=$EXTRACT" \
            "Extract file contents:" "$(cat "$EXTRACT")" "---"

        # Run the macros on the notes to make $TMP3 for Pandoc. Stop if this
        # fails rather than converting an empty or truncated file.
        if ! tpage --pre_process="$PICTPL" \
            --define "piclist=$TMP2" \
            --define "prefix=${BASEURL}${SHOWID}/${SHOWID}/" \
            "$EXTRACT" > "$TMP3"
        then
            echo "$SCRIPT: ${red}tpage failed to expand the picture macros${reset}"
            echo "${yellow}$SCRIPT: Aborting now${reset}"
            exit 1
        fi
        _DEBUG "Processed by tpage" "$(cat "$TMP3")" "---"
    else
        # Not plain text or a Markdown variant and no pictures, so put the
        # notes in $TMP3 where Pandoc will look for them
        cat "$EXTRACT" > "$TMP3"
    fi
else
    #
    # We would not have used TT² unless the notes were plain text (or
    # a Markdown variant) and there were pictures
    #
    # if [[ $FORMAT == 'plain_text' && -e $MANIFEST ]]; then
    if [[ $FORMAT =~ $FMTRE && -e $MANIFEST ]]; then
        echo "${yellow}Would have prepared TT² code for pandoc${reset}"
    fi
fi

#
# Generate an HTML snippet for adding to the database.
# (Note 2021-11-24: Added --ascii option.)
# Note that '-smart' is always appended to the input format here, whatever
# the 'options' hash holds for this format.
#
if [[ $DRYRUN -eq 0 ]]; then
    pandoc -f "${FROM}-smart" -t html5 --ascii --no-highlight --strip-comments \
        "$TMP3" -o "$HTML" # $EXTRAS
    RES=$?
    if [[ $RES -eq 0 ]]; then
        echo "$SCRIPT: ${green}Created shownotes/$SHOWID/${HTML##*/}${reset}"
    else
        echo "$SCRIPT: ${red}Oops! Something went wrong! (line $LINENO)${reset}"
        echo "${yellow}$SCRIPT: Aborting now${reset}"
        exit 1
    fi
else
    echo "${yellow}Would have run pandoc to make HTML for upload${reset}"
fi

#
# Make HTML for proof reading. All pictures referenced are now on the HPR
# server (we ran 'do_asset_upload'), so we want to refer to them here.
#
# The input is the same $TMP3 that was prepared above for the database
# snippet: it was built from the same notes, manifest and $BASEURL prefix, and
# nothing has written to it since, so it is reused rather than rebuilt.
#
# The metadata comes from the YAML file in $TMP1 which was written earlier by
# the 'author_title.pl' script.
#
if [[ $DRYRUN -eq 0 ]]; then
    #
    # Generate complete HTML that we can proofread. We need metadata for this
    # stand-alone HTML which is in the form of YAML in this version.
    #
    # ----------------------------------------------------------------------
    # Original options below when using 'awk' to parse shownotes.txt:
    #    --metadata="$(sed -n '/^#author:/{s/#//;p}' "$TMP1")" \
    #    --metadata="$(sed -n '/^#title:/{s/#//;p}' "$TMP1")" \
    #
    pandoc -f "${FROM}${POPTIONS}" -t html5 --ascii \
        --standalone --template=hpr.html5 --no-highlight \
        -c https://hackerpublicradio.org/css/hpr.css \
        --metadata-file="$TMP1" -o "$FULLHTML" "$TMP3"
    RES=$?
    if [[ $RES -eq 0 ]]; then
        echo "$SCRIPT: ${green}Created shownotes/$SHOWID/${FULLHTML##*/}${reset}"
    else
        # Reported but not fatal: the script carries on to record the status
        echo "$SCRIPT: ${red}Oops! Something went wrong making the full HTML! (line $LINENO)${reset}"
    fi
else
    # Dry run
    echo "${yellow}Would have run pandoc to make HTML for proof reading${reset}"
fi

#
# Set the status for this show
#
if [[ $DRYRUN -eq 0 ]]; then
    echo "converted" >> "$STATUS"
fi

exit

# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker