hpr-tools/Show_Submission/do_pandoc

#!/bin/bash -
#===============================================================================
#
#         FILE: do_pandoc
#
#        USAGE: ./do_pandoc [option] <epno>
#
#  DESCRIPTION: Performs an "intelligent" Pandoc run on various types of show
#               notes. Converts various markup formats into HTML. Treats plain
#               text as Markdown, though this depends on the prior editing
#               step doing the right thing. Handles pictures and other assets
#               in plain text shows that have them - this is done by the edit
#               phase adding TT² macros and this script processing them with
#               'tpage'.
#               Version 0:2:5 (released 2022-12-04) has not yet been fully
#               tested. Seems reliable 2023-03-03.
#
#      OPTIONS: ---
# REQUIREMENTS: ---
#         BUGS: ---
#        NOTES: 2021-04-03: removed the TOC option
#               2021-11-07: Added --strip-comments to the HTML snippet
#               generation stage
#               2022-11-01: Big rewrite over the month. Refer to
#               do_pandoc_0.1.6 for the previous version since there have been
#               some big changes.
#               2022-12-17: Ending reliance on shownotes.txt, and using
#               shownotes.json instead into the future. Some massive tidying
#               and rationalisation are still required.
#               2023-03-03: If the title contained a quote then the previous
#               algorithm made bad YAML which caused the 'full' html to fail.
#               Fixed.
#               2023-11-15: The 'prefix' setting is wrong. It needs another
#               'hpr1234' directory level. This is needed because the
#               top-level stuff for the show is already in such a directory,
#               and while we have assets in a sub-directory we need to be
#               careful about collisions.
#
#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
#      VERSION: 0.2.11
#      CREATED: 2016-08-16 15:34:30
#     REVISION: 2024-10-18 23:03:25
#
#===============================================================================

set -u                                      # Using an unset variable is an error

# Name this script was invoked as, with any leading path removed
SCRIPT="${0##*/}"
#DIR=${0%/*}

VERSION='0.2.11'

# Destination for the usage text. Despite the name this is file descriptor 2.
STDOUT="/dev/fd/2"

#
# Pull in the shared function library; there is no point continuing without it
#
LIB="${HOME}/HPR/function_lib.sh"
if [[ ! -e $LIB ]]; then
    echo "$SCRIPT: Unable to source functions"
    exit 1
fi
# shellcheck source=/home/cendjm/HPR/function_lib.sh
. "$LIB"

#
# Colour codes (presumably this library function defines the ${red}, ${green},
# ${yellow} and ${reset} variables used in messages below - confirm in the
# library)
#
define_colours

# {{{ Functions: -- _usage --
#===  FUNCTION  ================================================================
#         NAME: _usage
#  DESCRIPTION: Writes the usage/help text to the file named in the global
#               $STDOUT and then terminates the script. The text is built
#               from the globals $SCRIPT and $VERSION.
#   PARAMETERS: None
#      RETURNS: Does not return. The script exits with the status of the
#               'cat' command that wrote the text (0 unless the write failed)
#===============================================================================
_usage () {
    # The redirection target is quoted so that a value containing whitespace
    # or glob characters cannot produce an "ambiguous redirect" error and
    # silently lose the help text.
    cat >"$STDOUT" <<-endusage
Usage: ./${SCRIPT} [-h] [-d] [-D] shownumber

Version: $VERSION

Runs Pandoc against a particular show, choosing a format as
defined by the declared format (in the file '.format').

(In this version there is no method to force an explicit input format)

Options:
  -h                    Print this help
  -d                    Select dry run mode
  -D                    Turn on debug mode with lots of extra output

The default behaviour is now to access the '.format' file in the show
directory.

Arguments:
    shownumber

Examples
  ./${SCRIPT} -h
  ./${SCRIPT} -d 2240
  ./${SCRIPT} -D 2250

endusage
    # A bare 'exit' passes on the status of the 'cat' above
    exit
}
# }}}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Formats offered by the web form on the HPR site:
#
# Keyword               Explanation
# -------               -----------
# plain_text            Plain text
# html5                 HTML5 (preferred)
# markdown_standard     Markdown (standard)
# Markdown_GitHub       Markdown (GitHub flavoured)
# Markdown_Pandoc       Markdown (Pandoc flavoured)
# restructured_text     RestructuredText
# txt2tags              txt2tags
#

#
# Hash to perform translation from declared format to Pandoc "-from" value
#
# {{{ -- 'lookup' hash --
# Keys are the format keywords from the web form; values are what is given to
# Pandoc's '-f' option for that format.
declare -A lookup=(
    [plain_text]='markdown-implicit_figures'
    [html5]='html'
    [markdown_standard]='markdown_strict'
    [Markdown_GitHub]='gfm'                 # Extensions are limited
    [Markdown_Pandoc]='markdown-implicit_figures'
    [restructured_text]='rst'
    [txt2tags]='t2t'
)
# }}}

#
# Hash for options when generating standalone readable HTML
#
# (The 'smart' extension is only applicable to markdown, commonmark, latex,
# mediawiki, org, rst, twiki; we want to turn it off to remove smart quotes)
#
# {{{ -- 'options' hash --
# Keys match those of the 'lookup' hash; values are Pandoc extension modifiers
# appended to the '-f' format when the standalone HTML is generated. An empty
# value means nothing is appended.
declare -A options=(
    [plain_text]='-smart'
    [html5]=''
    [markdown_standard]='+yaml_metadata_block'
    [Markdown_GitHub]=''
    [Markdown_Pandoc]='-smart+yaml_metadata_block'
    [restructured_text]='-smart'
    [txt2tags]=''
)
# }}}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#
# Sanity checks
#
# JQ=$(command -v jq)
# [ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }
# YQ=$(command -v yq)
# [ -n "$YQ" ] || { echo "Program 'yq' was not found"; exit 1; }

#
# Process options first
#
while getopts :dDh opt
do
    # The leading ':' in the option string selects getopts' silent mode, so
    # an unrecognised option is delivered here as '?' and is caught by the
    # default pattern.
    case "${opt}" in
        d) DRYRUN=1;;
        D) DEBUG=1;;
        h) _usage;;
        *) echo "$SCRIPT: Invalid option; aborting"; exit 1;;
    esac
done
shift $((OPTIND - 1))

#
# Default options if not provided
#
DEBUG=${DEBUG:-0}
DRYRUN=${DRYRUN:-0}

#
# Check there's exactly one argument after removing any options. If not, show
# the usage text and abort with a failure status. '_usage' ends with its own
# 'exit' (normally status 0), so it is run in a subshell here to let this
# script report the error to its caller with a non-zero status.
#
if [[ $# -ne 1 ]]; then
    ( _usage )
    exit 1
fi

#
# Declare variables for later
# TODO: Is this necessary?
#
declare SHOWID FROM POPTIONS

#
# Make the explicit show id, catering for leading zeroes (belt & braces).
#
# The argument must consist only of decimal digits. It is forced to base 10
# with '10#' because 'printf %d' treats a number with a leading zero as octal:
# without this '0123' would become 'hpr0083' and '0089' would be rejected as
# an invalid octal number.
#
if [[ ! $1 =~ ^[0-9]+$ ]]; then
    echo "$SCRIPT: ${red}Invalid show number '$1'${reset}"
    exit 1
fi
printf -v SHOWID 'hpr%04d' "$((10#$1))"

#
# Make temporary files and set traps to delete them. The trap is only set once
# all three files exist, so if a later 'mktemp' fails the files already made
# are removed explicitly before aborting.
#
TMP1=$(mktemp) || {
    echo "$SCRIPT: ${red}creation of temporary file failed!${reset}"
    exit 1
}
TMP2=$(mktemp) || {
    rm -f -- "$TMP1"
    echo "$SCRIPT: ${red}creation of temporary file failed!${reset}"
    exit 1
}
TMP3=$(mktemp) || {
    rm -f -- "$TMP1" "$TMP2"
    echo "$SCRIPT: ${red}creation of temporary file failed!${reset}"
    exit 1
}
# NOTE(review): 'cleanup_temp' comes from the function library and is not
# visible here. Whether the script stops after one of these signals depends on
# that function calling 'exit' - confirm.
trap 'cleanup_temp $TMP1 $TMP2 $TMP3' SIGHUP SIGINT SIGPIPE SIGTERM EXIT

#
# Paths to files
#
# ------------------------------------------------------------------------------
# Main directory
BASENAME="${HOME}/HPR/Show_Submission"

# Perl script which writes YAML from the JSON; give up if it is missing
J2Y="${BASENAME}/author_title.pl"
if [[ ! -e $J2Y ]]; then
    echo "Program '$J2Y' was not found"
    exit 1
fi

# Parent directory holding the notes of every show
SHOWNOTES="${BASENAME}/shownotes"

# Directory belonging to the show being processed
SHOWDIR="${SHOWNOTES}/${SHOWID}"

# Files in the show directory which this script reads or writes
#RAWFILE="$SHOWDIR/shownotes.txt"
JSONFILE="${SHOWDIR}/shownotes.json"
FMTFILE="${SHOWDIR}/.format"
PICLIST="${SHOWDIR}/.pictures"
STATUS="${SHOWDIR}/.status"
EXTRACT="${SHOWDIR}/${SHOWID}.out"
FULLHTML="${SHOWDIR}/${SHOWID}_full.html"
HTML="${SHOWDIR}/${SHOWID}.html"

# TT² macro definitions, and the picture manifest the macros need
PICTPL="${BASENAME}/pic_definitions.tpl"
MANIFEST="${SHOWDIR}/.pictures.mf"      # From do_pictures
# ------------------------------------------------------------------------------

#
# Leading part of the URL under which pictures live on the HPR site
#
BASEURL='https://hackerpublicradio.org/eps/'

#{{{ --- Obsolete 2022-12-17 ---
#
# Make a metadata file by parsing the raw data file fields with awk. Save it
# in a temporary file.
#
# See the original do_pandoc_0.1.6 for the extended comments that led to this
# design. We were trying to make acceptable YAML, but ended up extracting
# metadata from the result.
# ----
# TODO: 2022-12-01 Rationalise all of this; it's full of debris from previous
# attempts to solve the problem of passing metadata to Pandoc.
# ----
#
# awk -f - "$RAWFILE" > "$TMP1" <<'ENDAWK'
# BEGIN {print "---"}
# /^Title:/ && got_title == 0 {
#     sub(/^Title:\s+/,"")
#     printf "#title: %s\n",$0
#     gsub(/'/,"''")
#     printf "title: '%s'\n",$0
#     got_title = 1
# }
# /^Host_Name:/ && got_author == 0 {
#     sub(/^Host_Name:\s+/,"")
#     printf "#author: %s\n",$0
#     gsub(/'/,"''")
#     printf "author: '%s'\n",$0
#     got_author = 1
# }
# END  {print "---"}
# ENDAWK
#}}}

#
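# Parse the JSON and make the metadata (in the form of YAML) needed for
# Pandoc. This was first done with 'jq', then with 'yq', and is now done by
# the Perl script 'author_title.pl' (see the history below).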
#
# Non-YAML alternative - not chosen
# jqprog="@text \"author: \(.host.Host_Name)\ntitle: \(.episode.Title)\""
#
# Testing another formatter (Journal 2023-03-03)
# jqprog="@sh \"---\nauthor: \(.host.Host_Name)\ntitle: \(.episode.Title)\n---\""
# Added quotes around the generated strings (2023-03-31)
# jqprog="@text \"---\nauthor: \(.host.Host_Name)\ntitle: \(.episode.Title)\n---\""
#
# Moved to 'yq' 2023-04-01
# jqprog="@text \"---\nauthor: '\(.host.Host_Name)'\ntitle: '\(.episode.Title)'\n---\""
# jq -r "$jqprog" "$JSONFILE" > "$TMP1"
#
# On 2023-10-01 wrote a Perl JSON to YAML generator just for these two
# elements. It's called 'author_title.pl'
#
# yqprog='{author:.host.Host_Name,title:.episode.Title}'
# ( echo "---"; $YQ -y "$yqprog" "$JSONFILE"; echo "---"; ) > "$TMP1"
#
# Run the JSON to YAML converter, writing the metadata to $TMP1. The program
# path is quoted so a path containing whitespace still works. A failure is
# reported but is not fatal: the metadata is only consumed when the full HTML
# for proof reading is generated.
if ! "$J2Y" "$JSONFILE" "$TMP1"; then
    echo "$SCRIPT: ${red}Failed to generate YAML metadata from $JSONFILE${reset}"
    echo "${yellow}The full HTML for proof reading may be wrong or missing${reset}"
fi
_DEBUG "YAML:" "$(cat "$TMP1")"

#
# Check the main output file from do_parse exists
#
if [[ ! -e $EXTRACT ]]; then
    echo "$SCRIPT: ${red}File not found: $EXTRACT${reset}"
    exit 1
fi

#
# Get the format or fail with an error. The failure exits with a non-zero
# status so that a caller can tell that nothing was converted.
#
if [[ -e $FMTFILE ]]; then
    FORMAT=$(<"$FMTFILE")
else
    # TODO: Should we default to something rather than abort?
    echo "$SCRIPT: ${red}Could not find declared format (.format file)${reset}"
    echo "${yellow}Has do_parse been run? If so try and fix the .format file.${reset}"
    exit 1
fi

#
# Need to match plain text and Markdown variants when deciding to use the
# manifest file and 'tpage'
#
FMTRE='^(plain_text|[Mm]arkdown_)'

#
# Determine if there are pictures (shown by the existence of the '.pictures'
# file in the show directory)
#
if [[ -e $PICLIST ]]; then
    hasPictures=1
else
    hasPictures=0
fi

#
# Here we use the declared format to determine what to do.
#
# This code now blocks HTML->HTML processing since it just confuses matters.
# The files hprNNNN.out and hprNNNN.html are linked to one another, so editing
# the former will edit the latter in preparation for uploading if there are
# changes that need to be made. See the journal discussion for 2018-05-24.
# ---
# TODO: 2022-12-01 If the notes are HTML but declared as something else then
# this check will not work. Trouble is, parse_JSON will have worked this out
# but not saved it so we can't avoid running Pandoc on HTML in this case.
# ---
# TODO: 2022-12-01 Look at resolving this in parse_JSON?
# ---
#
#
# Reject an empty or unrecognised format before it is used as a hash key.
# Under 'nounset' a missing key would otherwise stop the script with a cryptic
# "unbound variable" (or "bad array subscript") message from the shell.
#
if [[ -z $FORMAT ]]; then
    echo "$SCRIPT: ${red}The declared format (.format file) is empty${reset}"
    exit 1
elif [[ -z ${lookup[$FORMAT]+known} || -z ${options[$FORMAT]+known} ]]; then
    echo "$SCRIPT: ${red}Unknown declared format '$FORMAT' (.format file)${reset}"
    exit 1
fi

if [[ $FORMAT == 'html5' ]]; then
    # HTML is never put through Pandoc. The bare 'exit' passes on the status
    # of the preceding 'echo'.
    echo "${red}Running Pandoc on HTML is not allowed${reset}"
    echo "${yellow}Run do_edit to edit as necessary and do_browser to view${reset}"
    exit
elif [[ $FORMAT == 'plain_text' ]]; then
    echo "${yellow}Format chosen is plain text${reset}"
    if [[ $hasPictures -eq 0 ]]; then
        echo "${yellow}This will be treated as Markdown${reset}"
    else
        echo "${yellow}This will be treated as Markdown and will need special action${reset}"
        echo "${yellow}since there are pictures${reset}"
    fi
fi

# Translate the declared format into the Pandoc input format and the extra
# extension modifiers used for the standalone HTML
FROM=${lookup[$FORMAT]}
POPTIONS=${options[$FORMAT]}
echo "${green}Will process $FORMAT with 'pandoc -f ${FROM}'${reset}"
echo "${yellow}Options chosen for --standalone are '${POPTIONS}'${reset}"

#
# Only for plain text notes, process pictures for the HTML we'll be adding to
# the database.
#
# We need the following things:
# - The notes to be plain text format
# - Some pictures
# - Files written by do_parse and do_pictures:
#       - .assets (not used here)
#       - .pictures (were previously used here, but no longer)
#       - .pictures.mf (needed by the TT² macros)
#       - .pictures.tt2 (the TT² macro calls - already edited into the notes)
#       - pic_definitions.tpl (macro definitions, common to all shows)
# - Not to be in dry run mode; if we are we just report intentions
#
if [[ $DRYRUN -eq 0 ]]; then
    # if [[ $FORMAT == 'plain_text' && -e $MANIFEST ]]; then
    if [[ $FORMAT =~ $FMTRE && -e $MANIFEST ]]; then
        #
        # Deal with pictures using the TT² macros
        #
        _DEBUG "Processing TT² inclusions"

        # Make a picture manifest with a header. Each pair of input lines is
        # joined into one "file : thumb" output line (this assumes the
        # manifest always holds its lines in pairs - TODO confirm).
        if ! awk 'BEGIN{print "file : thumb"}{p1=$0; getline p2; printf "%s : %s\n",p1,p2}' \
            "$MANIFEST" > "$TMP2"; then
            echo "$SCRIPT: ${red}Failed to build the picture list from $MANIFEST${reset}"
            exit 1
        fi

        _DEBUG "Picture list:" "$(cat "$TMP2")" "---" \
            "BASEURL=${BASEURL}${SHOWID}/" \
            "EXTRACT=$EXTRACT" \
            "Extract file contents:" "$(cat "$EXTRACT")" "---"

        # Run the macros on the notes to make $TMP3 for Pandoc. Stop if this
        # fails, otherwise Pandoc would be given empty or partial notes and
        # would still report success.
        if ! tpage --pre_process="$PICTPL" \
            --define "piclist=$TMP2" \
            --define "prefix=${BASEURL}${SHOWID}/${SHOWID}/" \
            "$EXTRACT" > "$TMP3"; then
            echo "$SCRIPT: ${red}tpage failed to process $EXTRACT${reset}"
            echo "${yellow}$SCRIPT: Aborting now${reset}"
            exit 1
        fi

        _DEBUG "Processed by tpage" "$(cat "$TMP3")" "---"
    else
        # Either the notes are not plain text or a Markdown variant, or there
        # is no picture manifest, so put the notes unchanged in $TMP3 where
        # Pandoc will look for them
        cat "$EXTRACT" > "$TMP3"
    fi
else
    #
    # We would not have used TT² unless the notes were plain text (or
    # a Markdown variant) and there were pictures
    #
    # if [[ $FORMAT == 'plain_text' && -e $MANIFEST ]]; then
    if [[ $FORMAT =~ $FMTRE && -e $MANIFEST ]]; then
        echo "${yellow}Would have prepared TT² code for pandoc${reset}"
    fi
fi

#
# Generate an HTML snippet for adding to the database.
# (Note 2021-11-24: Added --ascii option.)
#
if [[ $DRYRUN -ne 0 ]]; then
    # Dry run: only say what would have happened
    echo "${yellow}Would have run pandoc to make HTML for upload${reset}"
else
    # Convert the prepared notes in $TMP3 into an HTML fragment in $HTML.
    # NOTE(review): '-smart' is appended here for every input format, whereas
    # the 'options' hash leaves it off for some formats - confirm that this
    # is intended.
    pandoc --from="${FROM}-smart" --to=html5 --ascii --no-highlight \
        --strip-comments --output="$HTML" "$TMP3"
    RES=$?

    # Without this file there is nothing to upload, so a failure is fatal
    if [[ $RES -ne 0 ]]; then
        echo "$SCRIPT: ${red}Oops! Something went wrong! (line $LINENO)${reset}"
        echo "${yellow}$SCRIPT: Aborting now${reset}"
        exit 1
    fi

    echo "$SCRIPT: ${green}Created shownotes/$SHOWID/${HTML##*/}${reset}"
fi

#
# Make HTML for proof reading. All pictures referenced are now on the HPR
# server (we ran 'do_asset_upload'), so we want to refer to them here.
#
# File $TMP2 contains the .pictures.mf contents with a header line; and it
# contains data for the macros. It was created when we prepared the main HTML
# for the database. We use $BASEURL again here because we want to reference
# the pictures on the server.
#
# We use the YAML metadata file in $TMP1 from earlier (originally made with
# awk, then with yq, and now with 'author_title.pl') to do this. At the end
# TMP3 contains Markdown for Pandoc.
#
if [[ $DRYRUN -eq 0 ]]; then
    #
    # The notes for Pandoc are already in $TMP3. They were put there, with
    # the same 'tpage' settings where pictures are involved, when the HTML for
    # upload was prepared earlier in this script. They are not regenerated
    # here, since that would repeat the work and a failure would overwrite
    # good input.
    #

    #
    # Generate complete HTML that we can proofread. We need metadata for this
    # stand-alone HTML which is in the form of YAML in this version.
    #
    # ----------------------------------------------------------------------
    # Original options below when using 'awk' to parse shownotes.txt:
    # --metadata="$(sed -n '/^#author:/{s/#//;p}' "$TMP1")" \
    # --metadata="$(sed -n '/^#title:/{s/#//;p}' "$TMP1")" \
    #
    # The input format and its extension modifiers are joined into a single
    # quoted word for the '-f' option.
    pandoc -f "${FROM}${POPTIONS}" -t html5 --ascii \
        --standalone --template=hpr_dev.html5 --no-highlight \
        -c https://hackerpublicradio.org/css/hpr.css \
        --metadata-file="$TMP1" -o "$FULLHTML" "$TMP3"
    RES=$?

    # A failure here is reported but is not fatal; the script carries on
    if [[ $RES -eq 0 ]]; then
        echo "$SCRIPT: ${green}Created shownotes/$SHOWID/${FULLHTML##*/}${reset}"
    else
        echo "$SCRIPT: ${red}Oops! Something went wrong making the full HTML! (line $LINENO)${reset}"
    fi
else
    # Dry run
    echo "${yellow}Would have run pandoc to make HTML for proof reading${reset}"
fi

#
# Set the status for this show
#
# Append the new state to the show's status file, but not in a dry run
if (( DRYRUN == 0 )); then
    printf '%s\n' 'converted' >> "$STATUS"
fi

# Finish with the status of the last command run
exit

# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker