hpr-tools/Show_Submission/do_pandoc

525 lines
16 KiB
Bash
Executable File

#!/bin/bash -
#===============================================================================
#
# FILE: do_pandoc
#
# USAGE: ./do_pandoc [option] <epno>
#
# DESCRIPTION: Performs an "intelligent" Pandoc run on various types of show
# notes. Converts various markup formats into HTML. Treats plain
# text as Markdown, though this depends on the prior editing
# step doing the right thing. Handles pictures and other assets
# in plain text shows that have them - this is done by the edit
# phase adding TT² macros and this script processing them with
# 'tpage'.
# Version 0:2:5 (released 2022-12-04) has not yet been fully
# tested. Seems reliable 2023-03-03.
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: 2021-04-03: removed the TOC option
# 2021-11-07: Added --strip-comments to the HTML snippet
# generation stage
# 2022-11-01: Big rewrite over the month. Refer to
# do_pandoc_0.1.6 for the previous version since there have been
# some big changes.
# 2022-12-17: Ending reliance on shownotes.txt, and using
# shownotes.json instead into the future. Some massive tidying
# and rationalisation are still required.
# 2023-03-03: If the title contained a quote then the previous
# algorithm made bad YAML which caused the 'full' html to fail.
# Fixed.
# 2023-11-15: The 'prefix' setting is wrong. It needs another
# 'hpr1234' directory level. This is needed because the
# top-level stuff for the show is already in such a directory,
# and while we have assets in a sub-directory we need to be
# careful about collisions.
#
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.2.10
# CREATED: 2016-08-16 15:34:30
# REVISION: 2024-02-18 13:27:40
#
#===============================================================================
# Abort on any reference to an unset variable
set -u

# Name of this script with any leading path removed; used in messages
SCRIPT="${0##*/}"
#DIR=${0%/*}
VERSION='0.2.10'

# NOTE: despite the name this is file descriptor 2 (standard error); it is
# where _usage sends the help text
STDOUT='/dev/fd/2'

#
# Load library functions, giving up if the library file does not exist
#
LIB="${HOME}/bin/function_lib.sh"
if [[ ! -e $LIB ]]; then
    echo "$SCRIPT: Unable to source functions"
    exit 1
fi
# shellcheck source=/home/cendjm/bin/function_lib.sh
source "$LIB"

#
# Colour codes. 'define_colours' is not defined in this file (presumably it
# comes from the library sourced above - confirm); later messages rely on
# variables such as 'red', 'green', 'yellow' and 'reset'.
#
define_colours
# {{{ Functions: -- _usage -- _DEBUG --
#===  FUNCTION  ================================================================
#         NAME: _usage
#  DESCRIPTION: Writes the help text to the file named in $STDOUT and then
#               terminates the script (the exit status is that of the write)
#   PARAMETERS: None
#      GLOBALS: SCRIPT, VERSION, STDOUT (all read)
#      RETURNS: Does not return; it exits
#===============================================================================
_usage () {
    # One array element per line of help text
    local -a help_text=(
        "Usage: ./${SCRIPT} [-h] [-d] [-D] shownumber"
        "Version: $VERSION"
        "Runs Pandoc against a particular show, choosing a format as"
        "defined by the declared format (in the file '.format')."
        "(In this version there is no method to force an explicit input format)"
        "Options:"
        "-h Print this help"
        "-d Select dry run mode"
        "-D Turn on debug mode with lots of extra output"
        "The default behaviour is now to access the '.format' file in the show"
        "directory."
        "Arguments:"
        "shownumber"
        "Examples"
        "./${SCRIPT} -h"
        "./${SCRIPT} -d 2240"
        "./${SCRIPT} -D 2250"
    )

    printf '%s\n' "${help_text[@]}" >"$STDOUT"
    exit
}
#===  FUNCTION  ================================================================
#         NAME: _DEBUG
#  DESCRIPTION: Writes one or more message lines if in DEBUG mode. Each
#               argument is written to standard output on a line of its own,
#               prefixed with 'D> '. Nothing is written when DEBUG is 0 or
#               unset, or when no messages are given.
#   PARAMETERS: List of messages
#      GLOBALS: DEBUG (read)
#      RETURNS: Nothing
#===============================================================================
_DEBUG () {
    # An unset DEBUG counts as "off"; without the default the script would
    # abort here under 'set -o nounset' if called before DEBUG is assigned
    [[ ${DEBUG-0} == 0 ]] && return 0

    # printf with no arguments would still print one 'D> ' line, so stop now
    (( $# > 0 )) || return 0

    # printf re-uses the format for every argument, so no loop (and no loop
    # variable leaking into the global scope) is needed
    printf 'D> %s\n' "$@"
}
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Formats offered by the web form on the HPR site:
#
# Keyword Explanation
# ------- -----------
# plain_text Plain text
# html5 HTML5 (preferred)
# markdown_standard Markdown (standard)
# Markdown_GitHub Markdown (GitHub flavoured)
# Markdown_Pandoc Markdown (Pandoc flavoured)
# restructured_text RestructuredText
# txt2tags txt2tags
#
#
# Hash to perform translation from declared format to Pandoc "-from" value
#
# {{{ -- 'lookup' hash --
# Key: format keyword declared on the web form; value: Pandoc '--from' format
declare -A lookup=(
    [plain_text]='markdown-implicit_figures'
    [html5]='html'
    [markdown_standard]='markdown_strict'
    [Markdown_GitHub]='gfm'     # Extensions are limited
    [Markdown_Pandoc]='markdown-implicit_figures'
    [restructured_text]='rst'
    [txt2tags]='t2t'
)
# }}}
#
# Hash for options when generating standalone readable HTML
#
# (The 'smart' extension is only applicable to markdown, commonmark, latex,
# mediawiki, org, rst, twiki; we want to turn it off to remove smart quotes)
#
# {{{ -- 'options' hash --
# Key: declared format keyword; value: extension modifiers appended to the
# Pandoc input format when making the standalone HTML (may be empty)
declare -A options=(
    [plain_text]='-smart'
    [html5]=''
    [markdown_standard]='+yaml_metadata_block'
    [Markdown_GitHub]=''
    [Markdown_Pandoc]='-smart+yaml_metadata_block'
    [restructured_text]='-smart'
    [txt2tags]=''
)
# }}}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Sanity checks
#
# Confirm that 'jq' can be found on the PATH; abort if not
JQ=$(command -v jq)
[ -n "$JQ" ] || { echo "Program 'jq' was not found"; exit 1; }
# YQ=$(command -v yq)
# [ -n "$YQ" ] || { echo "Program 'yq' was not found"; exit 1; }
#
# Process options first. The leading ':' in the option string selects
# getopts' silent error mode, so we report invalid options ourselves.
#
while getopts :dDh opt
do
    case "${opt}" in
        d) DRYRUN=1;;
        D) DEBUG=1;;
        h) _usage;;
        # Anything else ('?' is what getopts returns for an unknown option)
        *) echo "$SCRIPT: Invalid option; aborting"; exit 1;;
    esac
done
shift $((OPTIND - 1))
#
# Default options if not provided
#
DEBUG=${DEBUG:-0}
DRYRUN=${DRYRUN:-0}
#
# Check there's an argument after removing any options. Abort if not.
#
# _usage finishes with a bare 'exit', which would report success. That is
# right for '-h' but wrong here, so run it in a subshell (which confines its
# 'exit') and then leave with a failure status of our own.
#
if [[ $# -ne 1 ]]; then
    ( _usage )
    exit 1
fi
#
# Declare variables for later
# TODO: Is this necessary?
#
declare SHOWID FROM POPTIONS
#
# Make the explicit show id, catering for leading zeroes (belt & braces).
#
# The argument must consist of digits only. It is forced to base 10 before
# formatting because printf's %d treats a leading zero as an octal prefix:
# '0123' would otherwise become 'hpr0083', and '0089' would be rejected as an
# invalid octal number and produce 'hpr0000'.
#
if [[ ! $1 =~ ^[0-9]+$ ]]; then
    echo "$SCRIPT: ${red}Show number must be numeric; got '$1'${reset}"
    exit 1
fi
printf -v SHOWID 'hpr%04d' "$((10#$1))"
#
# Make temporary files and set traps to delete them
#
# Create TMP1, TMP2 and TMP3 in turn, stopping at the first failure
for tmp_name in TMP1 TMP2 TMP3; do
    if ! tmp_path=$(mktemp); then
        echo "$SCRIPT: ${red}creation of temporary file failed!${reset}"
        exit 1
    fi
    # Assign the new path to the variable whose name is in $tmp_name
    printf -v "$tmp_name" '%s' "$tmp_path"
done
unset tmp_name tmp_path

# Hand the three files to cleanup_temp (not defined in this file) when the
# script exits or receives one of the listed signals
trap 'cleanup_temp $TMP1 $TMP2 $TMP3' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
#
# Paths to files
#
# ------------------------------------------------------------------------------
# Main directory
# Main directory
BASENAME="${HOME}/HPR/Show_Submission"

# JSON to YAML Perl script; it must exist or we cannot continue
J2Y="${BASENAME}/author_title.pl"
if [[ ! -e $J2Y ]]; then
    echo "Program '$J2Y' was not found"
    exit 1
fi

# The notes for all shows are here
SHOWNOTES="${BASENAME}/shownotes"

# Notes for this show are here
SHOWDIR="${SHOWNOTES}/${SHOWID}"

# Paths to all files already created or being created here
#RAWFILE="$SHOWDIR/shownotes.txt"
JSONFILE="${SHOWDIR}/shownotes.json"
FMTFILE="${SHOWDIR}/.format"
PICLIST="${SHOWDIR}/.pictures"
STATUS="${SHOWDIR}/.status"
EXTRACT="${SHOWDIR}/${SHOWID}.out"
FULLHTML="${SHOWDIR}/${SHOWID}_full.html"
HTML="${SHOWDIR}/${SHOWID}.html"

# TT² macros and paths for adding pictures
PICTPL="${BASENAME}/pic_definitions.tpl"
MANIFEST="${SHOWDIR}/.pictures.mf"      # From do_pictures
# ------------------------------------------------------------------------------
#
# The partial URL for pictures on the HPR site
#
BASEURL='https://hackerpublicradio.org/eps/'
#{{{ --- Obsolete 2022-12-17 ---
#
# Make a metadata file by parsing the raw data file fields with awk. Save it
# in a temporary file.
#
# See the original do_pandoc_0.1.6 for the extended comments that led to this
# design. We were trying to make acceptable YAML, but ended up extracting
# metadata from the result.
# ----
# TODO: 2022-12-01 Rationalise all of this; it's full of debris from previous
# attempts to solve the problem of passing metadata to Pandoc.
# ----
#
# awk -f - "$RAWFILE" > "$TMP1" <<'ENDAWK'
# BEGIN {print "---"}
# /^Title:/ && got_title == 0 {
# sub(/^Title:\s+/,"")
# printf "#title: %s\n",$0
# gsub(/'/,"''")
# printf "title: '%s'\n",$0
# got_title = 1
# }
# /^Host_Name:/ && got_author == 0 {
# sub(/^Host_Name:\s+/,"")
# printf "#author: %s\n",$0
# gsub(/'/,"''")
# printf "author: '%s'\n",$0
# got_author = 1
# }
# END {print "---"}
# ENDAWK
#}}}
#
# Use 'jq' to parse the JSON and make the metadata (in the form of YAML)
# needed for Pandoc
#
# Non-YAML alternative - not chosen
#jqprog="@text \"author: \(.host.Host_Name)\ntitle: \(.episode.Title)\""
#
# Testing another formatter (Journal 2023-03-03)
#jqprog="@sh \"---\nauthor: \(.host.Host_Name)\ntitle: \(.episode.Title)\n---\""
# Added quotes around the generated strings (2023-03-31)
# jqprog="@text \"---\nauthor: \(.host.Host_Name)\ntitle: \(.episode.Title)\n---\""
# Moved to 'yq' 2023-04-01
# jqprog="@text \"---\nauthor: '\(.host.Host_Name)'\ntitle: '\(.episode.Title)'\n---\""
# jq -r "$jqprog" "$JSONFILE" > "$TMP1"
# On 2023-10-01 wrote a Perl JSON to YAML generator just for these two
# elements. It's called 'author_title.pl'
# yqprog='{author:.host.Host_Name,title:.episode.Title}'
# ( echo "---"; $YQ -y "$yqprog" "$JSONFILE"; echo "---"; ) > "$TMP1"
# Run the JSON to YAML generator: it is given the show's JSON file and the
# temporary file $TMP1, which is later passed to Pandoc as its metadata file.
# The path is quoted so a directory name containing whitespace cannot split
# the command. A failure is reported rather than ignored, but is not fatal
# because the HTML snippet for the database does not use the metadata.
if ! "$J2Y" "$JSONFILE" "$TMP1"; then
    echo "$SCRIPT: ${red}Failed to generate YAML metadata from $JSONFILE${reset}"
    echo "${yellow}The HTML for proof reading may lack author and title${reset}"
fi
_DEBUG "YAML:" "$(cat "$TMP1")"
#
# Check the main output file from do_parse exists
#
if [[ ! -e $EXTRACT ]]; then
    echo "$SCRIPT: ${red}File not found: $EXTRACT${reset}"
    exit 1
fi
#
# Get the format or fail with an error
#
if [[ -e $FMTFILE ]]; then
    FORMAT=$(<"$FMTFILE")
else
    # TODO: Should we default to something rather than abort?
    echo "$SCRIPT: ${red}Could not find declared format (.format file)${reset}"
    echo "${yellow}Has do_parse been run? If so try and fix the .format file.${reset}"
    # This is an error, so do not report success to the caller
    exit 1
fi
#
# Need to match plain text and Markdown variants when deciding to use the
# manifest file and 'tpage'
#
# Matches 'plain_text' and any format keyword starting 'Markdown_' or
# 'markdown_'
FMTRE='^(plain_text|[Mm]arkdown_)'
#
# Determine if there are pictures: assume none, then switch the flag on if
# the picture list file exists
#
hasPictures=0
if [[ -e $PICLIST ]]; then
    hasPictures=1
fi
#
# Here we use the declared format to determine what to do.
#
# This code now blocks HTML->HTML processing since it just confuses matters.
# The files hprNNNN.out and hprNNNN.html are linked to one another, so editing
# the former will edit the latter in preparation for uploading if there are
# changes that need to be made. See the journal discussion for 2018-05-24.
# ---
# TODO: 2022-12-01 If the notes are HTML but declared as something else then
# this check will not work. Trouble is, parse_JSON will have worked this out
# but not saved it so we can't avoid running Pandoc on HTML in this case.
# ---
# TODO: 2022-12-01 Look at resolving this in parse_JSON?
# ---
#
# Reject an empty or unrecognised format before it is used as a hash key.
# Under 'set -o nounset' looking up an unknown key aborts the script with an
# "unbound variable" message, and an empty key is a "bad array subscript";
# neither tells the user that the .format file is at fault.
if [[ -z $FORMAT ]] || [[ -z ${lookup[$FORMAT]+known} ]]; then
    echo "$SCRIPT: ${red}Unrecognised format '$FORMAT' in the .format file${reset}"
    echo "${yellow}Has do_parse been run? If so try and fix the .format file.${reset}"
    exit 1
fi
if [[ $FORMAT == 'html5' ]]; then
    # HTML notes are never run through Pandoc; stop here
    echo "${red}Running Pandoc on HTML is not allowed${reset}"
    echo "${yellow}Run do_edit to edit as necessary and do_browser to view${reset}"
    exit
elif [[ $FORMAT == 'plain_text' ]]; then
    echo "${yellow}Format chosen is plain text${reset}"
    if [[ $hasPictures -eq 0 ]]; then
        echo "${yellow}This will be treated as Markdown${reset}"
    else
        echo "${yellow}This will be treated as Markdown and will need special action${reset}"
        echo "${yellow}since there are pictures${reset}"
    fi
fi
# Translate the declared format into the Pandoc input format, plus the
# extension modifiers used for the standalone HTML (none if not defined)
FROM=${lookup[$FORMAT]}
POPTIONS=${options[$FORMAT]-}
echo "${green}Will process $FORMAT with 'pandoc -f ${FROM}'${reset}"
echo "${yellow}Options chosen for --standalone are '${POPTIONS}'${reset}"
#
# Only for plain text notes (or one of the Markdown variants matched by
# FMTRE), process pictures for the HTML we'll be adding to the database.
#
# We need the following things:
# - The notes to be in plain text format or a Markdown variant
# - Some pictures
# - Files written by do_parse and do_pictures:
# - .assets (not used here)
# - .pictures (were previously used here, but no longer)
# - .pictures.mf (needed by the TT² macros)
# - .pictures.tt2 (the TT² macro calls - already edited into the notes)
# - pic_definitions.tpl (macro definitions, common to all shows)
# - Not to be in dry run mode; if we are we just report intentions
#
# Prepare $TMP3, the input file for Pandoc.
#
# TT² is only used when the notes are plain text (or a Markdown variant) AND
# a picture manifest exists. In dry run mode we only report what would have
# been done and leave $TMP3 alone.
# if [[ $FORMAT == 'plain_text' && -e $MANIFEST ]]; then
if [[ $FORMAT =~ $FMTRE && -e $MANIFEST ]]; then
    if [[ $DRYRUN -ne 0 ]]; then
        echo "${yellow}Would have prepared TT² code for pandoc${reset}"
    else
        #
        # Deal with pictures using the TT² macros
        #
        _DEBUG "Processing TT² inclusions"

        # Make a picture manifest with a header: a header line, then each
        # pair of manifest lines joined as 'first : second'
        awk '
            BEGIN { print "file : thumb" }
            { p1 = $0; getline p2; printf "%s : %s\n", p1, p2 }
        ' "$MANIFEST" > "$TMP2"

        _DEBUG "Picture list:" "$(cat "$TMP2")" "---" \
            "BASEURL=${BASEURL}${SHOWID}/" \
            "EXTRACT=$EXTRACT" \
            "Extract file contents:" "$(cat "$EXTRACT")" "---"

        # Run the macros on the notes to make $TMP3 for Pandoc
        tpage --pre_process="$PICTPL" \
            --define "piclist=$TMP2" \
            --define "prefix=${BASEURL}${SHOWID}/${SHOWID}/" \
            "$EXTRACT" > "$TMP3"

        _DEBUG "Processed by tpage" "$(cat "$TMP3")" "---"
    fi
elif [[ $DRYRUN -eq 0 ]]; then
    # No TT² processing is wanted (wrong format, or no manifest), so put the
    # notes unchanged in $TMP3 where Pandoc will look for them
    cat "$EXTRACT" > "$TMP3"
fi
#
# Generate an HTML snippet for adding to the database.
# (Note 2021-11-24: Added --ascii option.)
#
if [[ $DRYRUN -ne 0 ]]; then
    echo "${yellow}Would have run pandoc to make HTML for upload${reset}"
else
    # Arguments for the HTML snippet run: the 'smart' extension is turned
    # off, output is ASCII-only HTML5 with no highlighting and no comments
    snippet_args=(
        -f "$FROM"-smart
        -t html5
        --ascii
        --no-highlight
        --strip-comments
        "$TMP3"
        -o "$HTML"      # $EXTRAS
    )
    pandoc "${snippet_args[@]}"
    RES=$?

    # A failure here is fatal: there is no HTML to upload
    if [[ $RES -ne 0 ]]; then
        echo "$SCRIPT: ${red}Oops! Something went wrong! (line $LINENO)${reset}"
        echo "${yellow}$SCRIPT: Aborting now${reset}"
        exit 1
    fi
    echo "$SCRIPT: ${green}Created shownotes/$SHOWID/${HTML##*/}${reset}"
fi
#
# Make HTML for proof reading. All pictures referenced are now on the HPR
# server (we ran 'do_asset_upload'), so we want to refer to them here.
#
# File $TMP2 contains the .pictures.mf contents with a header line; and it
# contains data for the macros. It was created when we prepared the main HTML
# for the database. We use $BASEURL again here because we want to reference
# the pictures on the server.
#
# We use the YAML metadata file in $TMP1 from earlier to do this (once made
# with awk, then yq, and now by 'author_title.pl'). At the end TMP3 contains
# Markdown for Pandoc.
#
if [[ $DRYRUN -ne 0 ]]; then
    # Dry run
    echo "${yellow}Would have run pandoc to make HTML for proof reading${reset}"
else
    # Rebuild $TMP3: through the TT² macros when the notes are plain text
    # (or a Markdown variant) and there is a manifest, otherwise a plain copy
    # if [[ $FORMAT == 'plain_text' && -e $MANIFEST ]]; then
    if [[ $FORMAT =~ $FMTRE && -e $MANIFEST ]]; then
        tpage --pre_process="$PICTPL" \
            --define "piclist=$TMP2" \
            --define "prefix=${BASEURL}${SHOWID}/${SHOWID}/" \
            "$EXTRACT" > "$TMP3"
    else
        cat "$EXTRACT" > "$TMP3"
    fi

    #
    # Generate complete HTML that we can proofread. We need metadata for this
    # stand-alone HTML which is in the form of YAML in this version.
    #
    # ----------------------------------------------------------------------
    # Original options below when using 'awk' to parse shownotes.txt:
    # --metadata="$(sed -n '/^#author:/{s/#//;p}' "$TMP1")" \
    # --metadata="$(sed -n '/^#title:/{s/#//;p}' "$TMP1")" \
    #
    # The input format is deliberately left unquoted, as in the original
    # command line
    # shellcheck disable=SC2086,SC2206
    full_args=(
        -f ${FROM}${POPTIONS}
        -t html5
        --ascii
        --standalone
        --template=hpr.html5
        --no-highlight
        -c https://hackerpublicradio.org/css/hpr.css
        --metadata-file="$TMP1"
        -o "$FULLHTML"
        "$TMP3"
    )
    pandoc "${full_args[@]}"
    RES=$?

    # Unlike the snippet run, a failure here is reported but is not fatal
    case $RES in
        0)
            echo "$SCRIPT: ${green}Created shownotes/$SHOWID/${FULLHTML##*/}${reset}"
            ;;
        *)
            echo "$SCRIPT: ${red}Oops! Something went wrong making the full HTML! (line $LINENO)${reset}"
            ;;
    esac
fi
#
# Set the status for this show
#
# Unless this is a dry run, append the word 'converted' to the status file.
# The script then ends with the status of the last command: 0 for a dry run,
# otherwise the result of the append.
[[ $DRYRUN -ne 0 ]] || echo "converted" >> "$STATUS"
exit
# vim: syntax=sh:ts=8:sw=4:ai:et:tw=78:fo=tcrqn21:fdm=marker