1
0
forked from HPR/hpr-tools

New 'reformat_html', plus some cleaning

InternetArchive/future_upload: now updates the state of shows

InternetArchive/reformat_html: new Perl script to reformat the HTML
    originally found in the HPR database in the 'notes' field to the format
    required in the 'description' field of an item on the IA. It reads
    from STDIN and writes to STDOUT.
This commit is contained in:
Dave Morriss
2025-02-13 11:24:27 +00:00
parent 4feae03fee
commit 0f1e727487
4 changed files with 334 additions and 81 deletions

View File

@@ -1,9 +1,10 @@
#!/bin/bash -
# shellcheck disable=SC2317
#===============================================================================
#
# FILE: future_upload
#
# USAGE: ./future_upload
# USAGE: ./future_upload [-h] [-v] [-D] [-d {0|1}] [-F] [-r] [-l cp]
#
# DESCRIPTION: Uploads future HPR shows based on what is in the upload area
#
@@ -13,9 +14,9 @@
# NOTES: Contains methods from 'delete_uploaded' and 'weekly_upload' as
# well as 'update_state'
# AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
# VERSION: 0.0.16
# VERSION: 0.0.17
# CREATED: 2021-01-07 12:11:02
# REVISION: 2025-01-01 11:48:40
# REVISION: 2025-01-06 17:51:57
#
#===============================================================================
@@ -26,7 +27,7 @@ SCRIPT=${0##*/}
STDOUT="/dev/fd/2"
VERSION="0.0.16"
VERSION="0.0.17"
#
# Load library functions
@@ -36,7 +37,7 @@ LIB="$HOME/bin/function_lib.sh"
# shellcheck disable=SC1090
source "$LIB"
# {{{ -- Functions -- check_uploads, _log, _usage
# {{{ -- Functions -- check_uploads, update_show_state, _log, _usage
#=== FUNCTION ================================================================
# NAME: check_uploads
@@ -72,6 +73,36 @@ check_uploads () {
return 0
}
#=== FUNCTION ================================================================
# NAME: update_show_state
# DESCRIPTION: Updates the status of a single show in the HPR database.
# It is assumed the caller has found the show number in the
# 'reservations' table with the required status of
# 'MEDIA_TRANSCODED'. All this function does is to change this
# to 'UPLOADED_TO_IA' by requesting the status URL with 'curl',
# returning true if 'curl' succeeded, otherwise false.
# The command is held in an array and expanded quoted so that
# the URL (which contains '?' and '&') always reaches 'curl' as
# one word, untouched by word splitting or filename expansion.
# PARAMETERS: $show Show number to update
# RETURNS: True (0) if the update worked, otherwise false (1)
# NOTE(review): 'curl' is run without '--fail', so an HTTP error
# response is presumably still reported as success here;
# confirm how the status page signals a failed update.
#===============================================================================
update_show_state () {
    local show=${1:?Usage: update_show_state show}
    local url="https://hub.hackerpublicradio.org/cms/status.php"
    local -a cmd

    #
    # Same 'curl' options as used elsewhere in the script; the configuration
    # file is looked up relative to the current directory.
    #
    cmd=(curl -K ./.hpradmin_curlrc -s
        "${url}?ep_num=${show}&status=UPLOADED_TO_IA")

    #
    # Any failure from 'curl' is reported to the caller as plain 'false'
    #
    "${cmd[@]}" || return 1

    return 0
}
#=== FUNCTION ================================================================
# NAME: _log
# DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script
@@ -83,7 +114,7 @@ check_uploads () {
# PARAMETERS: 1 - the message to write
# RETURNS: Nothing
#===============================================================================
# shellcheck disable=SC2317 disable=SC2059
# shellcheck disable=SC2059
_log () {
local msg="$1"
@@ -180,7 +211,7 @@ BASECOM='curl -K ./.hpradmin_curlrc -s'
URL="https://hub.hackerpublicradio.org/cms/status.php"
# QUERY1="${BASECOM} ${URL}"
QUERY2="${BASECOM} -o - ${URL}"
UPSTATE="$BASEDIR/update_state"
# UPSTATE="$BASEDIR/update_state"
#
# Fallback URL
@@ -199,10 +230,10 @@ ia=$(command -v ia)
echo "Needs the 'make_metadata' script"
exit 1
}
[ -e "$UPSTATE" ] || {
echo "Needs the 'update_state' script"
exit 1
}
# [ -e "$UPSTATE" ] || {
# echo "Needs the 'update_state' script"
# exit 1
# }
#
# File of processed shows
@@ -234,6 +265,9 @@ do
done
shift $((OPTIND - 1))
#
# Check and set option variables
#
DRYRUN=${DRYRUN:-1}
if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then
echo "** Use '-d 0' or '-d 1'"
@@ -272,6 +306,7 @@ fi
#
# Declarations
# ------------
#
declare -A processed
declare -A ready
@@ -282,6 +317,7 @@ lastitem=
#
# Load array of processed shows
# ---- ----- -- --------- -----
#
while read -r item; do
processed+=([$item]=1)
@@ -289,46 +325,17 @@ done < "$PROCFILE"
[ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}"
#
# TODO: Create the associative array 'ready' containing the numbers of shows
# ready for upload. This is a way to ensure that we don't try and upload shows
# in transit to the upload area.
# Populate the associative array 'ready' with the numbers of shows ready for
# upload. This is a way to ensure that we don't try and upload shows in
# transit to the upload area. Only do this if force mode is off.
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Proposed code. Not sure what the actual URL will be nor what will be
# returned if nothing is ready for upload yet
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# json=$(curl http://hackerpublicradio.org/queue.php -s -o -)
# while read -r showno; do
# ready+=([$showno]=1)
# done < <(echo "${json}" | jq '.READY_FOR_IA_UPLOAD[] | tonumber')
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Change of plan. Now we have a list of CSV values, so we need to do something
# like this:
#
# reservations=$($BASECOM -o - $URL)
# while read -r line; do
# if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then
# state="${BASH_REMATCH[5]}"
# show="${BASH_REMATCH[2]}"
# fi
# if [[ $state = 'MEDIA_TRANSCODED' ]]; then
# ready+=([$show]=1)
# fi
# done <<< $reservations
#
# At the end of this the associative array 'ready' will contain the keys of
# shows that are ready for upload (presumably) so we can look in this array to
# double check.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if [[ $FORCE -eq 0 ]]; then
#
# Collect the current table of shows requiring work. We expect something like:
# timestamp_epoc,ep_num,ep_date,key,status,email
# 1651286617,3617,2022-06-14,fda088e0e3bd5d0353ea6b7569e93b87626ca25976a0a,UPLOADED_TO_IA,lurkingprion@gmail.com
# 1651648589,3619,2022-06-16,e7d3810afa098863d81663418d8640276272284de68f1,UPLOADED_TO_IA,monochromec@gmail.com
# TODO: Check for a failure in the query?A
# TODO: Reinstate the check for a failure in the query? See update_state
# NOTE: Problem encountered 2022-09-23 because the SSL certificate has expired
#
reservations=$($QUERY2) || {
@@ -353,8 +360,8 @@ if [[ $FORCE -eq 0 ]]; then
fi
#
# The query returns the bare number, but we're using 'hprxxxx' as the key in
# the 'ready' array.
# The query returns the bare show number, but we're using 'hprxxxx' as the
# key in the 'ready' array.
#
while read -r line; do
if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then
@@ -374,7 +381,10 @@ fi
#
# Process files. There will be several with the same prefix so look for
# a change of prefix
# a change of prefix.
#
# The loop is reading from the following pipeline:
# find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort
#
while read -r path; do
#
@@ -390,8 +400,8 @@ while read -r path; do
_DEBUG "Item $item"
#
# Detect that the item prefix has changed. If it has we're processing
# a new IA identifier, so work on this one
# Detect that the item prefix has changed. If it has we've found a new IA
# identifier, so work on the previous one
#
if [[ $item != "$lastitem" ]]; then
lastitem=$item
@@ -425,7 +435,8 @@ while read -r path; do
processed+=([$lastitem]=1)
else
#
# Is the show ready for upload?
# Is the show ready for upload? We don't check if force mode
# is on. If not ready we skip this show.
#
if [[ $FORCE -eq 0 ]]; then
if [[ ! -v "ready[$lastitem]" ]]; then
@@ -472,10 +483,9 @@ while read -r path; do
done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort)
#
#-------------------------------------------------------------------------------
# Write the processed array to the cache file unless in dry-run mode
#
# [ $DEBUG -eq 1 ] && { echo -n 'D> '; declare -p processed; }
#-------------------------------------------------------------------------------
_DEBUG "processed = ${!processed[*]}"
[ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}"
if [[ $DRYRUN -ne 1 ]]; then
@@ -484,24 +494,26 @@ if [[ $DRYRUN -ne 1 ]]; then
done < <(printf '%s\n' "${!processed[@]}" | sort -u ) > "$PROCFILE"
fi
#
#-------------------------------------------------------------------------------
# Generate the list of uploads for the 'make_metadata' option '-list=1,2,3'.
# The show numbers are keys in the associative array 'uploads'. The
# end-product is a comma-separated list of the keys in the variable '$list'.
# Order is unimportant because make_metadata sorts internally.
#
#-------------------------------------------------------------------------------
_DEBUG "uploads = ${!uploads[*]}"
[ "$VERBOSE" -eq 1 ] && echo "Number of shows for upload: ${#uploads[@]}"
printf -v list '%s,' "${!uploads[@]}"
list="${list:0:-1}"
#
#-------------------------------------------------------------------------------
# If there are no uploads to do we can stop
#
#-------------------------------------------------------------------------------
[[ ! -v uploads[@] ]] && { echo "Nothing to do!"; exit; }
#
#-------------------------------------------------------------------------------
# Check that the shows being uploaded have all their files and log what is
# happening.
#
#-------------------------------------------------------------------------------
while read -r show; do
echo "$(date +%Y%m%d%H%M%S) preparing to upload hpr$show" >> "$LOGFILE"
@@ -512,10 +524,10 @@ while read -r show; do
fi
done < <(printf '%s\n' "${!uploads[@]}" | sort)
#
#-------------------------------------------------------------------------------
# Define output files. If the list contains one element then it's a different
# name from the multi-element case (make_metadata does this too).
#
#-------------------------------------------------------------------------------
if [[ ${#uploads[@]} -eq 1 ]]; then
metadata="metadata_${minshow}.csv"
script="script_${minshow}.sh"
@@ -524,9 +536,9 @@ else
script="script_${minshow}-${maxshow}.sh"
fi
#
#-------------------------------------------------------------------------------
# Perform the uploads or report what would be done
#
#-------------------------------------------------------------------------------
if [[ $DRYRUN -eq 1 ]]; then
echo "Dry run: Would have uploaded list '$list'"
echo "Dry run: Would have created $metadata and $script"
@@ -573,17 +585,17 @@ else
echo "$(date +%Y%m%d%H%M%S) ${#uploads[@]} uploads completed" >> "$LOGFILE"
#
# Update the state in the HPR database, unless we're using
# FORCE. Pass the limit used here to this script so it can
# stop looking for work unnecessarily
# Update the state of all the shows being processed in the
# HPR database, unless we're using FORCE.
#
if [[ $FORCE -eq 0 ]]; then
$UPSTATE -l$LIMIT
RES=$?
if [[ $RES -ne 0 ]]; then
echo "Problem updating database state"
exit 1
fi
#
# Walk the show numbers (keys of 'uploads') in sorted order and mark
# each one as uploaded. A failure is reported but does not stop the
# remaining shows from being updated.
#
while read -r show; do
    # Quoted so the show number is always passed as exactly one argument
    if update_show_state "$show"; then
        echo "Updated state for show $show"
    else
        echo "Failed to update state for show $show"
    fi
done < <(printf '%s\n' "${!uploads[@]}" | sort)
else
echo "Not updating the database, FORCE mode is on"
fi