New 'reformat_html', plus some cleaning

InternetArchive/future_upload: now updates the state of shows. InternetArchive/reformat_html: new Perl script to reformat the HTML originally found in the HPR database in the 'notes' field to the format required in the 'description' field of an item on the IA. It reads from STDIN and writes to STDOUT.
2025-02-13 11:24:27 +00:00 · 2025-02-13 11:24:27 +00:00 · 0f1e727487
commit 0f1e727487
parent 4feae03fee
4 changed files with 334 additions and 81 deletions
--- a/InternetArchive/future_upload
+++ b/InternetArchive/future_upload
@ -1,9 +1,10 @@
 #!/bin/bash -
+# shellcheck disable=SC2317
 #===============================================================================
 #
 #         FILE: future_upload
 #
-#        USAGE: ./future_upload
+#        USAGE: ./future_upload [-h] [-v] [-D] [-d {0|1}] [-F] [-r] [-l cp]
 #
 #  DESCRIPTION: Uploads future HPR shows based on what is in the upload area
 #
@ -13,9 +14,9 @@
 #        NOTES: Contains methods from 'delete_uploaded' and 'weekly_upload' as
 #               well as 'update_state'
 #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
-#      VERSION: 0.0.16
+#      VERSION: 0.0.17
 #      CREATED: 2021-01-07 12:11:02
-#     REVISION: 2025-01-01 11:48:40
+#     REVISION: 2025-01-06 17:51:57
 #
 #===============================================================================

@ -26,7 +27,7 @@ SCRIPT=${0##*/}

 STDOUT="/dev/fd/2"

-VERSION="0.0.16"
+VERSION="0.0.17"

 #
 # Load library functions
@ -36,7 +37,7 @@ LIB="$HOME/bin/function_lib.sh"
 # shellcheck disable=SC1090
 source "$LIB"

-# {{{ -- Functions -- check_uploads, _log, _usage
+# {{{ -- Functions -- check_uploads, update_show_state, _log, _usage

 #===  FUNCTION  ================================================================
 #         NAME: check_uploads
@ -72,6 +73,36 @@ check_uploads () {
    return 0
 }

+#===  FUNCTION  ================================================================
+#         NAME: update_show_state
+#  DESCRIPTION: Updates the status of a single show in the HPR database.
+#               It is assumed the caller has found the show number in the
+#               'reservations' table with the required status of
+#               'MEDIA_TRANSCODED'. All this function does is to change this
+#               to 'UPLOADED_TO_IA', returning true if successful, otherwise
+#               false.
+#   PARAMETERS: $show           Show number to update
+#      RETURNS: True if the update worked, otherwise false
+#===============================================================================
+update_show_state () {
+    local show=${1:?Usage: update_state show}
+    local BASECOM URL QUERY COMMAND RES
+
+    BASECOM='curl -K ./.hpradmin_curlrc -s'
+    URL="https://hub.hackerpublicradio.org/cms/status.php"
+    QUERY="${BASECOM} ${URL}"
+
+    COMMAND="${QUERY}?ep_num=${show}&status=UPLOADED_TO_IA"
+
+    $COMMAND
+    RES=$?
+    if [[ $RES -ne 0 ]]; then
+        return 1
+    fi
+
+    return 0
+}
+
 #===  FUNCTION  ================================================================
 #         NAME: _log
 #  DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script
@ -83,7 +114,7 @@ check_uploads () {
 #   PARAMETERS: 1 - the message to write
 #      RETURNS: Nothing
 #===============================================================================
-# shellcheck disable=SC2317 disable=SC2059
+# shellcheck disable=SC2059
 _log () {
    local msg="$1"

@ -180,7 +211,7 @@ BASECOM='curl -K ./.hpradmin_curlrc -s'
 URL="https://hub.hackerpublicradio.org/cms/status.php"
 # QUERY1="${BASECOM} ${URL}"
 QUERY2="${BASECOM} -o - ${URL}"
-UPSTATE="$BASEDIR/update_state"
+# UPSTATE="$BASEDIR/update_state"

 #
 # Fallback URL
@ -199,10 +230,10 @@ ia=$(command -v ia)
    echo "Needs the 'make_metadata' script"
    exit 1
 }
-[ -e "$UPSTATE" ] || {
-    echo "Needs the 'update_state' script"
-    exit 1
-}
+# [ -e "$UPSTATE" ] || {
+#     echo "Needs the 'update_state' script"
+#     exit 1
+# }

 #
 # File of processed shows
@ -234,6 +265,9 @@ do
 done
 shift $((OPTIND - 1))

+#
+# Check and set option variables
+#
 DRYRUN=${DRYRUN:-1}
 if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then
    echo "** Use '-d 0' or '-d 1'"
@ -272,6 +306,7 @@ fi

 #
 # Declarations
+# ------------
 #
 declare -A processed
 declare -A ready
@ -282,6 +317,7 @@ lastitem=

 #
 # Load array of processed shows
+# ---- ----- -- --------- -----
 #
 while read -r item; do
    processed+=([$item]=1)
@ -289,46 +325,17 @@ done < "$PROCFILE"
 [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}"

 #
-# TODO: Create the associative array 'ready' containing the numbers of shows
-# ready for upload. This is a way to ensure that we don't try and upload shows
-# in transit to the upload area.
+# Populate the associative array 'ready' with the numbers of shows ready for
+# upload. This is a way to ensure that we don't try and upload shows in
+# transit to the upload area. Only do this if force mode is off.
 #
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Proposed code. Not sure what the actual URL will be nor what will be
-# returned if nothing is ready for upload yet
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# json=$(curl http://hackerpublicradio.org/queue.php -s -o -)
-# while read -r showno; do
-#     ready+=([$showno]=1)
-# done < <(echo "${json}" | jq '.READY_FOR_IA_UPLOAD[] | tonumber')
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Change of plan. Now we have a list of CSV values, so we need to do something
-# like this:
-#
-# reservations=$($BASECOM -o - $URL)
-# while read -r line; do
-#    if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then
-#       state="${BASH_REMATCH[5]}"
-#       show="${BASH_REMATCH[2]}"
-#    fi
-#    if [[ $state = 'MEDIA_TRANSCODED' ]]; then
-#       ready+=([$show]=1)
-#    fi
-# done <<< $reservations
-#
-# At the end of this the associative array 'ready' will contain the keys of
-# shows that are ready for upload (presumably) so we can look in this array to
-# double check.
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 if [[ $FORCE -eq 0 ]]; then
    #
    # Collect the current table of shows requiring work. We expect something like:
    # timestamp_epoc,ep_num,ep_date,key,status,email
    # 1651286617,3617,2022-06-14,fda088e0e3bd5d0353ea6b7569e93b87626ca25976a0a,UPLOADED_TO_IA,lurkingprion@gmail.com
    # 1651648589,3619,2022-06-16,e7d3810afa098863d81663418d8640276272284de68f1,UPLOADED_TO_IA,monochromec@gmail.com
-    # TODO: Check for a failure in the query?A
+    # TODO: Reinstate the check for a failure in the query? See update_state
    # NOTE: Problem encountered 2022-09-23 because the SSL certificate has expired
    #
    reservations=$($QUERY2) || {
@ -353,8 +360,8 @@ if [[ $FORCE -eq 0 ]]; then
    fi

    #
-    # The query returns the bare number, but we're using 'hprxxxx' as the key in
-    # the 'ready' array.
+    # The query returns the bare show number, but we're using 'hprxxxx' as the
+    # key in the 'ready' array.
    #
    while read -r line; do
        if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then
@ -374,7 +381,10 @@ fi

 #
 # Process files. There will be several with the same prefix so look for
-# a change of prefix
+# a change of prefix.
+#
+# The loop is reading from the following pipeline:
+# find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort
 #
 while read -r path; do
    #
@ -390,8 +400,8 @@ while read -r path; do
    _DEBUG "Item $item"

    #
-    # Detect that the item prefix has changed. If it has we're processing
-    # a new IA identifier, so work on this one
+    # Detect that the item prefix has changed. If it has we've found a new IA
+    # identifier, so work on the previous one
    #
    if [[ $item != "$lastitem" ]]; then
        lastitem=$item
@ -425,7 +435,8 @@ while read -r path; do
                processed+=([$lastitem]=1)
            else
                #
-                # Is the show ready for upload?
+                # Is the show ready for upload? We don't check if force mode
+                # is on. If not ready we skip this show.
                #
                if [[ $FORCE -eq 0 ]]; then
                    if [[ ! -v "ready[$lastitem]" ]]; then
@ -472,10 +483,9 @@ while read -r path; do

 done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort)

-#
+#-------------------------------------------------------------------------------
 # Write the processed array to the cache file unless in dry-run mode
-#
-# [ $DEBUG -eq 1 ] && { echo -n 'D> '; declare -p processed; }
+#-------------------------------------------------------------------------------
 _DEBUG "processed = ${!processed[*]}"
 [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}"
 if [[ $DRYRUN -ne 1 ]]; then
@ -484,24 +494,26 @@ if [[ $DRYRUN -ne 1 ]]; then
    done < <(printf '%s\n' "${!processed[@]}" | sort -u ) > "$PROCFILE"
 fi

-#
+#-------------------------------------------------------------------------------
 # Generate the list of uploads for the 'make_metadata' option '-list=1,2,3'.
+# The show numbers are keys in the associative array 'uploads'. The
+# end-product is a comma-separated list of the keys in the variable '$list'.
 # Order is unimportant because make_metadata sorts internally.
-#
+#-------------------------------------------------------------------------------
 _DEBUG "uploads = ${!uploads[*]}"
 [ "$VERBOSE" -eq 1 ] && echo "Number of shows for upload: ${#uploads[@]}"
 printf -v list '%s,' "${!uploads[@]}"
 list="${list:0:-1}"

-#
+#-------------------------------------------------------------------------------
 # If there are no uploads to do we can stop
-#
+#-------------------------------------------------------------------------------
 [[ ! -v uploads[@] ]] && { echo "Nothing to do!"; exit; }

-#
+#-------------------------------------------------------------------------------
 # Check that the shows being uploaded have all their files and log what is
 # happening.
-#
+#-------------------------------------------------------------------------------
 while read -r show; do
    echo "$(date +%Y%m%d%H%M%S) preparing to upload hpr$show" >> "$LOGFILE"

@ -512,10 +524,10 @@ while read -r show; do
    fi
 done < <(printf '%s\n' "${!uploads[@]}" | sort)

-#
+#-------------------------------------------------------------------------------
 # Define output files. If the list contains one element then it's a different
 # name from the multi-element case (make_metadata does this too).
-#
+#-------------------------------------------------------------------------------
 if [[ ${#uploads[@]} -eq 1 ]]; then
    metadata="metadata_${minshow}.csv"
    script="script_${minshow}.sh"
@ -524,9 +536,9 @@ else
    script="script_${minshow}-${maxshow}.sh"
 fi

-#
+#-------------------------------------------------------------------------------
 # Perform the uploads or report what would be done
-#
+#-------------------------------------------------------------------------------
 if [[ $DRYRUN -eq 1 ]]; then
    echo "Dry run: Would have uploaded list '$list'"
    echo "Dry run: Would have created $metadata and $script"
@ -573,17 +585,17 @@ else
                echo "$(date +%Y%m%d%H%M%S) ${#uploads[@]} uploads completed" >> "$LOGFILE"

                #
-                # Update the state in the HPR database, unless we're using
-                # FORCE. Pass the limit used here to this script so it can
-                # stop looking for work unnecessarily
+                # Update the state of all the shows being processed in the
+                # HPR database, unless we're using FORCE.
                #
                if [[ $FORCE -eq 0 ]]; then
-                    $UPSTATE -l$LIMIT
-                    RES=$?
-                    if [[ $RES -ne 0 ]]; then
-                        echo "Problem updating database state"
-                        exit 1
+                    while read -r show; do
+                        if update_show_state "$show"; then
+                            echo "Updated state for show $show"
+                        else
+                            echo "Failed to update state for show $show"
                        fi
+                    done < <(printf '%s\n' "${!uploads[@]}" | sort)
                else
                    echo "Not updating the database, FORCE mode is on"
                fi
--- a/InternetArchive/reformat_html
+++ b/InternetArchive/reformat_html
@ -0,0 +1,245 @@
+#!/usr/bin/env perl
+#===============================================================================
+#
+#         FILE: reformat_html
+#
+#        USAGE: ./reformat_html < input.html > output.html
+#
+#  DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes'
+#               field to the format required in the 'description' field of an
+#               item on the IA. It reads from STDIN and writes to STDOUT.
+#
+#      OPTIONS: ---
+# REQUIREMENTS: ---
+#         BUGS: ---
+#        NOTES: ---
+#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
+#      VERSION: 0.0.1
+#      CREATED: 2025-02-09 22:56:30
+#     REVISION: 2025-02-13 11:13:37
+#
+#===============================================================================
+
+use v5.36;
+use strict;
+use warnings;
+use feature qw{ say try };
+no warnings qw{ experimental::try };
+
+use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8
+
+use HTML::TreeBuilder 5 -weak;
+use HTML::Entities;
+
+#
+# Version number (Incremented by Vim)
+#
+our $VERSION = '0.0.1';
+
+#
+# Declarations
+#
+# NOTE(review): $verbose is declared here but is not referenced anywhere else
+# in this script.
+my ($verbose, @notes, $notes, $tree);
+
+#
+# Read the input data into an array (one element per input line)
+#
+try {
+    @notes = <STDIN>;
+}
+catch ($e) {
+    warn "Problem reading input HTML; $e";
+    exit 1;
+}
+
+die "No input HTML detected\n" unless @notes;
+
+#
+# Turn the array into a scalar
+#
+$notes = join( '', @notes );
+
+#
+# Get ready to parse the notes (now held in the scalar $notes)
+#
+$tree = HTML::TreeBuilder->new;
+$tree->ignore_unknown(0);
+$tree->no_expand_entities(1);
+$tree->p_strict(1);
+$tree->store_comments(1);               # Necessary?
+$tree->warn(1);
+
+#
+# Parse HTML to the tree structure
+#
+# NOTE(review): $! holds the last operating system error, which may not
+# describe a parser failure - confirm it is useful in this message.
+#
+$tree->parse_content($notes)
+    or die "HTML::TreeBuilder failed to parse input HTML: $!\n";
+
+#
+# Flatten all <pre> tags and add <br/> tags. The result is the HTML of the
+# contents of the <body> element as a string.
+#
+$notes = flatten_pre($tree);
+
+#
+# Deal with non-ASCII. The second argument is a negated character class, so
+# everything is encoded as an entity except newline, '&' and the printable
+# ASCII characters.
+#
+$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
+
+#
+# Remove all newlines. Line breaks inside <pre> sections have already been
+# turned into <br/> tags by 'flatten_pre'.
+#
+$notes =~ s/\n//g;
+
+#
+# Write the end result to the STDOUT
+#
+say $notes;
+
+exit;
+
+#===  FUNCTION  ================================================================
+#         NAME: flatten_pre
+#      PURPOSE: Process notes "flattening" <pre> contents
+#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
+#                       partially processed notes
+#      RETURNS: Processed notes
+#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
+#               contain some formatting tags like <em> and <code>, but spaces
+#               and newlines are significant. The Internet Archive upload API
+#               uses HTTP headers which are text strings without newlines, so
+#               when these tags are uploaded through this route some
+#               formatting is lost. What this routine does is parse the
+#               contents of all <pre> sections in $notes, adding <br/> tags
+#               to replace newlines. It has to perform a full parse
+#               since the contents may include HTML tags and these need to be
+#               passed through intact. It calls the subroutine 'flatten_item' to
+#               deal with the recursive nature of HTML tags.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub flatten_pre {
+    my ($tree) = @_;
+
+    #
+    # Find all the <pre> tags
+    #
+    my @pre_tags = $tree->look_down( _tag => 'pre', );
+
+    #
+    # Walk the various <pre> elements in the document
+    #
+    foreach my $tag (@pre_tags) {
+        #
+        # Save the tag and empty the original
+        #
+        my $saved = $tag->clone();
+        $tag->delete_content();
+
+        #
+        # Walk the saved content and rebuild the tag into @rebuilt using the
+        # nested arrayref structure permitted by HTML::Element for
+        # convenience (the alternative is a little nasty). See the
+        # documentation for 'new_from_lol' in HTML::Element.
+        #
+        # Only the very first item of the <pre> is flagged as being at the
+        # start, so that a leading blank line is removed there and nowhere
+        # else.
+        #
+        my @rebuilt;
+        my $at_start = 1;
+        foreach my $item ( @{ $saved->content_array_ref } ) {
+            push( @rebuilt, flatten_item( $item, $at_start ) );
+            $at_start = 0;
+        }
+
+        #
+        # Rebuild the tag from the list we built. The pieces are passed as
+        # a list rather than as a single arrayref because otherwise the top
+        # level is interpreted as a spurious <null> tag.
+        #
+        $tag->push_content(@rebuilt);
+    }
+
+    #
+    # Return only what is inside the <body> element, trimming off the
+    # enclosing <body> tags themselves (presumably added implicitly by
+    # HTML::TreeBuilder, since the notes are an HTML fragment).
+    #
+    my $body = $tree->look_down( _tag => 'body' );
+    ( my $result = $body->as_HTML( undef, ' ', {} ) )
+        =~ s{(^<body[^>]*>|</body>$)}{}gi;
+
+    return $result;
+
+}
+
+#===  FUNCTION  ================================================================
+#         NAME: flatten_item
+#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
+#   PARAMETERS: $item       an HTML::Element item parsed from the original
+#                           <pre> section; either an element object or
+#                           a plain text string
+#               $at_start   (optional) true if no other content of the <pre>
+#                           comes before this item. Only then is a leading
+#                           blank line removed from text. Defaults to false.
+#      RETURNS: An arrayref if the item was a tag, otherwise a list of text
+#               strings separated by ['br'] arrayrefs. Returns nothing if
+#               $item is undefined.
+#  DESCRIPTION: Since <pre> sections can contain inline elements which change
+#               the rendering of the text we need to parse these as we add
+#               <br/> tags. This routine does this by recursively descending
+#               through the contents. A common tag sequence is <pre><code> for
+#               scripts and the like. This routine deals with such sequences.
+#               It expects to receive the contents in sequence and builds the
+#               result as a nested arrayref structure.
+#               Newlines and white space which follow an inline element are
+#               significant and are kept; only a blank line directly after
+#               the opening <pre> (or the tags nested immediately inside it)
+#               is dropped.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub flatten_item {
+    my ( $item, $at_start ) = @_;
+
+    return unless defined($item);
+
+    my ( @result, %attr );
+
+    #
+    # Is it a sub-tag or non-tag content?
+    #
+    if ( ref($item) ) {
+        #
+        # It's a tag. Save the tag name and any attributes and recurse into
+        # it. Return an arrayref. Only the first child can still be at the
+        # start of the <pre>, and then only if this tag was.
+        #
+        push( @result, $item->tag() );
+        %attr = $item->all_external_attr();
+        push( @result, \%attr ) if %attr;
+        for my $child ( $item->content_list() ) {
+            push( @result, flatten_item( $child, $at_start ) );
+            $at_start = 0;
+        }
+        return \@result;
+    }
+    else {
+        #
+        # It's non-tag content. Join the lines with <br/> tags.  Return an
+        # array (since this is a simple list).
+        #
+        # Note that we split with a LIMIT of -1 which causes any trailing list
+        # items to be returned; default behaviour is to drop them.
+        #
+        $item =~ s/\r//g;
+        my @content = split( /\n/, $item, -1 );
+        if (@content) {
+            #
+            # Remove a leading blank line - usually the result of
+            # a "<pre>'NL'text" sequence. This is only done at the very start
+            # of the <pre>, and only when a newline really follows the blank
+            # text. Elsewhere the newline is a real line break (for example
+            # between two <span> elements) and must become a <br/>, and
+            # white space on its own between inline tags must be kept.
+            #
+            shift(@content)
+                if ( $at_start && @content > 1 && $content[0] =~ /^\s*$/ );
+
+            #
+            # Join back the lines with <br/> tags between them.
+            #
+            foreach my $txt (@content) {
+                push( @result, $txt, ['br'] );
+            }
+
+            #
+            # Remove the <br/> at the end, it's spurious
+            #
+            pop(@result);
+        }
+
+        return (@result);
+    }
+
+}
+
+# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
+
--- a/InternetArchive/tidy_uploaded
+++ b/InternetArchive/tidy_uploaded
@ -6,7 +6,7 @@
 #        USAGE: ./tidy_uploaded [-h] [-v] [-d {0|1}] [-c COUNT]
 #
 #  DESCRIPTION: Relocates HPR audio and other show-related files on 'borg'
-#               after their shows have been uploaded to the Internet Archive
+#               after their shows have been uploaded to the Internet Archive.
 #
 #      OPTIONS: ---
 # REQUIREMENTS: ---
@ -43,7 +43,7 @@ TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1;
 trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT

 #
-# Configure depending whether local or on the VPS
+# Configure depending whether local or on 'borg'
 #
 case $HOSTNAME in
    borg)       BASEDIR="$HOME/InternetArchive"
@ -95,7 +95,7 @@ queued_tasks () {
 #         NAME: movefile
 #  DESCRIPTION: Moves a file to a new place, catering for any directories in
 #               the path
-#   PARAMETERS: $1      directory to move form
+#   PARAMETERS: $1      directory to move from
 #               $2      directory to move to
 #               $3      file (or sub-path to move)
 #      RETURNS: True if a move was done, otherwise False
@ -356,7 +356,7 @@ while read -r path; do
        #
        tasks=$(queued_tasks "$item")
        if [[ $tasks -gt 0 ]]; then
-            echo "** Item $item still has $tasks unfinished " \
+            echo "** Item $item still has $tasks unfinished" \
                "$(ngettext task tasks "$tasks")"
            echo "** Skipping to the next item"
            continue
@ -434,9 +434,6 @@ while read -r path; do

 done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort  | cut -f2 -d' ')

-# Old 'find' used:
-# done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort)
-
 #
 # No shows processed? There was nothing to do
 #
--- a/InternetArchive/update_state
+++ b/InternetArchive/update_state
@ -3,7 +3,7 @@
 #
 #         FILE: update_state
 #
-#        USAGE: ./update_state
+#        USAGE: ./update_state [-h] [-D] [-d] [-F] [-l N] [-m]
 #
 #  DESCRIPTION: A script to update the state of shows which have been sent to
 #               the IA. It looks at the current state of the 'reservations'
@ -136,7 +136,6 @@ esac

 cd "$BASEDIR" || { echo "Can't cd to $BASEDIR"; exit 1; }

-
 #
 # Tools
 #