From 0f1e72748719e08783cee5edf714e6a9cd370363 Mon Sep 17 00:00:00 2001 From: Dave Morriss Date: Thu, 13 Feb 2025 11:24:27 +0000 Subject: [PATCH] New 'reformat_html', plus some cleaning InternetArchive/future_upload: now updates the state of shows InternetArchive/reformat_html: new Perl script to reformat the HTML originally found in the HPR database in the 'notes' field to the format required in the 'description' field of an item on the IA. It reads from STDIN and writes to STDOUT. --- InternetArchive/future_upload | 156 ++++++++++++---------- InternetArchive/reformat_html | 245 ++++++++++++++++++++++++++++++++++ InternetArchive/tidy_uploaded | 11 +- InternetArchive/update_state | 3 +- 4 files changed, 334 insertions(+), 81 deletions(-) create mode 100755 InternetArchive/reformat_html diff --git a/InternetArchive/future_upload b/InternetArchive/future_upload index 5b2868d..4117c36 100755 --- a/InternetArchive/future_upload +++ b/InternetArchive/future_upload @@ -1,9 +1,10 @@ #!/bin/bash - +# shellcheck disable=SC2317 #=============================================================================== # # FILE: future_upload # -# USAGE: ./future_upload +# USAGE: ./future_upload [-h] [-v] [-D] [-d {0|1}] [-F] [-r] [-l cp] # # DESCRIPTION: Uploads future HPR shows based on what is in the upload area # @@ -13,9 +14,9 @@ # NOTES: Contains methods from 'delete_uploaded' and 'weekly_upload' as # well as 'update_state' # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.16 +# VERSION: 0.0.17 # CREATED: 2021-01-07 12:11:02 -# REVISION: 2025-01-01 11:48:40 +# REVISION: 2025-01-06 17:51:57 # #=============================================================================== @@ -26,7 +27,7 @@ SCRIPT=${0##*/} STDOUT="/dev/fd/2" -VERSION="0.0.16" +VERSION="0.0.17" # # Load library functions @@ -36,7 +37,7 @@ LIB="$HOME/bin/function_lib.sh" # shellcheck disable=SC1090 source "$LIB" -# {{{ -- Functions -- check_uploads, _log, _usage +# {{{ -- Functions -- check_uploads, update_show_state, _log, _usage #=== FUNCTION ================================================================ # NAME: check_uploads @@ -72,6 +73,36 @@ check_uploads () { return 0 } +#=== FUNCTION ================================================================ +# NAME: update_show_state +# DESCRIPTION: Updates the status of a single show in the HPR database. +# It is assumed the caller has found the show number in the +# 'reservations' table with the required status of +# 'MEDIA_TRANSCODED'. All this function does is to change this +# to 'UPLOADED_TO_IA', returning true if successful, otherwise +# false. +# PARAMETERS: $show Show number to update +# RETURNS: True if the update worked, otherwise false +#=============================================================================== +update_show_state () { + local show=${1:?Usage: update_state show} + local BASECOM URL QUERY COMMAND RES + + BASECOM='curl -K ./.hpradmin_curlrc -s' + URL="https://hub.hackerpublicradio.org/cms/status.php" + QUERY="${BASECOM} ${URL}" + + COMMAND="${QUERY}?ep_num=${show}&status=UPLOADED_TO_IA" + + $COMMAND + RES=$? 
+ if [[ $RES -ne 0 ]]; then + return 1 + fi + + return 0 +} + #=== FUNCTION ================================================================ # NAME: _log # DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script @@ -83,7 +114,7 @@ check_uploads () { # PARAMETERS: 1 - the message to write # RETURNS: Nothing #=============================================================================== -# shellcheck disable=SC2317 disable=SC2059 +# shellcheck disable=SC2059 _log () { local msg="$1" @@ -180,7 +211,7 @@ BASECOM='curl -K ./.hpradmin_curlrc -s' URL="https://hub.hackerpublicradio.org/cms/status.php" # QUERY1="${BASECOM} ${URL}" QUERY2="${BASECOM} -o - ${URL}" -UPSTATE="$BASEDIR/update_state" +# UPSTATE="$BASEDIR/update_state" # # Fallback URL @@ -199,10 +230,10 @@ ia=$(command -v ia) echo "Needs the 'make_metadata' script" exit 1 } -[ -e "$UPSTATE" ] || { - echo "Needs the 'update_state' script" - exit 1 -} +# [ -e "$UPSTATE" ] || { +# echo "Needs the 'update_state' script" +# exit 1 +# } # # File of processed shows @@ -234,6 +265,9 @@ do done shift $((OPTIND - 1)) +# +# Check and set option variables +# DRYRUN=${DRYRUN:-1} if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then echo "** Use '-d 0' or '-d 1'" @@ -272,6 +306,7 @@ fi # # Declarations +# ------------ # declare -A processed declare -A ready @@ -282,6 +317,7 @@ lastitem= # # Load array of processed shows +# ---- ----- -- --------- ----- # while read -r item; do processed+=([$item]=1) @@ -289,46 +325,17 @@ done < "$PROCFILE" [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" # -# TODO: Create the associative array 'ready' containing the numbers of shows -# ready for upload. This is a way to ensure that we don't try and upload shows -# in transit to the upload area. +# Populate the associative array 'ready' with the numbers of shows ready for +# upload. This is a way to ensure that we don't try and upload shows in +# transit to the upload area. Only do this if force mode is off. # -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Proposed code. Not sure what the actual URL will be nor what will be -# returned if nothing is ready for upload yet -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# json=$(curl http://hackerpublicradio.org/queue.php -s -o -) -# while read -r showno; do -# ready+=([$showno]=1) -# done < <(echo "${json}" | jq '.READY_FOR_IA_UPLOAD[] | tonumber') -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Change of plan. Now we have a list of CSV values, so we need to do something -# like this: -# -# reservations=$($BASECOM -o - $URL) -# while read -r line; do -# if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then -# state="${BASH_REMATCH[5]}" -# show="${BASH_REMATCH[2]}" -# fi -# if [[ $state = 'MEDIA_TRANSCODED' ]]; then -# ready+=([$show]=1) -# fi -# done <<< $reservations -# -# At the end of this the associative array 'ready' will contain the keys of -# shows that are ready for upload (presumably) so we can look in this array to -# double check. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - if [[ $FORCE -eq 0 ]]; then # # Collect the current table of shows requiring work. 
We expect something like: # timestamp_epoc,ep_num,ep_date,key,status,email # 1651286617,3617,2022-06-14,fda088e0e3bd5d0353ea6b7569e93b87626ca25976a0a,UPLOADED_TO_IA,lurkingprion@gmail.com # 1651648589,3619,2022-06-16,e7d3810afa098863d81663418d8640276272284de68f1,UPLOADED_TO_IA,monochromec@gmail.com - # TODO: Check for a failure in the query?A + # TODO: Reinstate the check for a failure in the query? Se update_state # NOTE: Problem encountered 2022-09-23 because the SSL certificate has expired # reservations=$($QUERY2) || { @@ -353,8 +360,8 @@ if [[ $FORCE -eq 0 ]]; then fi # - # The query returns the bare number, but we're using 'hprxxxx' as the key in - # the 'ready' array. + # The query returns the bare show number, but we're using 'hprxxxx' as the + # key in the 'ready' array. # while read -r line; do if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then @@ -374,7 +381,10 @@ fi # # Process files. There will be several with the same prefix so look for -# a change of prefix +# a change of prefix. +# +# The loop is reading from the following pipeline: +# find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort # while read -r path; do # @@ -390,8 +400,8 @@ while read -r path; do _DEBUG "Item $item" # - # Detect that the item prefix has changed. If it has we're processing - # a new IA identifier, so work on this one + # Detect that the item prefix has changed. If it has we've found a new IA + # identifier, so work on the previous one # if [[ $item != "$lastitem" ]]; then lastitem=$item @@ -425,7 +435,8 @@ while read -r path; do processed+=([$lastitem]=1) else # - # Is the show ready for upload? + # Is the show ready for upload? We don't check if force mode + # is on. If not ready we skip this show. # if [[ $FORCE -eq 0 ]]; then if [[ ! -v "ready[$lastitem]" ]]; then @@ -472,10 +483,9 @@ while read -r path; do done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort) -# +#------------------------------------------------------------------------------- # Write the processed array to the cache file unless in dry-run mode -# -# [ $DEBUG -eq 1 ] && { echo -n 'D> '; declare -p processed; } +#------------------------------------------------------------------------------- _DEBUG "processed = ${!processed[*]}" [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" if [[ $DRYRUN -ne 1 ]]; then @@ -484,24 +494,26 @@ if [[ $DRYRUN -ne 1 ]]; then done < <(printf '%s\n' "${!processed[@]}" | sort -u ) > "$PROCFILE" fi -# +#------------------------------------------------------------------------------- # Generate the list of uploads for the 'make_metadata' option '-list=1,2,3'. +# The show numbers are keys in the associative array 'uploads'. The +# end-product is a comma-separated list of the keys in the variable '$list'. # Order is unimportant because make_metadata sorts internally. -# +#------------------------------------------------------------------------------- _DEBUG "uploads = ${!uploads[*]}" [ "$VERBOSE" -eq 1 ] && echo "Number of shows for upload: ${#uploads[@]}" printf -v list '%s,' "${!uploads[@]}" list="${list:0:-1}" -# +#------------------------------------------------------------------------------- # If there are no uploads to do we can stop -# +#------------------------------------------------------------------------------- [[ ! 
-v uploads[@] ]] && { echo "Nothing to do!"; exit; } -# +#------------------------------------------------------------------------------- # Check that the shows being uploaded have all their files and log what is # happening. -# +#------------------------------------------------------------------------------- while read -r show; do echo "$(date +%Y%m%d%H%M%S) preparing to upload hpr$show" >> "$LOGFILE" @@ -512,10 +524,10 @@ while read -r show; do fi done < <(printf '%s\n' "${!uploads[@]}" | sort) -# +#------------------------------------------------------------------------------- # Define output files. If the list contains one element then it's a different # name from the multi-element case (make_metadata does this too). -# +#------------------------------------------------------------------------------- if [[ ${#uploads[@]} -eq 1 ]]; then metadata="metadata_${minshow}.csv" script="script_${minshow}.sh" @@ -524,9 +536,9 @@ else script="script_${minshow}-${maxshow}.sh" fi -# +#------------------------------------------------------------------------------- # Perform the uploads or report what would be done -# +#------------------------------------------------------------------------------- if [[ $DRYRUN -eq 1 ]]; then echo "Dry run: Would have uploaded list '$list'" echo "Dry run: Would have created $metadata and $script" @@ -573,17 +585,17 @@ else echo "$(date +%Y%m%d%H%M%S) ${#uploads[@]} uploads completed" >> "$LOGFILE" # - # Update the state in the HPR database, unless we're using - # FORCE. Pass the limit used here to this script so it can - # stop looking for work unnecessarily + # Update the state of all the shows being processed in the + # HPR database, unless we're using FORCE. # if [[ $FORCE -eq 0 ]]; then - $UPSTATE -l$LIMIT - RES=$? - if [[ $RES -ne 0 ]]; then - echo "Problem updating database state" - exit 1 - fi + while read -r show; do + if update_show_state $show; then + echo "Updated state for show $show" + else + echo "Failed to update state for show $show" + fi + done < <(printf '%s\n' "${!uploads[@]}" | sort) else echo "Not updating the database, FORCE mode is on" fi diff --git a/InternetArchive/reformat_html b/InternetArchive/reformat_html new file mode 100755 index 0000000..a7d6106 --- /dev/null +++ b/InternetArchive/reformat_html @@ -0,0 +1,245 @@ +#!/usr/bin/env perl +#=============================================================================== +# +# FILE: reformat_html +# +# USAGE: ./reformat_html < input.html > output.html +# +# DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes' +# field to the format required in the 'description' field of an +# item on the IA. It reads from STDIN and writes to STDOUT. 
+#
+#      OPTIONS: ---
+# REQUIREMENTS: ---
+#         BUGS: ---
+#        NOTES: ---
+#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
+#      VERSION: 0.0.1
+#      CREATED: 2025-02-09 22:56:30
+#     REVISION: 2025-02-13 11:13:37
+#
+#===============================================================================
+
+use v5.36;
+use strict;
+use warnings;
+use feature qw{ say try };
+no warnings qw{ experimental::try };
+
+use open ':std', ':encoding(UTF-8)';    # Make all IO UTF-8
+
+use HTML::TreeBuilder 5 -weak;
+use HTML::Entities;
+
+#
+# Version number (Incremented by Vim)
+#
+our $VERSION = '0.0.1';
+
+#
+# Declarations
+#
+my ($verbose, @notes, $notes, $tree);
+
+#
+# Read the input data into an array
+#
+try {
+    @notes = <STDIN>;
+}
+catch ($e) {
+    warn "Problem reading input HTML; $e";
+    exit 1;
+}
+
+die "No input HTML detected\n" unless @notes;
+
+#
+# Turn the array into a scalar
+#
+$notes = join( '', @notes );
+
+#
+# Get ready to parse the array
+#
+$tree = HTML::TreeBuilder->new;
+$tree->ignore_unknown(0);
+$tree->no_expand_entities(1);
+$tree->p_strict(1);
+$tree->store_comments(1);    # Necessary?
+$tree->warn(1);
+
+#
+# Parse HTML to the tree structure
+#
+$tree->parse_content($notes)
+    or die "HTML::TreeBuilder failed to parse input HTML: $!\n";
+
+#
+# Flatten all <pre> tags and add <br/> tags
+#
+$notes = flatten_pre($tree);
+
+#
+# Deal with non-ASCII
+#
+$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
+
+#
+# Remove all newlines
+#
+$notes =~ s/\n//g;
+
+#
+# Write the end result to the STDOUT
+#
+say $notes;
+
+exit;
+
+#===  FUNCTION  ================================================================
+#         NAME: flatten_pre
+#      PURPOSE: Process notes "flattening" <pre> contents
+#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
+#                       partially processed notes
+#      RETURNS: Processed notes
+#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
+#               contain some inline formatting tags, but spaces
+#               and newlines are significant. The Internet Archive upload API
+#               uses HTTP headers which are text strings without newlines, so
+#               when these tags are uploaded through this route some
+#               formatting is lost. What this routine does is parse the
+#               contents of all <pre> sections in $notes, adding <br/> tags
+#               to replace newlines. It has to perform a full parse
+#               since the contents may include HTML tags and these need to be
+#               passed through intact. It calls the subroutine 'flatten_item' to
+#               deal with the recursive nature of HTML tags.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub flatten_pre {
+    my ($tree) = @_;
+
+    #
+    # Find all the <pre> tags
+    #
+    my @pre_tags = $tree->look_down( _tag => 'pre', );
+
+    #
+    # Walk the various <pre> elements in the document
+    #
+    foreach my $tag (@pre_tags) {
+        #
+        # Save the tag and empty the original
+        #
+        my $saved = $tag->clone();
+        $tag->delete_content();
+
+        #
+        # Walk the saved content and rebuild the tag into $atag using the
+        # nested arrayref structure permitted by HTML::Element for
+        # convenience (the alternative is a little nasty). See the
+        # documentation for 'new_from_lol' in HTML::Element.
+        #
+        my $atag;
+        foreach my $item ( @{ $saved->content_array_ref } ) {
+            push( @$atag, flatten_item($item) );
+        }
+
+        #
+        # Rebuild the tag from the arrayref we built. We treat the arrayref
+        # structure we just built as an array because otherwise the top level
+        # is interpreted as a spurious tag.
+        #
+        $tag->push_content(@$atag);
+    }
+
+    #
+    # Trim out the original notes from the enclosing tags we added earlier
+    #
+    my $body = $tree->look_down( _tag => 'body' );
+    ( my $result = $body->as_HTML( undef, ' ', {} ) )
+        =~ s{(^<body[^>]*>|</body>$)}{}gi;
+
+    return $result;
+
+}
+
+#===  FUNCTION  ================================================================
+#         NAME: flatten_item
+#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
+#   PARAMETERS: $item   an HTML::Element item parsed from the original
+#                       <pre> section
+#      RETURNS: An arrayref if the last seen item was a tag, otherwise a list
+#  DESCRIPTION: Since <pre> sections can contain inline elements which change
+#               the rendering of the text we need to parse these as we add
+#               <br/> tags. This routine does this by recursively descending
+#               through the contents. A common tag sequence is <pre><code> for
+#               scripts and the like. This routine deals with such sequences.
+#               It expects to receive the contents in sequence and builds the
+#               result as a nested arrayref structure.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub flatten_item {
+    my ($item) = @_;
+
+    return unless defined($item);
+
+    my ( @result, %attr );
+
+    #
+    # Is it a sub-tag or non-tag content?
+    #
+    if ( ref($item) ) {
+        #
+        # It's a tag. Save the tag name and any attributes and recurse into
+        # it. Return an arrayref
+        #
+        push( @result, $item->tag() );
+        %attr = $item->all_external_attr();
+        push( @result, \%attr ) if %attr;
+        for my $child ( $item->content_list() ) {
+            push( @result, flatten_item($child) );
+        }
+        return \@result;
+    }
+    else {
+        #
+        # It's non-tag content. Join the lines with <br/> tags. Return an
+        # array (since this is a simple list).
+        #
+        # Note that we split with a LIMIT of -1 which causes any trailing list
+        # items to be returned; default behaviour is to drop them.
+        #
+        $item =~ s/\r//g;
+        my @content = split( /\n/, $item, -1 );
+        if (@content) {
+            #
+            # Remove a leading blank line - usually the result of
+            # a "<pre>'NL'text" sequence
+            #
+            shift(@content) if ( $content[0] =~ /^\s*$/ );
+
+            #
+            # Join back the lines with <br/> tags between them.
+            #
+            foreach my $txt (@content) {
+                push( @result, $txt, ['br'] );
+            }
+
+            #
+            # Remove the <br/> at the end, it's spurious
+            #
+            pop(@result);
+        }
+
+        return (@result);
+    }
+
+}
+
+# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker
+
diff --git a/InternetArchive/tidy_uploaded b/InternetArchive/tidy_uploaded
index fee3a0a..4680373 100755
--- a/InternetArchive/tidy_uploaded
+++ b/InternetArchive/tidy_uploaded
@@ -6,7 +6,7 @@
 #        USAGE: ./tidy_uploaded [-h] [-v] [-d {0|1}] [-c COUNT]
 #
 #  DESCRIPTION: Relocates HPR audio and other show-related files on 'borg'
-#               after their shows have been uploaded to the Internet Archive
+#               after their shows have been uploaded to the Internet Archive.
 #
 #      OPTIONS: ---
 # REQUIREMENTS: ---
@@ -43,7 +43,7 @@ TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1;
 trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT
 
 #
-# Configure depending whether local or on the VPS
+# Configure depending whether local or on 'borg'
 #
 case $HOSTNAME in
     borg)
         BASEDIR="$HOME/InternetArchive"
@@ -95,7 +95,7 @@ queued_tasks () {
 #         NAME: movefile
 #  DESCRIPTION: Moves a file to a new place, catering for any directories in
 #               the path
-#   PARAMETERS: $1      directory to move form
+#   PARAMETERS: $1      directory to move from
 #               $2      directory to move to
 #               $3      file (or sub-path to move)
 #      RETURNS: True if a move was done, otherwise False
@@ -356,7 +356,7 @@ while read -r path; do
     #
     tasks=$(queued_tasks "$item")
     if [[ $tasks -gt 0 ]]; then
-        echo "** Item $item still has $tasks unfinished " \
+        echo "** Item $item still has $tasks unfinished" \
             "$(ngettext task tasks "$tasks")"
         echo "** Skipping to the next item"
         continue
@@ -434,9 +434,6 @@ while read -r path; do
 
 done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2 -d' ')
 
-# Old 'find' used:
-# done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort)
-
 #
 # No shows processed? There was nothing to do
 #
diff --git a/InternetArchive/update_state b/InternetArchive/update_state
index 3ae90af..778609d 100755
--- a/InternetArchive/update_state
+++ b/InternetArchive/update_state
@@ -3,7 +3,7 @@
 #
 #         FILE: update_state
 #
-#        USAGE: ./update_state
+#        USAGE: ./update_state [-h] [-D] [-d] [-F] [-l N] [-m]
 #
 #  DESCRIPTION: A script to update the state of shows which have been sent to
 #               the IA. It looks at the current state of the 'reservations'
@@ -136,7 +136,6 @@ esac
 
 cd "$BASEDIR" || { echo "Can't cd to $BASEDIR"; exit 1; }
 
-
 #
 # Tools
 #
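
A quick way to exercise the new 'reformat_html' script from the top of the
repository (the sample input here is made up, and the exact markup emitted
depends on how HTML::TreeBuilder serialises the tree; HTML::TreeBuilder and
HTML::Entities must be installed):

    printf '<p>Demo notes</p><pre>first line\nsecond line</pre>\n' | ./InternetArchive/reformat_html

The output should be a single line of HTML in which the newlines inside the
<pre> block have been replaced by <br/> elements and any non-ASCII characters
encoded as entities - the form needed for the 'description' field of an item
on the IA.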