diff --git a/InternetArchive/future_upload b/InternetArchive/future_upload index 5b2868d..4117c36 100755 --- a/InternetArchive/future_upload +++ b/InternetArchive/future_upload @@ -1,9 +1,10 @@ #!/bin/bash - +# shellcheck disable=SC2317 #=============================================================================== # # FILE: future_upload # -# USAGE: ./future_upload +# USAGE: ./future_upload [-h] [-v] [-D] [-d {0|1}] [-F] [-r] [-l cp] # # DESCRIPTION: Uploads future HPR shows based on what is in the upload area # @@ -13,9 +14,9 @@ # NOTES: Contains methods from 'delete_uploaded' and 'weekly_upload' as # well as 'update_state' # AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com -# VERSION: 0.0.16 +# VERSION: 0.0.17 # CREATED: 2021-01-07 12:11:02 -# REVISION: 2025-01-01 11:48:40 +# REVISION: 2025-01-06 17:51:57 # #=============================================================================== @@ -26,7 +27,7 @@ SCRIPT=${0##*/} STDOUT="/dev/fd/2" -VERSION="0.0.16" +VERSION="0.0.17" # # Load library functions @@ -36,7 +37,7 @@ LIB="$HOME/bin/function_lib.sh" # shellcheck disable=SC1090 source "$LIB" -# {{{ -- Functions -- check_uploads, _log, _usage +# {{{ -- Functions -- check_uploads, update_show_state, _log, _usage #=== FUNCTION ================================================================ # NAME: check_uploads @@ -72,6 +73,36 @@ check_uploads () { return 0 } +#=== FUNCTION ================================================================ +# NAME: update_show_state +# DESCRIPTION: Updates the status of a single show in the HPR database. +# It is assumed the caller has found the show number in the +# 'reservations' table with the required status of +# 'MEDIA_TRANSCODED'. All this function does is to change this +# to 'UPLOADED_TO_IA', returning true if successful, otherwise +# false. 
+#   PARAMETERS: $show           Show number to update
+#      RETURNS: True if the update worked, otherwise false
+#===============================================================================
+update_show_state () {
+    local show=${1:?Usage: update_show_state show}
+    local -a COMMAND
+    local URL
+
+    URL="https://hub.hackerpublicradio.org/cms/status.php"
+
+    #
+    # Run curl from an array: an unquoted string expansion is word-split
+    # and glob-expanded, and the URL contains the glob character '?'
+    #
+    COMMAND=(curl -K ./.hpradmin_curlrc -s
+        "${URL}?ep_num=${show}&status=UPLOADED_TO_IA")
+
+    "${COMMAND[@]}" || return 1
+
+    return 0
+}
+
 #=== FUNCTION ================================================================ # NAME: _log # DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script @@ -83,7 +114,7 @@ check_uploads () { # PARAMETERS: 1 - the message to write # RETURNS: Nothing #=============================================================================== -# shellcheck disable=SC2317 disable=SC2059 +# shellcheck disable=SC2059 _log () { local msg="$1" @@ -180,7 +211,7 @@ BASECOM='curl -K ./.hpradmin_curlrc -s' URL="https://hub.hackerpublicradio.org/cms/status.php" # QUERY1="${BASECOM} ${URL}" QUERY2="${BASECOM} -o - ${URL}" -UPSTATE="$BASEDIR/update_state" +# UPSTATE="$BASEDIR/update_state" # # Fallback URL @@ -199,10 +230,10 @@ ia=$(command -v ia) echo "Needs the 'make_metadata' script" exit 1 } -[ -e "$UPSTATE" ] || { - echo "Needs the 'update_state' script" - exit 1 -} +# [ -e "$UPSTATE" ] || { +# echo "Needs the 'update_state' script" +# exit 1 +# } # # File of processed shows @@ -234,6 +265,9 @@ do done shift $((OPTIND - 1)) +# +# Check and set option variables +# DRYRUN=${DRYRUN:-1} if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then echo "** Use '-d 0' or '-d 1'" @@ -272,6 +306,7 @@ fi # # Declarations +# ------------ # declare -A processed declare -A ready @@ -282,6 +317,7 @@ lastitem= # # Load array of processed shows +# ---- ----- -- --------- ----- # while read -r item; do processed+=([$item]=1) @@ -289,46 +325,17 @@ done < 
"$PROCFILE" [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" # -# TODO: Create the associative array 'ready' containing the numbers of shows -# ready for upload. This is a way to ensure that we don't try and upload shows -# in transit to the upload area. +# Populate the associative array 'ready' with the numbers of shows ready for +# upload. This is a way to ensure that we don't try and upload shows in +# transit to the upload area. Only do this if force mode is off. # -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Proposed code. Not sure what the actual URL will be nor what will be -# returned if nothing is ready for upload yet -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# json=$(curl http://hackerpublicradio.org/queue.php -s -o -) -# while read -r showno; do -# ready+=([$showno]=1) -# done < <(echo "${json}" | jq '.READY_FOR_IA_UPLOAD[] | tonumber') -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Change of plan. Now we have a list of CSV values, so we need to do something -# like this: -# -# reservations=$($BASECOM -o - $URL) -# while read -r line; do -# if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then -# state="${BASH_REMATCH[5]}" -# show="${BASH_REMATCH[2]}" -# fi -# if [[ $state = 'MEDIA_TRANSCODED' ]]; then -# ready+=([$show]=1) -# fi -# done <<< $reservations -# -# At the end of this the associative array 'ready' will contain the keys of -# shows that are ready for upload (presumably) so we can look in this array to -# double check. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - if [[ $FORCE -eq 0 ]]; then # # Collect the current table of shows requiring work. 
We expect something like: # timestamp_epoc,ep_num,ep_date,key,status,email # 1651286617,3617,2022-06-14,fda088e0e3bd5d0353ea6b7569e93b87626ca25976a0a,UPLOADED_TO_IA,lurkingprion@gmail.com # 1651648589,3619,2022-06-16,e7d3810afa098863d81663418d8640276272284de68f1,UPLOADED_TO_IA,monochromec@gmail.com - # TODO: Check for a failure in the query?A + # TODO: Reinstate the check for a failure in the query? See update_state # NOTE: Problem encountered 2022-09-23 because the SSL certificate has expired # reservations=$($QUERY2) || { @@ -353,8 +360,8 @@ if [[ $FORCE -eq 0 ]]; then fi # - # The query returns the bare number, but we're using 'hprxxxx' as the key in - # the 'ready' array. + # The query returns the bare show number, but we're using 'hprxxxx' as the + # key in the 'ready' array. # while read -r line; do if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then @@ -374,7 +381,10 @@ fi # # Process files. There will be several with the same prefix so look for -# a change of prefix +# a change of prefix. +# +# The loop is reading from the following pipeline: +# find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort # while read -r path; do # @@ -390,8 +400,8 @@ while read -r path; do _DEBUG "Item $item" # - # Detect that the item prefix has changed. If it has we're processing - # a new IA identifier, so work on this one + # Detect that the item prefix has changed. If it has we've found a new IA + # identifier, so remember it in 'lastitem' and work on this new one # if [[ $item != "$lastitem" ]]; then lastitem=$item @@ -425,7 +435,8 @@ while read -r path; do processed+=([$lastitem]=1) else # - # Is the show ready for upload? + # Is the show ready for upload? We don't check if force mode + # is on. If not ready we skip this show. # if [[ $FORCE -eq 0 ]]; then if [[ ! 
-v "ready[$lastitem]" ]]; then @@ -472,10 +483,9 @@ while read -r path; do done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort) -# +#------------------------------------------------------------------------------- # Write the processed array to the cache file unless in dry-run mode -# -# [ $DEBUG -eq 1 ] && { echo -n 'D> '; declare -p processed; } +#------------------------------------------------------------------------------- _DEBUG "processed = ${!processed[*]}" [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" if [[ $DRYRUN -ne 1 ]]; then @@ -484,24 +494,26 @@ if [[ $DRYRUN -ne 1 ]]; then done < <(printf '%s\n' "${!processed[@]}" | sort -u ) > "$PROCFILE" fi -# +#------------------------------------------------------------------------------- # Generate the list of uploads for the 'make_metadata' option '-list=1,2,3'. +# The show numbers are keys in the associative array 'uploads'. The +# end-product is a comma-separated list of the keys in the variable '$list'. # Order is unimportant because make_metadata sorts internally. -# +#------------------------------------------------------------------------------- _DEBUG "uploads = ${!uploads[*]}" [ "$VERBOSE" -eq 1 ] && echo "Number of shows for upload: ${#uploads[@]}" printf -v list '%s,' "${!uploads[@]}" list="${list:0:-1}" -# +#------------------------------------------------------------------------------- # If there are no uploads to do we can stop -# +#------------------------------------------------------------------------------- [[ ! -v uploads[@] ]] && { echo "Nothing to do!"; exit; } -# +#------------------------------------------------------------------------------- # Check that the shows being uploaded have all their files and log what is # happening. 
-# +#------------------------------------------------------------------------------- while read -r show; do echo "$(date +%Y%m%d%H%M%S) preparing to upload hpr$show" >> "$LOGFILE" @@ -512,10 +524,10 @@ while read -r show; do fi done < <(printf '%s\n' "${!uploads[@]}" | sort) -# +#------------------------------------------------------------------------------- # Define output files. If the list contains one element then it's a different # name from the multi-element case (make_metadata does this too). -# +#------------------------------------------------------------------------------- if [[ ${#uploads[@]} -eq 1 ]]; then metadata="metadata_${minshow}.csv" script="script_${minshow}.sh" @@ -524,9 +536,9 @@ else script="script_${minshow}-${maxshow}.sh" fi -# +#------------------------------------------------------------------------------- # Perform the uploads or report what would be done -# +#------------------------------------------------------------------------------- if [[ $DRYRUN -eq 1 ]]; then echo "Dry run: Would have uploaded list '$list'" echo "Dry run: Would have created $metadata and $script" @@ -573,17 +585,17 @@ else echo "$(date +%Y%m%d%H%M%S) ${#uploads[@]} uploads completed" >> "$LOGFILE" # - # Update the state in the HPR database, unless we're using - # FORCE. Pass the limit used here to this script so it can - # stop looking for work unnecessarily + # Update the state of all the shows being processed in the + # HPR database, unless we're using FORCE. # if [[ $FORCE -eq 0 ]]; then - $UPSTATE -l$LIMIT - RES=$? 
- if [[ $RES -ne 0 ]]; then - echo "Problem updating database state" - exit 1 - fi + while read -r show; do + if update_show_state "$show"; then + echo "Updated state for show $show" + else + echo "Failed to update state for show $show" + fi + done < <(printf '%s\n' "${!uploads[@]}" | sort) else echo "Not updating the database, FORCE mode is on" fi
diff --git a/InternetArchive/reformat_html b/InternetArchive/reformat_html
new file mode 100755
index 0000000..a7d6106
--- /dev/null
+++ b/InternetArchive/reformat_html
@@ -0,0 +1,245 @@
+#!/usr/bin/env perl
+#===============================================================================
+#
+#         FILE: reformat_html
+#
+#        USAGE: ./reformat_html < input.html > output.html
+#
+#  DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes'
+#               field to the format required in the 'description' field of an
+#               item on the IA. It reads from STDIN and writes to STDOUT.
+#
+#      OPTIONS: ---
+# REQUIREMENTS: ---
+#         BUGS: ---
+#        NOTES: ---
+#       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com
+#      VERSION: 0.0.1
+#      CREATED: 2025-02-09 22:56:30
+#     REVISION: 2025-02-13 11:13:37
+#
+#===============================================================================
+
+use v5.36;
+use strict;
+use warnings;
+use feature qw{ say try };
+no warnings qw{ experimental::try };
+
+use open ':std', ':encoding(UTF-8)';    # Make all IO UTF-8
+
+use HTML::TreeBuilder 5 -weak;
+use HTML::Entities;
+
+#
+# Version number (Incremented by Vim)
+#
+our $VERSION = '0.0.1';
+
+#
+# Declarations
+#
+my ($verbose, @notes, $notes, $tree);
+
+#
+# Read the input data into an array (one element per line of STDIN)
+#
+try {
+    @notes = <STDIN>;
+}
+catch ($e) {
+    warn "Problem reading input HTML; $e";
+    exit 1;
+}
+
+die "No input HTML detected\n" unless @notes;
+
+#
+# Turn the array into a scalar
+#
+$notes = join( '', @notes );
+
+#
+# Get ready to parse the scalar
+#
+$tree = HTML::TreeBuilder->new;
+$tree->ignore_unknown(0);
+$tree->no_expand_entities(1);
+$tree->p_strict(1);
+$tree->store_comments(1);    # Necessary?
+$tree->warn(1);
+
+#
+# Parse HTML to the tree structure
+#
+$tree->parse_content($notes)
+    or die "HTML::TreeBuilder failed to parse input HTML: $!\n";
+
+#
+# Flatten all <pre> tags and add <br/> tags
+#
+$notes = flatten_pre($tree);
+
+#
+# Deal with non-ASCII
+#
+$notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' );
+
+#
+# Remove all newlines
+#
+$notes =~ s/\n//g;
+
+#
+# Write the end result to the STDOUT
+#
+say $notes;
+
+exit;
+
+#===  FUNCTION  ================================================================
+#         NAME: flatten_pre
+#      PURPOSE: Process notes "flattening" <pre> contents
+#   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and
+#                       partially processed notes
+#      RETURNS: Processed notes as an HTML string (the <body> contents)
+#  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also
+#               contain some inline formatting tags, but spaces
+#               and newlines are significant. The Internet Archive upload API
+#               uses HTTP headers which are text strings without newlines, so
+#               when these tags are uploaded through this route some
+#               formatting is lost. What this routine does is parse the
+#               contents of all <pre> sections in $tree, adding <br/> tags
+#               to replace newlines. It has to perform a full parse
+#               since the contents may include HTML tags and these need to be
+#               passed through intact. It calls the subroutine 'flatten_item' to
+#               deal with the recursive nature of HTML tags.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub flatten_pre {
+    my ($tree) = @_;
+
+    #
+    # Find all the <pre> tags
+    #
+    my @pre_tags = $tree->look_down( _tag => 'pre', );
+
+    #
+    # Walk the various <pre> elements in the document
+    #
+    foreach my $tag (@pre_tags) {
+        #
+        # Save the tag and empty the original
+        #
+        my $saved = $tag->clone();
+        $tag->delete_content();
+
+        #
+        # Walk the saved content and rebuild the tag into @atag using the
+        # nested arrayref structure permitted by HTML::Element for
+        # convenience (the alternative is a little nasty). See the
+        # documentation for 'new_from_lol' in HTML::Element.
+        #
+        my @atag;
+        foreach my $item ( @{ $saved->content_array_ref } ) {
+            push( @atag, flatten_item($item) );
+        }
+
+        #
+        # Put the flattened items back into the emptied tag. They are passed
+        # as a list rather than as one arrayref because an arrayref's first
+        # element is interpreted as a tag name.
+        #
+        $tag->push_content(@atag);
+    }
+
+    #
+    # Serialise <body> and trim off the enclosing <body> and </body> tags
+    #
+    my $body = $tree->look_down( _tag => 'body' );
+    ( my $result = $body->as_HTML( undef, ' ', {} ) )
+        =~ s{(^<body[^>]*>|</body>$)}{}gi;
+
+    return $result;
+
+}
+
+#===  FUNCTION  ================================================================
+#         NAME: flatten_item
+#      PURPOSE: Recursively "flatten" items within the enclosing <pre>
+#   PARAMETERS: $item   an HTML::Element or a text string taken from the
+#                       content of the original <pre> section
+#      RETURNS: An arrayref if $item is a tag (a reference), otherwise a list
+#  DESCRIPTION: Since <pre> sections can contain inline elements which change
+#               the rendering of the text we need to parse these as we add
+#               <br/> tags. This routine does this by recursively descending
+#               through the contents. A common tag sequence is <pre><code> for
+#               scripts and the like. This routine deals with such sequences.
+#               It expects to receive the contents in sequence and builds the
+#               result as a nested arrayref structure.
+#       THROWS: No exceptions
+#     COMMENTS: None
+#     SEE ALSO: N/A
+#===============================================================================
+sub flatten_item {
+    my ($item) = @_;
+
+    return unless defined($item);
+
+    my ( @result, %attr );
+
+    #
+    # Is it a sub-tag or non-tag content?
+    #
+    if ( ref($item) ) {
+        #
+        # It's a tag. Save the tag name and any attributes and recurse into
+        # it. Return an arrayref
+        #
+        push( @result, $item->tag() );
+        %attr = $item->all_external_attr();
+        push( @result, \%attr ) if %attr;
+        for my $child ( $item->content_list() ) {
+            push( @result, flatten_item($child) );
+        }
+        return \@result;
+    }
+    else {
+        #
+        # It's non-tag content. Join the lines with <br/> tags. Return an
+        # array (since this is a simple list).
+        #
+        # Note that we split with a LIMIT of -1 which causes any trailing list
+        # items to be returned; default behaviour is to drop them.
+        #
+        $item =~ s/\r//g;
+        my @content = split( /\n/, $item, -1 );
+        if (@content) {
+            #
+            # Remove a leading blank line - usually the result of a
+            # "<pre>'NL'text" sequence - but keep a lone space between tags
+            #
+            shift(@content) if ( @content > 1 && $content[0] =~ /^\s*$/ );
+
+            #
+            # Join back the lines with <br/> tags between them.
+            #
+            foreach my $txt (@content) {
+                push( @result, $txt, ['br'] );
+            }
+
+            #
+            # Remove the <br/> at the end, it's spurious
+            #
+            pop(@result);
+        }
+
+        return (@result);
+    }
+
+}
+ +# vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker + + diff --git a/InternetArchive/tidy_uploaded b/InternetArchive/tidy_uploaded index fee3a0a..4680373 100755 --- a/InternetArchive/tidy_uploaded +++ b/InternetArchive/tidy_uploaded @@ -6,7 +6,7 @@ # USAGE: ./tidy_uploaded [-h] [-v] [-d {0|1}] [-c COUNT] # # DESCRIPTION: Relocates HPR audio and other show-related files on 'borg' -# after their shows have been uploaded to the Internet Archive +# after their shows have been uploaded to the Internet Archive. # # OPTIONS: --- # REQUIREMENTS: --- @@ -43,7 +43,7 @@ TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT # -# Configure depending whether local or on the VPS +# Configure depending whether local or on 'borg' # case $HOSTNAME in borg) BASEDIR="$HOME/InternetArchive" @@ -95,7 +95,7 @@ queued_tasks () { # NAME: movefile # DESCRIPTION: Moves a file to a new place, catering for any directories in # the path -# PARAMETERS: $1 directory to move form +# PARAMETERS: $1 directory to move from # $2 directory to move to # $3 file (or sub-path to move) # RETURNS: True if a move was done, otherwise False @@ -356,7 +356,7 @@ while read -r path; do # tasks=$(queued_tasks "$item") if [[ $tasks -gt 0 ]]; then - echo "** Item $item still has $tasks unfinished " \ + echo "** Item $item still has $tasks unfinished" \ "$(ngettext task tasks "$tasks")" echo "** Skipping to the next item" continue @@ -434,9 +434,6 @@ while read -r path; do done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort | cut -f2 -d' ') -# Old 'find' used: -# done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort) - # # No shows processed? 
There was nothing to do # diff --git a/InternetArchive/update_state b/InternetArchive/update_state index 3ae90af..778609d 100755 --- a/InternetArchive/update_state +++ b/InternetArchive/update_state @@ -3,7 +3,7 @@ # # FILE: update_state # -# USAGE: ./update_state +# USAGE: ./update_state [-h] [-D] [-d] [-F] [-l N] [-m] # # DESCRIPTION: A script to update the state of shows which have been sent to # the IA. It looks at the current state of the 'reservations' @@ -136,7 +136,6 @@ esac cd "$BASEDIR" || { echo "Can't cd to $BASEDIR"; exit 1; } - # # Tools #