forked from HPR/hpr-tools
		
	New 'reformat_html', plus some cleaning
InternetArchive/future_upload: now updates the state of shows
InternetArchive/reformat_html: new Perl script to reformat the HTML
    originally found in the HPR database in the 'notes' field to the format
    required in the 'description' field of an item on the IA. It reads
    from STDIN and writes to STDOUT.
			
			
This commit is contained in:
		| @@ -1,9 +1,10 @@ | |||||||
| #!/bin/bash - | #!/bin/bash - | ||||||
|  | # shellcheck disable=SC2317 | ||||||
| #=============================================================================== | #=============================================================================== | ||||||
| # | # | ||||||
| #         FILE: future_upload | #         FILE: future_upload | ||||||
| # | # | ||||||
| #        USAGE: ./future_upload | #        USAGE: ./future_upload [-h] [-v] [-D] [-d {0|1}] [-F] [-r] [-l cp] | ||||||
| # | # | ||||||
| #  DESCRIPTION: Uploads future HPR shows based on what is in the upload area | #  DESCRIPTION: Uploads future HPR shows based on what is in the upload area | ||||||
| # | # | ||||||
| @@ -13,9 +14,9 @@ | |||||||
| #        NOTES: Contains methods from 'delete_uploaded' and 'weekly_upload' as | #        NOTES: Contains methods from 'delete_uploaded' and 'weekly_upload' as | ||||||
| #               well as 'update_state' | #               well as 'update_state' | ||||||
| #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||||||
| #      VERSION: 0.0.16 | #      VERSION: 0.0.17 | ||||||
| #      CREATED: 2021-01-07 12:11:02 | #      CREATED: 2021-01-07 12:11:02 | ||||||
| #     REVISION: 2025-01-01 11:48:40 | #     REVISION: 2025-01-06 17:51:57 | ||||||
| # | # | ||||||
| #=============================================================================== | #=============================================================================== | ||||||
|  |  | ||||||
| @@ -26,7 +27,7 @@ SCRIPT=${0##*/} | |||||||
|  |  | ||||||
| STDOUT="/dev/fd/2" | STDOUT="/dev/fd/2" | ||||||
|  |  | ||||||
| VERSION="0.0.16" | VERSION="0.0.17" | ||||||
|  |  | ||||||
| # | # | ||||||
| # Load library functions | # Load library functions | ||||||
| @@ -36,7 +37,7 @@ LIB="$HOME/bin/function_lib.sh" | |||||||
| # shellcheck disable=SC1090 | # shellcheck disable=SC1090 | ||||||
| source "$LIB" | source "$LIB" | ||||||
|  |  | ||||||
| # {{{ -- Functions -- check_uploads, _log, _usage | # {{{ -- Functions -- check_uploads, update_show_state, _log, _usage | ||||||
|  |  | ||||||
| #===  FUNCTION  ================================================================ | #===  FUNCTION  ================================================================ | ||||||
| #         NAME: check_uploads | #         NAME: check_uploads | ||||||
| @@ -72,6 +73,36 @@ check_uploads () { | |||||||
|     return 0 |     return 0 | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #===  FUNCTION  ================================================================ | ||||||
|  | #         NAME: update_show_state | ||||||
|  | #  DESCRIPTION: Updates the status of a single show in the HPR database. | ||||||
|  | #               It is assumed the caller has found the show number in the | ||||||
|  | #               'reservations' table with the required status of | ||||||
|  | #               'MEDIA_TRANSCODED'. All this function does is to change this | ||||||
|  | #               to 'UPLOADED_TO_IA', returning true if successful, otherwise | ||||||
|  | #               false. Anything curl writes to standard output is passed | ||||||
|  | #               through to the caller. | ||||||
|  | #   PARAMETERS: $show           Show number to update (required; the | ||||||
|  | #                               ':?' expansion reports a usage error if it | ||||||
|  | #                               is missing or empty) | ||||||
|  | #      RETURNS: True if the curl command exited with status 0, otherwise | ||||||
|  | #               false | ||||||
|  | #=============================================================================== | ||||||
|  | update_show_state () { | ||||||
|  |     local show=${1:?Usage: update_show_state show} | ||||||
|  |     local URL | ||||||
|  |     local -a COMMAND | ||||||
|  |  | ||||||
|  |     URL="https://hub.hackerpublicradio.org/cms/status.php" | ||||||
|  |  | ||||||
|  |     # | ||||||
|  |     # Build the command as an array rather than a string. The URL contains | ||||||
|  |     # a '?', which is a glob character, so running it through an unquoted | ||||||
|  |     # string expansion would expose it to pathname expansion and word | ||||||
|  |     # splitting. As an array element it reaches curl as a single argument. | ||||||
|  |     # The curl configuration file is looked for in the current directory. | ||||||
|  |     # | ||||||
|  |     COMMAND=( | ||||||
|  |         curl -K ./.hpradmin_curlrc -s | ||||||
|  |         "${URL}?ep_num=${show}&status=UPLOADED_TO_IA" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     # | ||||||
|  |     # NOTE(review): curl's exit status reports transfer problems only. An | ||||||
|  |     # HTTP error response still gives status 0 unless '--fail' is enabled | ||||||
|  |     # (for example in the curlrc file) - confirm whether that is wanted. | ||||||
|  |     # | ||||||
|  |     "${COMMAND[@]}" || return 1 | ||||||
|  |  | ||||||
|  |     return 0 | ||||||
|  | } | ||||||
|  |  | ||||||
| #===  FUNCTION  ================================================================ | #===  FUNCTION  ================================================================ | ||||||
| #         NAME: _log | #         NAME: _log | ||||||
| #  DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script | #  DESCRIPTION: Writes a log record to the predefined $LOGFILE in this script | ||||||
| @@ -83,7 +114,7 @@ check_uploads () { | |||||||
| #   PARAMETERS: 1 - the message to write | #   PARAMETERS: 1 - the message to write | ||||||
| #      RETURNS: Nothing | #      RETURNS: Nothing | ||||||
| #=============================================================================== | #=============================================================================== | ||||||
| # shellcheck disable=SC2317 disable=SC2059 | # shellcheck disable=SC2059 | ||||||
| _log () { | _log () { | ||||||
|     local msg="$1" |     local msg="$1" | ||||||
|  |  | ||||||
| @@ -180,7 +211,7 @@ BASECOM='curl -K ./.hpradmin_curlrc -s' | |||||||
| URL="https://hub.hackerpublicradio.org/cms/status.php" | URL="https://hub.hackerpublicradio.org/cms/status.php" | ||||||
| # QUERY1="${BASECOM} ${URL}" | # QUERY1="${BASECOM} ${URL}" | ||||||
| QUERY2="${BASECOM} -o - ${URL}" | QUERY2="${BASECOM} -o - ${URL}" | ||||||
| UPSTATE="$BASEDIR/update_state" | # UPSTATE="$BASEDIR/update_state" | ||||||
|  |  | ||||||
| # | # | ||||||
| # Fallback URL | # Fallback URL | ||||||
| @@ -199,10 +230,10 @@ ia=$(command -v ia) | |||||||
|     echo "Needs the 'make_metadata' script" |     echo "Needs the 'make_metadata' script" | ||||||
|     exit 1 |     exit 1 | ||||||
| } | } | ||||||
| [ -e "$UPSTATE" ] || { | # [ -e "$UPSTATE" ] || { | ||||||
|     echo "Needs the 'update_state' script" | #     echo "Needs the 'update_state' script" | ||||||
|     exit 1 | #     exit 1 | ||||||
| } | # } | ||||||
|  |  | ||||||
| # | # | ||||||
| # File of processed shows | # File of processed shows | ||||||
| @@ -234,6 +265,9 @@ do | |||||||
| done | done | ||||||
| shift $((OPTIND - 1)) | shift $((OPTIND - 1)) | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Check and set option variables | ||||||
|  | # | ||||||
| DRYRUN=${DRYRUN:-1} | DRYRUN=${DRYRUN:-1} | ||||||
| if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then | if [[ $DRYRUN -ne 0 && $DRYRUN -ne 1 ]]; then | ||||||
|     echo "** Use '-d 0' or '-d 1'" |     echo "** Use '-d 0' or '-d 1'" | ||||||
| @@ -272,6 +306,7 @@ fi | |||||||
|  |  | ||||||
| # | # | ||||||
| # Declarations | # Declarations | ||||||
|  | # ------------ | ||||||
| # | # | ||||||
| declare -A processed | declare -A processed | ||||||
| declare -A ready | declare -A ready | ||||||
| @@ -282,6 +317,7 @@ lastitem= | |||||||
|  |  | ||||||
| # | # | ||||||
| # Load array of processed shows | # Load array of processed shows | ||||||
|  | # ---- ----- -- --------- ----- | ||||||
| # | # | ||||||
| while read -r item; do | while read -r item; do | ||||||
|     processed+=([$item]=1) |     processed+=([$item]=1) | ||||||
| @@ -289,46 +325,17 @@ done < "$PROCFILE" | |||||||
| [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" | [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" | ||||||
|  |  | ||||||
| # | # | ||||||
| # TODO: Create the associative array 'ready' containing the numbers of shows | # Populate the associative array 'ready' with the numbers of shows ready for | ||||||
| # ready for upload. This is a way to ensure that we don't try and upload shows | # upload. This is a way to ensure that we don't try and upload shows in | ||||||
| # in transit to the upload area. | # transit to the upload area. Only do this if force mode is off. | ||||||
| # | # | ||||||
| # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |  | ||||||
| # Proposed code. Not sure what the actual URL will be nor what will be |  | ||||||
| # returned if nothing is ready for upload yet |  | ||||||
| # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |  | ||||||
| # |  | ||||||
| # json=$(curl http://hackerpublicradio.org/queue.php -s -o -) |  | ||||||
| # while read -r showno; do |  | ||||||
| #     ready+=([$showno]=1) |  | ||||||
| # done < <(echo "${json}" | jq '.READY_FOR_IA_UPLOAD[] | tonumber') |  | ||||||
| # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |  | ||||||
| # Change of plan. Now we have a list of CSV values, so we need to do something |  | ||||||
| # like this: |  | ||||||
| # |  | ||||||
| # reservations=$($BASECOM -o - $URL) |  | ||||||
| # while read -r line; do |  | ||||||
| #    if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then |  | ||||||
| #       state="${BASH_REMATCH[5]}" |  | ||||||
| #       show="${BASH_REMATCH[2]}" |  | ||||||
| #    fi |  | ||||||
| #    if [[ $state = 'MEDIA_TRANSCODED' ]]; then |  | ||||||
| #       ready+=([$show]=1) |  | ||||||
| #    fi |  | ||||||
| # done <<< $reservations |  | ||||||
| # |  | ||||||
| # At the end of this the associative array 'ready' will contain the keys of |  | ||||||
| # shows that are ready for upload (presumably) so we can look in this array to |  | ||||||
| # double check. |  | ||||||
| # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |  | ||||||
|  |  | ||||||
| if [[ $FORCE -eq 0 ]]; then | if [[ $FORCE -eq 0 ]]; then | ||||||
|     # |     # | ||||||
|     # Collect the current table of shows requiring work. We expect something like: |     # Collect the current table of shows requiring work. We expect something like: | ||||||
|     # timestamp_epoc,ep_num,ep_date,key,status,email |     # timestamp_epoc,ep_num,ep_date,key,status,email | ||||||
|     # 1651286617,3617,2022-06-14,fda088e0e3bd5d0353ea6b7569e93b87626ca25976a0a,UPLOADED_TO_IA,lurkingprion@gmail.com |     # 1651286617,3617,2022-06-14,fda088e0e3bd5d0353ea6b7569e93b87626ca25976a0a,UPLOADED_TO_IA,lurkingprion@gmail.com | ||||||
|     # 1651648589,3619,2022-06-16,e7d3810afa098863d81663418d8640276272284de68f1,UPLOADED_TO_IA,monochromec@gmail.com |     # 1651648589,3619,2022-06-16,e7d3810afa098863d81663418d8640276272284de68f1,UPLOADED_TO_IA,monochromec@gmail.com | ||||||
|     # TODO: Check for a failure in the query?A |     # TODO: Reinstate the check for a failure in the query? See update_state | ||||||
|     # NOTE: Problem encountered 2022-09-23 because the SSL certificate has expired |     # NOTE: Problem encountered 2022-09-23 because the SSL certificate has expired | ||||||
|     # |     # | ||||||
|     reservations=$($QUERY2) || { |     reservations=$($QUERY2) || { | ||||||
| @@ -353,8 +360,8 @@ if [[ $FORCE -eq 0 ]]; then | |||||||
|     fi |     fi | ||||||
|  |  | ||||||
|     # |     # | ||||||
|     # The query returns the bare number, but we're using 'hprxxxx' as the key in |     # The query returns the bare show number, but we're using 'hprxxxx' as the | ||||||
|     # the 'ready' array. |     # key in the 'ready' array. | ||||||
|     # |     # | ||||||
|     while read -r line; do |     while read -r line; do | ||||||
|         if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then |         if [[ $line =~ ^([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),.*$ ]]; then | ||||||
| @@ -374,7 +381,10 @@ fi | |||||||
|  |  | ||||||
| # | # | ||||||
| # Process files. There will be several with the same prefix so look for | # Process files. There will be several with the same prefix so look for | ||||||
| # a change of prefix | # a change of prefix. | ||||||
|  | # | ||||||
|  | # The loop is reading from the following pipeline: | ||||||
|  | # find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort | ||||||
| # | # | ||||||
| while read -r path; do | while read -r path; do | ||||||
|     # |     # | ||||||
| @@ -390,8 +400,8 @@ while read -r path; do | |||||||
|     _DEBUG "Item $item" |     _DEBUG "Item $item" | ||||||
|  |  | ||||||
|     # |     # | ||||||
|     # Detect that the item prefix has changed. If it has we're processing |     # Detect that the item prefix has changed. If it has we've found a new IA | ||||||
|     # a new IA identifier, so work on this one |     # identifier, so work on the previous one | ||||||
|     # |     # | ||||||
|     if [[ $item != "$lastitem" ]]; then |     if [[ $item != "$lastitem" ]]; then | ||||||
|         lastitem=$item |         lastitem=$item | ||||||
| @@ -425,7 +435,8 @@ while read -r path; do | |||||||
|                 processed+=([$lastitem]=1) |                 processed+=([$lastitem]=1) | ||||||
|             else |             else | ||||||
|                 # |                 # | ||||||
|                 # Is the show ready for upload? |                 # Is the show ready for upload? We don't check if force mode | ||||||
|  |                 # is on. If not ready we skip this show. | ||||||
|                 # |                 # | ||||||
|                 if [[ $FORCE -eq 0 ]]; then |                 if [[ $FORCE -eq 0 ]]; then | ||||||
|                     if [[ ! -v "ready[$lastitem]" ]]; then |                     if [[ ! -v "ready[$lastitem]" ]]; then | ||||||
| @@ -472,10 +483,9 @@ while read -r path; do | |||||||
|  |  | ||||||
| done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort) | done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort) | ||||||
|  |  | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| # Write the processed array to the cache file unless in dry-run mode | # Write the processed array to the cache file unless in dry-run mode | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| # [ $DEBUG -eq 1 ] && { echo -n 'D> '; declare -p processed; } |  | ||||||
| _DEBUG "processed = ${!processed[*]}" | _DEBUG "processed = ${!processed[*]}" | ||||||
| [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" | [ "$VERBOSE" -eq 1 ] && echo "Number of shows in cache: ${#processed[@]}" | ||||||
| if [[ $DRYRUN -ne 1 ]]; then | if [[ $DRYRUN -ne 1 ]]; then | ||||||
| @@ -484,24 +494,26 @@ if [[ $DRYRUN -ne 1 ]]; then | |||||||
|     done < <(printf '%s\n' "${!processed[@]}" | sort -u ) > "$PROCFILE" |     done < <(printf '%s\n' "${!processed[@]}" | sort -u ) > "$PROCFILE" | ||||||
| fi | fi | ||||||
|  |  | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| # Generate the list of uploads for the 'make_metadata' option '-list=1,2,3'. | # Generate the list of uploads for the 'make_metadata' option '-list=1,2,3'. | ||||||
|  | # The show numbers are keys in the associative array 'uploads'. The | ||||||
|  | # end-product is a comma-separated list of the keys in the variable '$list'. | ||||||
| # Order is unimportant because make_metadata sorts internally. | # Order is unimportant because make_metadata sorts internally. | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| _DEBUG "uploads = ${!uploads[*]}" | _DEBUG "uploads = ${!uploads[*]}" | ||||||
| [ "$VERBOSE" -eq 1 ] && echo "Number of shows for upload: ${#uploads[@]}" | [ "$VERBOSE" -eq 1 ] && echo "Number of shows for upload: ${#uploads[@]}" | ||||||
| printf -v list '%s,' "${!uploads[@]}" | printf -v list '%s,' "${!uploads[@]}" | ||||||
| list="${list:0:-1}" | list="${list:0:-1}" | ||||||
|  |  | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| # If there are no uploads to do we can stop | # If there are no uploads to do we can stop | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| [[ ! -v uploads[@] ]] && { echo "Nothing to do!"; exit; } | [[ ! -v uploads[@] ]] && { echo "Nothing to do!"; exit; } | ||||||
|  |  | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| # Check that the shows being uploaded have all their files and log what is | # Check that the shows being uploaded have all their files and log what is | ||||||
| # happening. | # happening. | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| while read -r show; do | while read -r show; do | ||||||
|     echo "$(date +%Y%m%d%H%M%S) preparing to upload hpr$show" >> "$LOGFILE" |     echo "$(date +%Y%m%d%H%M%S) preparing to upload hpr$show" >> "$LOGFILE" | ||||||
|  |  | ||||||
| @@ -512,10 +524,10 @@ while read -r show; do | |||||||
|     fi |     fi | ||||||
| done < <(printf '%s\n' "${!uploads[@]}" | sort) | done < <(printf '%s\n' "${!uploads[@]}" | sort) | ||||||
|  |  | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| # Define output files. If the list contains one element then it's a different | # Define output files. If the list contains one element then it's a different | ||||||
| # name from the multi-element case (make_metadata does this too). | # name from the multi-element case (make_metadata does this too). | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| if [[ ${#uploads[@]} -eq 1 ]]; then | if [[ ${#uploads[@]} -eq 1 ]]; then | ||||||
|     metadata="metadata_${minshow}.csv" |     metadata="metadata_${minshow}.csv" | ||||||
|     script="script_${minshow}.sh" |     script="script_${minshow}.sh" | ||||||
| @@ -524,9 +536,9 @@ else | |||||||
|     script="script_${minshow}-${maxshow}.sh" |     script="script_${minshow}-${maxshow}.sh" | ||||||
| fi | fi | ||||||
|  |  | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| # Perform the uploads or report what would be done | # Perform the uploads or report what would be done | ||||||
| # | #------------------------------------------------------------------------------- | ||||||
| if [[ $DRYRUN -eq 1 ]]; then | if [[ $DRYRUN -eq 1 ]]; then | ||||||
|     echo "Dry run: Would have uploaded list '$list'" |     echo "Dry run: Would have uploaded list '$list'" | ||||||
|     echo "Dry run: Would have created $metadata and $script" |     echo "Dry run: Would have created $metadata and $script" | ||||||
| @@ -573,17 +585,17 @@ else | |||||||
|                 echo "$(date +%Y%m%d%H%M%S) ${#uploads[@]} uploads completed" >> "$LOGFILE" |                 echo "$(date +%Y%m%d%H%M%S) ${#uploads[@]} uploads completed" >> "$LOGFILE" | ||||||
|  |  | ||||||
|                 # |                 # | ||||||
|                 # Update the state in the HPR database, unless we're using |                 # Update the state of all the shows being processed in the | ||||||
|                 # FORCE. Pass the limit used here to this script so it can |                 # HPR database, unless we're using FORCE. | ||||||
|                 # stop looking for work unnecessarily |  | ||||||
|                 # |                 # | ||||||
|                 if [[ $FORCE -eq 0 ]]; then |                 if [[ $FORCE -eq 0 ]]; then | ||||||
|                     $UPSTATE -l$LIMIT |                     while read -r show; do | ||||||
|                     RES=$? |                         if update_show_state $show; then | ||||||
|                     if [[ $RES -ne 0 ]]; then |                             echo "Updated state for show $show" | ||||||
|                         echo "Problem updating database state" |                         else | ||||||
|                         exit 1 |                             echo "Failed to update state for show $show" | ||||||
|                     fi |                         fi | ||||||
|  |                     done < <(printf '%s\n' "${!uploads[@]}" | sort) | ||||||
|                 else |                 else | ||||||
|                     echo "Not updating the database, FORCE mode is on" |                     echo "Not updating the database, FORCE mode is on" | ||||||
|                 fi |                 fi | ||||||
|   | |||||||
							
								
								
									
										245
									
								
								InternetArchive/reformat_html
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										245
									
								
								InternetArchive/reformat_html
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,245 @@ | |||||||
|  | #!/usr/bin/env perl | ||||||
|  | #=============================================================================== | ||||||
|  | # | ||||||
|  | #         FILE: reformat_html | ||||||
|  | # | ||||||
|  | #        USAGE: ./reformat_html < input.html > output.html | ||||||
|  | # | ||||||
|  | #  DESCRIPTION: Reformats the HTML found in the HPR database in the 'notes' | ||||||
|  | #               field to the format required in the 'description' field of an | ||||||
|  | #               item on the IA. It reads from STDIN and writes to STDOUT. | ||||||
|  | # | ||||||
|  | #      OPTIONS: --- | ||||||
|  | # REQUIREMENTS: --- | ||||||
|  | #         BUGS: --- | ||||||
|  | #        NOTES: --- | ||||||
|  | #       AUTHOR: Dave Morriss (djm), Dave.Morriss@gmail.com | ||||||
|  | #      VERSION: 0.0.1 | ||||||
|  | #      CREATED: 2025-02-09 22:56:30 | ||||||
|  | #     REVISION: 2025-02-13 11:13:37 | ||||||
|  | # | ||||||
|  | #=============================================================================== | ||||||
|  |  | ||||||
|  | use v5.36; | ||||||
|  | use strict; | ||||||
|  | use warnings; | ||||||
|  | use feature qw{ say try }; | ||||||
|  | no warnings qw{ experimental::try }; | ||||||
|  |  | ||||||
|  | use open ':std', ':encoding(UTF-8)'; # Make all IO UTF-8 | ||||||
|  |  | ||||||
|  | use HTML::TreeBuilder 5 -weak; | ||||||
|  | use HTML::Entities; | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Version number (Incremented by Vim) | ||||||
|  | # | ||||||
|  | our $VERSION = '0.0.1'; | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Declarations | ||||||
|  | # | ||||||
|  | my (@notes, $notes, $tree); | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Read the input data into an array, one element per line of STDIN | ||||||
|  | # | ||||||
|  | try { | ||||||
|  |     @notes = <STDIN>; | ||||||
|  | } | ||||||
|  | catch ($e) { | ||||||
|  |     warn "Problem reading input HTML; $e"; | ||||||
|  |     exit 1; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | die "No input HTML detected\n" unless @notes; | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Turn the array into a scalar | ||||||
|  | # | ||||||
|  | $notes = join( '', @notes ); | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Get ready to parse the notes. The settings ask the parser to keep tags it | ||||||
|  | # does not recognise, to leave entities in the text unexpanded, to store | ||||||
|  | # comments in the tree and to report syntax problems through 'warn'. | ||||||
|  | # | ||||||
|  | $tree = HTML::TreeBuilder->new; | ||||||
|  | $tree->ignore_unknown(0); | ||||||
|  | $tree->no_expand_entities(1); | ||||||
|  | $tree->p_strict(1); | ||||||
|  | $tree->store_comments(1);               # Necessary? | ||||||
|  | $tree->warn(1); | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Parse HTML to the tree structure. The message does not include $! because | ||||||
|  | # the parser does not report its problems through the system error variable. | ||||||
|  | # | ||||||
|  | $tree->parse_content($notes) | ||||||
|  |     or die "HTML::TreeBuilder failed to parse input HTML\n"; | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Flatten all <pre> tags and add <br/> tags | ||||||
|  | # | ||||||
|  | $notes = flatten_pre($tree); | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Deal with non-ASCII. The second argument is a character class; the leading | ||||||
|  | # '^' means that everything except newline, '&' and the printable ASCII | ||||||
|  | # characters is turned into an entity. | ||||||
|  | # | ||||||
|  | $notes = encode_entities( $notes, '^\n&\x20-\x25\x27-\x7e' ); | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Remove all newlines | ||||||
|  | # | ||||||
|  | $notes =~ s/\n//g; | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Write the end result to the STDOUT | ||||||
|  | # | ||||||
|  | say $notes; | ||||||
|  |  | ||||||
|  | exit; | ||||||
|  |  | ||||||
|  | #===  FUNCTION  ================================================================ | ||||||
|  | #         NAME: flatten_pre | ||||||
|  | #      PURPOSE: Process notes "flattening" <pre> contents | ||||||
|  | #   PARAMETERS: $tree   HTML::TreeBuilder object containing parsed and | ||||||
|  | #                       partially processed notes | ||||||
|  | #      RETURNS: The contents of the <body> element as a string of HTML | ||||||
|  | #  DESCRIPTION: The HTML "<pre>" tag encloses preformatted text. It can also | ||||||
|  | #               contain some formatting tags like <em> and <code>, but spaces | ||||||
|  | #               and newlines are significant. The Internet Archive upload API | ||||||
|  | #               uses HTTP headers which are text strings without newlines, so | ||||||
|  | #               when these tags are uploaded through this route some | ||||||
|  | #               formatting is lost. What this routine does is parse the | ||||||
|  | #               contents of all <pre> sections in $tree, adding <br/> tags | ||||||
|  | #               to replace newlines. It has to perform a full parse | ||||||
|  | #               since the contents may include HTML tags and these need to be | ||||||
|  | #               passed through intact. It calls the subroutine 'flatten_item' to | ||||||
|  | #               deal with the recursive nature of HTML tags, telling it | ||||||
|  | #               which item is the first one in the <pre> so that only the | ||||||
|  | #               blank line following the opening tag is dropped. | ||||||
|  | #       THROWS: No exceptions | ||||||
|  | #     COMMENTS: None | ||||||
|  | #     SEE ALSO: N/A | ||||||
|  | #=============================================================================== | ||||||
|  | sub flatten_pre { | ||||||
|  |     my ($tree) = @_; | ||||||
|  |  | ||||||
|  |     # | ||||||
|  |     # Find all the <pre> tags | ||||||
|  |     # | ||||||
|  |     my @pre_tags = $tree->look_down( _tag => 'pre', ); | ||||||
|  |  | ||||||
|  |     # | ||||||
|  |     # Walk the various <pre> elements in the document | ||||||
|  |     # | ||||||
|  |     foreach my $tag (@pre_tags) { | ||||||
|  |         # | ||||||
|  |         # Save the tag and empty the original | ||||||
|  |         # | ||||||
|  |         my $saved = $tag->clone(); | ||||||
|  |         $tag->delete_content(); | ||||||
|  |  | ||||||
|  |         # | ||||||
|  |         # Walk the saved content and rebuild it in @atag using the nested | ||||||
|  |         # arrayref structure permitted by HTML::Element for convenience | ||||||
|  |         # (the alternative is a little nasty). See the documentation for | ||||||
|  |         # 'new_from_lol' in HTML::Element. Only the first item is flagged | ||||||
|  |         # as being at the start of the <pre>. | ||||||
|  |         # | ||||||
|  |         my @atag; | ||||||
|  |         my $at_start = 1; | ||||||
|  |         foreach my $item ( @{ $saved->content_array_ref } ) { | ||||||
|  |             push( @atag, flatten_item( $item, $at_start ) ); | ||||||
|  |             $at_start = 0; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         # | ||||||
|  |         # Rebuild the tag from the list we built. The items are passed as | ||||||
|  |         # a list rather than as a single arrayref because otherwise the top | ||||||
|  |         # level is interpreted as a spurious <null> tag. | ||||||
|  |         # | ||||||
|  |         $tag->push_content(@atag); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     # | ||||||
|  |     # Turn the <body> element back into HTML and trim the enclosing <body> | ||||||
|  |     # tags from the result, leaving just the notes | ||||||
|  |     # | ||||||
|  |     my $body = $tree->look_down( _tag => 'body' ); | ||||||
|  |     ( my $result = $body->as_HTML( undef, ' ', {} ) ) | ||||||
|  |         =~ s{(^<body[^>]*>|</body>$)}{}gi; | ||||||
|  |  | ||||||
|  |     return $result; | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #===  FUNCTION  ================================================================ | ||||||
|  | #         NAME: flatten_item | ||||||
|  | #      PURPOSE: Recursively "flatten" items within the enclosing <pre> | ||||||
|  | #   PARAMETERS: $item       an HTML::Element item parsed from the original | ||||||
|  | #                           <pre> section | ||||||
|  | #               $at_start   (optional) true if $item is the first thing in | ||||||
|  | #                           the <pre> section, in which case a leading | ||||||
|  | #                           blank line is removed. Defaults to true so that | ||||||
|  | #                           a call with one argument still removes it. | ||||||
|  | #      RETURNS: An arrayref if the item was a tag, otherwise a list | ||||||
|  | #  DESCRIPTION: Since <pre> sections can contain inline elements which change | ||||||
|  | #               the rendering of the text we need to parse these as we add | ||||||
|  | #               <br/> tags. This routine does this by recursively descending | ||||||
|  | #               through the contents. A common tag sequence is <pre><code> for | ||||||
|  | #               scripts and the like. This routine deals with such sequences. | ||||||
|  | #               It expects to receive the contents in sequence and builds the | ||||||
|  | #               result as a nested arrayref structure. | ||||||
|  | #       THROWS: No exceptions | ||||||
|  | #     COMMENTS: None | ||||||
|  | #     SEE ALSO: N/A | ||||||
|  | #=============================================================================== | ||||||
|  | sub flatten_item { | ||||||
|  |     my ( $item, $at_start ) = @_; | ||||||
|  |  | ||||||
|  |     return unless defined($item); | ||||||
|  |  | ||||||
|  |     $at_start //= 1; | ||||||
|  |  | ||||||
|  |     my ( @result, %attr ); | ||||||
|  |  | ||||||
|  |     # | ||||||
|  |     # Is it a sub-tag or non-tag content? | ||||||
|  |     # | ||||||
|  |     if ( ref($item) ) { | ||||||
|  |         # | ||||||
|  |         # It's a tag. Save the tag name and any attributes and recurse into | ||||||
|  |         # it. Only its first child can still be at the start of the <pre>. | ||||||
|  |         # Return an arrayref | ||||||
|  |         # | ||||||
|  |         push( @result, $item->tag() ); | ||||||
|  |         %attr = $item->all_external_attr(); | ||||||
|  |         push( @result, \%attr ) if %attr; | ||||||
|  |         for my $child ( $item->content_list() ) { | ||||||
|  |             push( @result, flatten_item( $child, $at_start ) ); | ||||||
|  |             $at_start = 0; | ||||||
|  |         } | ||||||
|  |         return \@result; | ||||||
|  |     } | ||||||
|  |     else { | ||||||
|  |         # | ||||||
|  |         # It's non-tag content. Join the lines with <br/> tags.  Return an | ||||||
|  |         # array (since this is a simple list). | ||||||
|  |         # | ||||||
|  |         # Note that we split with a LIMIT of -1 which causes any trailing list | ||||||
|  |         # items to be returned; default behaviour is to drop them. | ||||||
|  |         # | ||||||
|  |         $item =~ s/\r//g; | ||||||
|  |         my @content = split( /\n/, $item, -1 ); | ||||||
|  |         if (@content) { | ||||||
|  |             # | ||||||
|  |             # Remove a leading blank line - usually the result of | ||||||
|  |             # a "<pre>'NL'text" sequence. This is only done at the start | ||||||
|  |             # of the <pre>, and only when a line break really follows, | ||||||
|  |             # otherwise a newline after an inline tag or a lone space | ||||||
|  |             # between two tags would be lost. | ||||||
|  |             # | ||||||
|  |             shift(@content) | ||||||
|  |                 if ( $at_start && @content > 1 && $content[0] =~ /^\s*$/ ); | ||||||
|  |  | ||||||
|  |             # | ||||||
|  |             # Join back the lines with <br/> tags between them. | ||||||
|  |             # | ||||||
|  |             foreach my $txt (@content) { | ||||||
|  |                 push( @result, $txt, ['br'] ); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             # | ||||||
|  |             # Remove the <br/> at the end, it's spurious | ||||||
|  |             # | ||||||
|  |             pop(@result); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return (@result); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | # vim: syntax=perl:ts=8:sw=4:et:ai:tw=78:fo=tcrqn21:fdm=marker | ||||||
|  |  | ||||||
| @@ -6,7 +6,7 @@ | |||||||
| #        USAGE: ./tidy_uploaded [-h] [-v] [-d {0|1}] [-c COUNT] | #        USAGE: ./tidy_uploaded [-h] [-v] [-d {0|1}] [-c COUNT] | ||||||
| # | # | ||||||
| #  DESCRIPTION: Relocates HPR audio and other show-related files on 'borg' | #  DESCRIPTION: Relocates HPR audio and other show-related files on 'borg' | ||||||
| #               after their shows have been uploaded to the Internet Archive | #               after their shows have been uploaded to the Internet Archive. | ||||||
| # | # | ||||||
| #      OPTIONS: --- | #      OPTIONS: --- | ||||||
| # REQUIREMENTS: --- | # REQUIREMENTS: --- | ||||||
| @@ -43,7 +43,7 @@ TMP1=$(mktemp) || { echo "$SCRIPT: creation of temporary file failed!"; exit 1; | |||||||
| trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT | trap 'cleanup_temp $TMP1' SIGHUP SIGINT SIGPIPE SIGTERM EXIT | ||||||
|  |  | ||||||
| # | # | ||||||
| # Configure depending whether local or on the VPS | # Configure depending whether local or on 'borg' | ||||||
| # | # | ||||||
| case $HOSTNAME in | case $HOSTNAME in | ||||||
|     borg)       BASEDIR="$HOME/InternetArchive" |     borg)       BASEDIR="$HOME/InternetArchive" | ||||||
| @@ -95,7 +95,7 @@ queued_tasks () { | |||||||
| #         NAME: movefile | #         NAME: movefile | ||||||
| #  DESCRIPTION: Moves a file to a new place, catering for any directories in | #  DESCRIPTION: Moves a file to a new place, catering for any directories in | ||||||
| #               the path | #               the path | ||||||
| #   PARAMETERS: $1      directory to move form | #   PARAMETERS: $1      directory to move from | ||||||
| #               $2      directory to move to | #               $2      directory to move to | ||||||
| #               $3      file (or sub-path to move) | #               $3      file (or sub-path to move) | ||||||
| #      RETURNS: True if a move was done, otherwise False | #      RETURNS: True if a move was done, otherwise False | ||||||
| @@ -356,7 +356,7 @@ while read -r path; do | |||||||
|         # |         # | ||||||
|         tasks=$(queued_tasks "$item") |         tasks=$(queued_tasks "$item") | ||||||
|         if [[ $tasks -gt 0 ]]; then |         if [[ $tasks -gt 0 ]]; then | ||||||
|             echo "** Item $item still has $tasks unfinished " \ |             echo "** Item $item still has $tasks unfinished" \ | ||||||
|                 "$(ngettext task tasks "$tasks")" |                 "$(ngettext task tasks "$tasks")" | ||||||
|             echo "** Skipping to the next item" |             echo "** Skipping to the next item" | ||||||
|             continue |             continue | ||||||
| @@ -434,9 +434,6 @@ while read -r path; do | |||||||
|  |  | ||||||
| done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort  | cut -f2 -d' ') | done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' -printf "%CY%Cm%Cd%CH%CM%CS %p\n" | sort  | cut -f2 -d' ') | ||||||
|  |  | ||||||
| # Old 'find' used: |  | ||||||
| # done < <(find "$UPLOADS" -regextype posix-extended -regex '.*hpr[0-9]{4}.*' | sort) |  | ||||||
|  |  | ||||||
| # | # | ||||||
| # No shows processed? There was nothing to do | # No shows processed? There was nothing to do | ||||||
| # | # | ||||||
|   | |||||||
| @@ -3,7 +3,7 @@ | |||||||
| # | # | ||||||
| #         FILE: update_state | #         FILE: update_state | ||||||
| # | # | ||||||
| #        USAGE: ./update_state | #        USAGE: ./update_state [-h] [-D] [-d] [-F] [-l N] [-m] | ||||||
| # | # | ||||||
| #  DESCRIPTION: A script to update the state of shows which have been sent to | #  DESCRIPTION: A script to update the state of shows which have been sent to | ||||||
| #               the IA. It looks at the current state of the 'reservations' | #               the IA. It looks at the current state of the 'reservations' | ||||||
| @@ -136,7 +136,6 @@ esac | |||||||
|  |  | ||||||
| cd "$BASEDIR" || { echo "Can't cd to $BASEDIR"; exit 1; } | cd "$BASEDIR" || { echo "Can't cd to $BASEDIR"; exit 1; } | ||||||
|  |  | ||||||
|  |  | ||||||
| # | # | ||||||
| # Tools | # Tools | ||||||
| # | # | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user