From 5cfdd42b11aa2a811e49bd159c5b699b79873b07 Mon Sep 17 00:00:00 2001 From: Ken Fallon Date: Thu, 16 Jan 2025 22:02:43 +0100 Subject: [PATCH] The show processing needs to be refactored #5 --- workflow/process_episode.bash | 184 ++++++++++++++++++++++++++++------ 1 file changed, 151 insertions(+), 33 deletions(-) diff --git a/workflow/process_episode.bash b/workflow/process_episode.bash index 295fd2c..69c1d7c 100755 --- a/workflow/process_episode.bash +++ b/workflow/process_episode.bash @@ -73,7 +73,7 @@ function program_checks() { done } - is_installed audio2image.bash awk base64 cat csvtojson curl date detox eval extract_images ffprobe file find grep grep head jq kate magick mediainfo mv realpath rsync seamonkey sed sed sort sponge ssh touch touch wget + is_installed audio2image.bash awk base64 cat csvtojson curl date detox eval extract_images ffprobe file find grep head jq kate magick mediainfo mv realpath remove-image.pl rsync seamonkey sed sed sort sponge ssh touch touch wget for arg in $* do @@ -335,6 +335,12 @@ function extract_images_brute_force() { echo_debug "Extracting images with grep." + if [ -s "${shownotes_html%.*}_edited.html" ] + then + echo_debug "There is already an edited version of the shownotes at \"${shownotes_html%.*}_edited.html\", slipping image extraction." + return + fi + if [[ -z "${shownotes_html}" || ! -s "${shownotes_html}" ]] then echo_error "The shownotes_html file \"${shownotes_html}\" could not be found." 
@@ -344,49 +350,120 @@ function extract_images_brute_force() { sed "s#>#>\n#g" "${shownotes_html}" | sponge "${shownotes_html}" - image_count="1" # Extract embedded images + image_count_embedded="1" for image in $( grep --color=never --perl-regexp --only-matching 'data:image/[^;]*;base64,\K[a-zA-Z0-9+/=]*' "${shownotes_html}" ) do - this_image="${working_dir}/hpr${ep_num}_image_${image_count}" + this_image="${working_dir}/hpr${ep_num}_image_${image_count_embedded}" echo -n "$image" | base64 -di > ${this_image} this_ext="$( file --mime-type ${this_image} | awk -F '/' '{print $NF}' )" mv -v "${this_image}" "${this_image}.${this_ext}" this_width="$( mediainfo "${this_image}.${this_ext}" | grep Width | awk -F ': | pixels' '{print $2}' | sed 's/ //g' )" if [ "${this_width}" -gt "400" ] then + echo_debug "Generating thumbnail for embedded image \"${this_image}.${this_ext}\"." magick "${this_image}.${this_ext}" -resize 400x "${this_image}_tn.${this_ext}" fi - ((image_count=image_count+1)) + ((image_count_embedded=image_count_embedded+1)) done # Download referenced images + image_count_external="1" for image in $( grep --color=never --perl-regexp --only-matching '' "${shownotes_html}" | awk -F 'src=' '{print $2}' | awk -F '"' '{print $2}' ) do - this_image="${working_dir}/hpr${ep_num}_image_${image_count}" + this_image="${working_dir}/hpr${ep_num}_image_ext_${image_count_external}" wget "${image}" --output-document=${this_image} this_ext="$( file --mime-type ${this_image} | awk -F '/' '{print $NF}' )" - if [ ! -e "${this_image}.${this_ext}" ] - then - mv -v "${this_image%.*}" "${this_image}.${this_ext}" - fi + mv -v "${this_image%.*}" "${this_image}.${this_ext}" this_width="$( mediainfo "${this_image}.${this_ext}" | grep Width | awk -F ': | pixels' '{print $2}' | sed 's/ //g' )" if [ "${this_width}" -gt "400" ] then + echo_debug "Generating thumbnail for external image \"${this_image}.${this_ext}\"." 
magick "${this_image}.${this_ext}" -resize 400x "${this_image}_tn.${this_ext}" fi - ((image_count=image_count+1)) + ((image_count_external=image_count_external+1)) done - # TODO Link up image_count > TODO Link up image_count by looping `> "${shownotes_html}.embedded_images" + else + this_image="$( find "${working_dir}/" -type f -iname "hpr${ep_num}_image_${image_count_embedded}.*" )" + if [[ -z "${this_image}" || ! -s "${this_image}" ]] + then + echo_error "Unable to find an image for \"${image_count_embedded}\", \"${this_image}\"." + fi + this_image="$( basename "${this_image}" )" + this_image_tn="$( find "${working_dir}/" -type f -iname "${this_image%.*}_tn.*" )" + if [[ -z "${this_image_tn}" || ! -s "${this_image_tn}" ]] + then + echo "${this_line}" | sed "s@LOCAL_IMAGE_REMOVED@${this_image}@g" >> "${shownotes_html}.embedded_images" + else + this_image_tn="$( basename "${this_image_tn}" )" + echo "" >> "${shownotes_html}.embedded_images" + echo "${this_line}" | sed "s@LOCAL_IMAGE_REMOVED@${this_image_tn}@g" >> "${shownotes_html}.embedded_images" + echo "" >> "${shownotes_html}.embedded_images" + fi + ((image_count_embedded=image_count_embedded+1)) + fi + done + + mv -v "${shownotes_html}.embedded_images" "${shownotes_html}" + + else + echo_debug "No embedded images found. ${image_count_embedded}" + fi - - # Picture 1 shows the broken dog    walking accessory. + if [ "${image_count_external}" -gt "1" ] + then + image_count_external="1" + + touch "${shownotes_html}.external_images" + + cat "${shownotes_html}" | remove-image.pl | while read this_line + do + if [ "$( echo "${this_line}" | grep --count "REMOTE_IMAGE_REMOVED" )" -eq "0" ] + then + echo "${this_line}" >> "${shownotes_html}.external_images" + else + this_image="$( find "${working_dir}/" -type f -iname "hpr${ep_num}_image_ext_${image_count_external}.*" )" + if [[ -z "${this_image}" || ! -s "${this_image}" ]] + then + echo_error "Unable to find an image for \"${image_count_external}\", \"${this_image}\"." 
+ fi + this_image="$( basename "${this_image}" )" + this_image_tn="$( find "${working_dir}/" -type f -iname "${this_image%.*}_tn.*" )" + if [[ -z "${this_image_tn}" || ! -s "${this_image_tn}" ]] + then + echo "${this_line}" | sed "s@REMOTE_IMAGE_REMOVED@${this_image}@g" >> "${shownotes_html}.external_images" + else + this_image_tn="$( basename "${this_image_tn}" )" + echo "" >> "${shownotes_html}.external_images" + echo "${this_line}" | sed "s@REMOTE_IMAGE_REMOVED@${this_image_tn}@g" >> "${shownotes_html}.external_images" + echo "" >> "${shownotes_html}.external_images" + fi + ((image_count_external=image_count_external+1)) + fi + done + + mv -v "${shownotes_html}.external_images" "${shownotes_html}" + + else + echo_debug "No external images found." + fi ## TODO End Temp fix @@ -1095,7 +1172,7 @@ $(cat "${working_dir}/hpr${ep_num}.srt" )
-

Raw shownotes.html

+

Transcript File


@@ -1151,13 +1228,16 @@ function register_assets() {
   
   echo '"episode_id","filename","extension","size", "sha1sum", "mime_type", "file_type"' | tee "${working_dir}/hpr${ep_num}_assets.csv"
   
-  for this_asset in hpr${ep_num}.flac hpr${ep_num}.wav hpr${ep_num}.mp3 hpr${ep_num}.ogg hpr${ep_num}.opus hpr${ep_num}.srt hpr${ep_num}.txt hpr${ep_num}_image_*.*
+  for this_asset_filename in hpr${ep_num}.flac hpr${ep_num}.wav hpr${ep_num}.mp3 hpr${ep_num}.ogg hpr${ep_num}.opus hpr${ep_num}.srt hpr${ep_num}.txt $( find "${working_dir}/" -maxdepth 1 -type f -iname "hpr${ep_num}_image_*.*" )
   do
-    echo_debug "Registering \"${this_asset}\"."
-    if [[ ! -s "${working_dir}/${this_asset}" ]]
+    this_asset_filename="$( basename "${this_asset_filename}" )"
+    echo_debug "Registering \"${this_asset_filename}\"."
+    this_asset="${working_dir}/${this_asset_filename}"
+    
+    if [[ ! -s "${this_asset}" ]]
     then
-      echo_error "Failed to register missing file \"${working_dir}/${this_asset}\"."
-      ls -al "${working_dir}/${this_asset}"
+      echo_error "Failed to register missing file \"${this_asset}\"."
+      ls -al "${this_asset}"
     fi
     this_asset_basename=$( basename "${this_asset}" )
     this_asset_extension="${this_asset_basename##*.}"
@@ -1166,6 +1246,11 @@ function register_assets() {
     this_asset_mime_type=$( file --dereference --brief --mime "${this_asset}" )
     this_asset_file_type=$( file --dereference --brief "${this_asset}" )
     
+    if [ "$( echo ${this_asset_file_type} | wc --chars )" -gt "130" ]
+    then
+      this_asset_file_type="${this_asset_mime_type}"
+    fi
+    
     variables=( ep_num this_asset_basename this_asset_extension this_asset_size this_asset_sha1sum this_asset_mime_type this_asset_file_type working_dir ep_num )
 
     for variable in "${variables[@]}"
@@ -1191,13 +1276,52 @@ function register_assets() {
   then
     echo_error "The asset json file \"${working_dir}/hpr${ep_num}_assets.json\" is missing.";
   fi
-    
-  if [ "$( curl --silent --netrc-file $HOME/.netrc --write-out '%{http_code}' --request POST https://hub.hackerpublicradio.org/cms/assets.php --data-ascii @"${working_dir}/hpr${ep_num}_assets.json" --header "Content-Type: application/json" )" != 200 ]
+  
+  response="$( curl --silent --netrc-file $HOME/.netrc --write-out '%{http_code}' --output /dev/null --request POST https://hub.hackerpublicradio.org/cms/assets.php --data-ascii @"${working_dir}/hpr${ep_num}_assets.json" --header "Content-Type: application/json" )"
+  if [[ -z "${response}" || "${response}" != "200" ]]
   then
-    echo_error "The assets for episode hpr${ep_num} has not been registered."
+    echo_error "The assets for episode hpr${ep_num} has not been registered. The response was \"${response}\""
   fi
 }
 
+#################################################
+# Copy the derived files to the origin server and verify each copy by sha1sum
+
+function copy_files_to_origin_server() {
+  echo_debug "Copying the files to the origin server"
+  # Reads the globals working_dir and ep_num. NOTE(review): whether echo_error aborts is not visible here - confirm.
+  # TODO get an origin server capable of storing all the files
+  for this_asset in hpr${ep_num}.mp3 hpr${ep_num}.ogg hpr${ep_num}.opus hpr${ep_num}.srt hpr${ep_num}.txt $( find "${working_dir}/" -maxdepth 1 -type f -iname "hpr${ep_num}_image_*.*" )
+  do
+    this_asset="$( basename "${this_asset}" )"
+    this_file="${working_dir}/${this_asset}"
+    # Assets are resolved directly under working_dir, so the find above must not descend into subdirectories.
+    echo_debug "Copying \"${this_file}\" to the origin server."
+    
+    if [[ ! -s "${this_file}" ]]
+    then
+      echo_error "Failed to transfer missing file \"${this_file}\"."
+      ls -al "${this_file}"
+    fi
+    # Upload under hpr/eps/hpr<ep_num>/ using the same file name as the local copy.
+    rsync --archive --quiet --partial --progress "${this_file}" "rsync.net:hpr/eps/hpr${ep_num}/${this_asset}"
+    # The remote hash is taken as the last field of the flattened output of the remote "sha1" command.
+    origin_sha1sum="$( echo $( ssh rsync.net "sha1 hpr/eps/hpr${ep_num}/${this_asset}" 2> /dev/null ) | awk '{print $NF}' )"
+    this_asset_sha1sum="$( sha1sum "${this_file}" | awk '{print $1}' )"
+    
+    if [[ -z "${origin_sha1sum}" || -z "${this_asset_sha1sum}" ]]
+    then
+      echo_error "Could not determine the local/origin sha1sum for file \"${this_file}\"."
+    fi
+    # Compare the remote copy against the local file to confirm the transfer.
+    if [ "${origin_sha1sum}" != "${this_asset_sha1sum}" ]
+    then
+      echo_error "The local sha1sum \"${this_asset_sha1sum}\" and origin \"${origin_sha1sum}\" are mismatched for file \"${this_file}\"."
+    fi
+
+  done
+}
+
 #################################################
 # Send the derived files to the server borg to be sent to the Internet Archive
 
@@ -1288,27 +1412,19 @@ function copy_derived_files_to_borg_for_the_internet_archive() {
 # TODO Add support for community news - reusing ^^^
 # TODO Add support for stereo for some episodes that request it
 # TODO Include links in extract_images_brute_force
-# TODO run hpr_generator to genrate only the new episode
 # TODO take screenshots of the rendered episode on the hpr website
 # TODO audio_channels default to mono - stereo as an option
-# TODO check the channels on the source audio and add a warning in the report to check it's ok to reduce to mono
 # TODO Add chapter support 
 # TODO incorporate direct upload to the IA
-# TODO change MEDIA_TRANSCODED
-# TODO incorporate assets 
 # TODO copy the files to the backup disk
-# TODO copy the derived files to the ccdn origin server 
-# TODO fix permissions on vger(two)
-
-# TODO 
 
 program_checks              # We know that all the programs and variables are set
 
 get_working_dir $@          # We have a working directory and a valid json file
 
 get_episode_metadata $@     # We have all the metadata we need to process the show.
-
-extract_images_brute_force  # Extract_images by brute force
+ 
+extract_images_brute_force  # We have extracted the images by brute force
 
 media_checks                # 
 
@@ -1334,6 +1450,8 @@ manual_final_review
 
 register_assets
 
+copy_files_to_origin_server
+
 copy_derived_files_to_borg_for_the_internet_archive
 
 echo_debug "The End"