1
0
forked from HPR/hpr-tools

The show processing needs to be refactored #5

This commit is contained in:
Ken Fallon 2025-01-16 22:02:43 +01:00
parent e1df438111
commit 5cfdd42b11

View File

@ -73,7 +73,7 @@ function program_checks() {
done
}
is_installed audio2image.bash awk base64 cat csvtojson curl date detox eval extract_images ffprobe file find grep grep head jq kate magick mediainfo mv realpath rsync seamonkey sed sed sort sponge ssh touch touch wget
is_installed audio2image.bash awk base64 cat csvtojson curl date detox eval extract_images ffprobe file find grep head jq kate magick mediainfo mv realpath remove-image.pl rsync seamonkey sed sed sort sponge ssh touch touch wget
for arg in $*
do
@ -335,6 +335,12 @@ function extract_images_brute_force() {
echo_debug "Extracting images with grep."
if [ -s "${shownotes_html%.*}_edited.html" ]
then
echo_debug "There is already an edited version of the shownotes at \"${shownotes_html%.*}_edited.html\", slipping image extraction."
return
fi
if [[ -z "${shownotes_html}" || ! -s "${shownotes_html}" ]]
then
echo_error "The shownotes_html file \"${shownotes_html}\" could not be found."
@ -344,49 +350,120 @@ function extract_images_brute_force() {
sed "s#>#>\n#g" "${shownotes_html}" | sponge "${shownotes_html}"
image_count="1"
# Extract embedded images
image_count_embedded="1"
for image in $( grep --color=never --perl-regexp --only-matching 'data:image/[^;]*;base64,\K[a-zA-Z0-9+/=]*' "${shownotes_html}" )
do
this_image="${working_dir}/hpr${ep_num}_image_${image_count}"
this_image="${working_dir}/hpr${ep_num}_image_${image_count_embedded}"
echo -n "$image" | base64 -di > ${this_image}
this_ext="$( file --mime-type ${this_image} | awk -F '/' '{print $NF}' )"
mv -v "${this_image}" "${this_image}.${this_ext}"
this_width="$( mediainfo "${this_image}.${this_ext}" | grep Width | awk -F ': | pixels' '{print $2}' | sed 's/ //g' )"
if [ "${this_width}" -gt "400" ]
then
echo_debug "Generating thumbnail for embedded image \"${this_image}.${this_ext}\"."
magick "${this_image}.${this_ext}" -resize 400x "${this_image}_tn.${this_ext}"
fi
((image_count=image_count+1))
((image_count_embedded=image_count_embedded+1))
done
# Download referenced images
image_count_external="1"
for image in $( grep --color=never --perl-regexp --only-matching '<img.*src.*http.*>' "${shownotes_html}" | awk -F 'src=' '{print $2}' | awk -F '"' '{print $2}' )
do
this_image="${working_dir}/hpr${ep_num}_image_${image_count}"
this_image="${working_dir}/hpr${ep_num}_image_ext_${image_count_external}"
wget "${image}" --output-document=${this_image}
this_ext="$( file --mime-type ${this_image} | awk -F '/' '{print $NF}' )"
if [ ! -e "${this_image}.${this_ext}" ]
then
mv -v "${this_image%.*}" "${this_image}.${this_ext}"
fi
mv -v "${this_image%.*}" "${this_image}.${this_ext}"
this_width="$( mediainfo "${this_image}.${this_ext}" | grep Width | awk -F ': | pixels' '{print $2}' | sed 's/ //g' )"
if [ "${this_width}" -gt "400" ]
then
echo_debug "Generating thumbnail for external image \"${this_image}.${this_ext}\"."
magick "${this_image}.${this_ext}" -resize 400x "${this_image}_tn.${this_ext}"
fi
((image_count=image_count+1))
((image_count_external=image_count_external+1))
done
# TODO Link up image_count > TODO Link up image_count by looping `<img` tags with images found on the disk.
cat "${shownotes_html}" | remove-image.pl | sponge "${shownotes_html}"
#grep --color=never --perl-regexp --invert-match 'data:image/[^;]*;base64,\K[a-zA-Z0-9+/=]*' "${shownotes_html}"
if [ "${image_count_embedded}" -gt "1" ]
then
image_count_embedded="1"
touch "${shownotes_html}.embedded_images"
cat "${shownotes_html}" | while read this_line
do
if [ "$( echo "${this_line}" | grep --count "LOCAL_IMAGE_REMOVED" )" -eq "0" ]
then
echo "${this_line}" >> "${shownotes_html}.embedded_images"
else
this_image="$( find "${working_dir}/" -type f -iname "hpr${ep_num}_image_${image_count_embedded}.*" )"
if [[ -z "${this_image}" || ! -s "${this_image}" ]]
then
echo_error "Unable to find an image for \"${image_count_embedded}\", \"${this_image}\"."
fi
this_image="$( basename "${this_image}" )"
this_image_tn="$( find "${working_dir}/" -type f -iname "${this_image%.*}_tn.*" )"
if [[ -z "${this_image_tn}" || ! -s "${this_image_tn}" ]]
then
echo "${this_line}" | sed "s@LOCAL_IMAGE_REMOVED@${this_image}@g" >> "${shownotes_html}.embedded_images"
else
this_image_tn="$( basename "${this_image_tn}" )"
echo "<a href=\"${this_image}\">" >> "${shownotes_html}.embedded_images"
echo "${this_line}" | sed "s@LOCAL_IMAGE_REMOVED@${this_image_tn}@g" >> "${shownotes_html}.embedded_images"
echo "</a>" >> "${shownotes_html}.embedded_images"
fi
((image_count_embedded=image_count_embedded+1))
fi
done
mv -v "${shownotes_html}.embedded_images" "${shownotes_html}"
else
echo_debug "No embedded images found. ${image_count_embedded}"
fi
# <img alt="Picture 1 shows the broken dog walking accessory." border="0" height="300" src="hpr4283_image_1_tn.jpeg" width="400" />
if [ "${image_count_external}" -gt "1" ]
then
image_count_external="1"
touch "${shownotes_html}.external_images"
cat "${shownotes_html}" | remove-image.pl | while read this_line
do
if [ "$( echo "${this_line}" | grep --count "REMOTE_IMAGE_REMOVED" )" -eq "0" ]
then
echo "${this_line}" >> "${shownotes_html}.external_images"
else
this_image="$( find "${working_dir}/" -type f -iname "hpr${ep_num}_image_ext_${image_count_external}.*" )"
if [[ -z "${this_image}" || ! -s "${this_image}" ]]
then
echo_error "Unable to find an image for \"${image_count_external}\", \"${this_image}\"."
fi
this_image="$( basename "${this_image}" )"
this_image_tn="$( find "${working_dir}/" -type f -iname "${this_image%.*}_tn.*" )"
if [[ -z "${this_image_tn}" || ! -s "${this_image_tn}" ]]
then
echo "${this_line}" | sed "s@REMOTE_IMAGE_REMOVED@${this_image}@g" >> "${shownotes_html}.external_images"
else
this_image_tn="$( basename "${this_image_tn}" )"
echo "<a href=\"${this_image}\">" >> "${shownotes_html}.external_images"
echo "${this_line}" | sed "s@REMOTE_IMAGE_REMOVED@${this_image_tn}@g" >> "${shownotes_html}.external_images"
echo "</a>" >> "${shownotes_html}.external_images"
fi
((image_count_external=image_count_external+1))
fi
done
mv -v "${shownotes_html}.external_images" "${shownotes_html}"
else
echo_debug "No external images found."
fi
## TODO End Temp fix
@ -1095,7 +1172,7 @@ $(cat "${working_dir}/hpr${ep_num}.srt" )
</pre>
<hr />
<h3>Raw shownotes.html</h3>
<h3>Transcript File</h3>
<hr />
<pre>
@ -1151,13 +1228,16 @@ function register_assets() {
echo '"episode_id","filename","extension","size", "sha1sum", "mime_type", "file_type"' | tee "${working_dir}/hpr${ep_num}_assets.csv"
for this_asset in hpr${ep_num}.flac hpr${ep_num}.wav hpr${ep_num}.mp3 hpr${ep_num}.ogg hpr${ep_num}.opus hpr${ep_num}.srt hpr${ep_num}.txt hpr${ep_num}_image_*.*
for this_asset_filename in hpr${ep_num}.flac hpr${ep_num}.wav hpr${ep_num}.mp3 hpr${ep_num}.ogg hpr${ep_num}.opus hpr${ep_num}.srt hpr${ep_num}.txt $( find "${working_dir}/" -maxdepth 1 -type f -iname "hpr${ep_num}_image_*.*" )
do
echo_debug "Registering \"${this_asset}\"."
if [[ ! -s "${working_dir}/${this_asset}" ]]
this_asset_filename="$( basename "${this_asset_filename}" )"
echo_debug "Registering \"${this_asset_filename}\"."
this_asset="${working_dir}/${this_asset_filename}"
if [[ ! -s "${this_asset}" ]]
then
echo_error "Failed to register missing file \"${working_dir}/${this_asset}\"."
ls -al "${working_dir}/${this_asset}"
echo_error "Failed to register missing file \"${this_asset}\"."
ls -al "${this_asset}"
fi
this_asset_basename=$( basename "${this_asset}" )
this_asset_extension="${this_asset_basename##*.}"
@ -1166,6 +1246,11 @@ function register_assets() {
this_asset_mime_type=$( file --dereference --brief --mime "${this_asset}" )
this_asset_file_type=$( file --dereference --brief "${this_asset}" )
if [ "$( echo ${this_asset_file_type} | wc --chars )" -gt "130" ]
then
this_asset_file_type="${this_asset_mime_type}"
fi
variables=( ep_num this_asset_basename this_asset_extension this_asset_size this_asset_sha1sum this_asset_mime_type this_asset_file_type working_dir ep_num )
for variable in "${variables[@]}"
@ -1191,13 +1276,52 @@ function register_assets() {
then
echo_error "The asset json file \"${working_dir}/hpr${ep_num}_assets.json\" is missing.";
fi
if [ "$( curl --silent --netrc-file $HOME/.netrc --write-out '%{http_code}' --request POST https://hub.hackerpublicradio.org/cms/assets.php --data-ascii @"${working_dir}/hpr${ep_num}_assets.json" --header "Content-Type: application/json" )" != 200 ]
response="$( curl --silent --netrc-file $HOME/.netrc --write-out '%{http_code}' --output /dev/null --request POST https://hub.hackerpublicradio.org/cms/assets.php --data-ascii @"${working_dir}/hpr${ep_num}_assets.json" --header "Content-Type: application/json" )"
if [[ -z "${response}" || "${response}" != "200" ]]
then
echo_error "The assets for episode hpr${ep_num} has not been registered."
echo_error "The assets for episode hpr${ep_num} has not been registered. The response was \"${response}\""
fi
}
#################################################
# Copy the files to the origin server and verify their sha1sums
function copy_files_to_origin_server() {
  # Copy the episode's mp3/ogg/opus/srt/txt files and any extracted images
  # from the working directory to the origin server, then verify every
  # transfer by comparing the local sha1sum with the one reported remotely.
  #
  # Globals read: ep_num, working_dir
  # Uses the echo_debug / echo_error helpers defined elsewhere in this script.
  # Arguments: none
  local this_asset this_file found_image
  local origin_sha1sum this_asset_sha1sum
  local -a assets

  echo_debug "Copying the files to the origin server"

  # TODO get an origin server capable of storing all the files
  assets=(
    "hpr${ep_num}.mp3"
    "hpr${ep_num}.ogg"
    "hpr${ep_num}.opus"
    "hpr${ep_num}.srt"
    "hpr${ep_num}.txt"
  )

  # Collect the images NUL-delimited so unusual file names cannot be split
  # or glob-expanded. Only the top level of the working directory is
  # searched, because the local path below is rebuilt from
  # "${working_dir}/<basename>" and a nested file could never be found there.
  while IFS= read -r -d '' found_image
  do
    assets+=( "${found_image}" )
  done < <( find "${working_dir}/" -maxdepth 1 -type f -iname "hpr${ep_num}_image_*.*" -print0 )

  for this_asset in "${assets[@]}"
  do
    # Reduce to the bare file name; it is used for both the local path and
    # the remote destination.
    this_asset="${this_asset##*/}"
    this_file="${working_dir}/${this_asset}"
    echo_debug "Copying \"${this_file}\" to the origin server."

    if [[ ! -s "${this_file}" ]]
    then
      echo_error "Failed to transfer missing file \"${this_file}\"."
      ls -al "${this_file}"
    fi

    rsync --archive --quiet --partial --progress "${this_file}" "rsync.net:hpr/eps/hpr${ep_num}/${this_asset}"

    # The hash is taken as the last whitespace-separated field of the last
    # non-empty line printed by the remote "sha1" command.
    origin_sha1sum="$( ssh rsync.net "sha1 hpr/eps/hpr${ep_num}/${this_asset}" 2> /dev/null | awk 'NF { last = $NF } END { print last }' )"
    this_asset_sha1sum="$( sha1sum "${this_file}" | awk '{print $1}' )"

    if [[ -z "${origin_sha1sum}" || -z "${this_asset_sha1sum}" ]]
    then
      echo_error "Could not determine the local/origin sha1sum for file \"${this_file}\"."
    fi

    if [[ "${origin_sha1sum}" != "${this_asset_sha1sum}" ]]
    then
      # Report each hash under its correct label (local first, origin second).
      echo_error "The local sha1sum \"${this_asset_sha1sum}\" and origin \"${origin_sha1sum}\" are mismatched for file \"${this_file}\"."
    fi
  done
}
#################################################
# Send the derived files to the server borg to be sent to the Internet Archive
@ -1288,27 +1412,19 @@ function copy_derived_files_to_borg_for_the_internet_archive() {
# TODO Add support for community news - reusing ^^^
# TODO Add support for stereo for some episodes that request it
# TODO Include links in extract_images_brute_force
# TODO run hpr_generator to generate only the new episode
# TODO take screenshots of the rendered episode on the hpr website
# TODO audio_channels default to mono - stereo as an option
# TODO check the channels on the source audio and add a warning in the report to check it's ok to reduce to mono
# TODO Add chapter support
# TODO incorporate direct upload to the IA
# TODO change MEDIA_TRANSCODED
# TODO incorporate assets
# TODO copy the files to the backup disk
# TODO copy the derived files to the ccdn origin server
# TODO fix permissions on vger(two)
# TODO
program_checks # We know that all the programs and variables are set
get_working_dir $@ # We have a working directory and a valid json file
get_episode_metadata $@ # We have all the metadata we need to process the show.
extract_images_brute_force # Extract_images by brute force
extract_images_brute_force # We have extracted the images by brute force
media_checks #
@ -1334,6 +1450,8 @@ manual_final_review
register_assets
copy_files_to_origin_server
copy_derived_files_to_borg_for_the_internet_archive
echo_debug "The End"