-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild
executable file
·248 lines (231 loc) · 11.4 KB
/
build
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/bin/bash
# Generates/updates a podcast and deploys it with github.io pages
# To clean out and start over: find ./data/ -type f ! -name '*.au' ! -name '*.mp3' -delete
# Install the command-line tools used below; if the install command fails,
# upgrade yt-dlp and ffmpeg instead. All brew output is discarded.
echo "Loading dependencies..."
if ! brew install yt-dlp ffmpeg gsed jq > /dev/null 2>&1
then
  brew upgrade yt-dlp ffmpeg > /dev/null 2>&1
fi
# brew may lag behind the latest yt-dlp release, so pip gets a chance to upgrade it too.
pip install --upgrade yt-dlp > /dev/null 2>&1

# Working directories for the downloaded files.
mkdir -p data data/shorts

# The playlist is fixed; uncomment the prompt to ask for one instead.
# read -p "Enter the YouTube playlist URL: " playlist
playlist="https://www.youtube.com/playlist?list=PLNQNP1byj5Eik62Ksz2dCmeWlaByay4dB"

# Fetch audio, auto-generated English subtitles and metadata for the playlist.
ytdlp_args=(
  --format 140
  # Files are named <upload date>-<video id>.<ext> inside ./data
  --output 'data/%(upload_date>%Y-%m-%d)s-%(id)s.%(ext)s'
  # The archive records what has already been downloaded.
  --download-archive ./data/archive.txt
  --ignore-errors --write-info-json
  --user-agent "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
  --cookies ./all_cookies.txt
  --write-auto-subs --sub-langs en --sub-format vtt
  # Stop at the first already-downloaded entry, assuming we are caught up.
  --break-on-existing
)
yt-dlp "${ytdlp_args[@]}" "$playlist"
echo "Checking for m4a files that have not been trimmed"
# A download longer than 3300 seconds (55 minutes) is treated as untrimmed: it is removed
# together with its derived files and its download-archive entry so that the next run fetches
# it again, and the script stops so the video can be looked at.
number_regex='^[0-9]+(\.[0-9]+)?$'
m4a_name_regex='^[0-9]{4}-[0-9]{2}-[0-9]{2}-([a-zA-Z0-9_-]+)$'
# Read the list into an array first so that paths are never word-split.
m4a_files=()
while IFS= read -r m4a_path
do
  m4a_files+=("$m4a_path")
done < <(find ./data -maxdepth 1 -type f -name "*.m4a" | sort -r)
for file in "${m4a_files[@]}"
do
  duration=$(ffprobe -i "$file" -show_entries format=duration -v quiet -of csv="p=0")
  # ffprobe prints nothing usable for unreadable files; without this guard the numeric
  # comparison below would fail with a shell error instead of a clear message.
  if ! [[ "$duration" =~ $number_regex ]]
  then
    echo "Could not read the duration of $file; skipping the length check." >&2
    continue
  fi
  if [ "$(echo "$duration > 3300" | bc)" -eq 1 ]
  then
    # Path without the extension, shared by every file derived from this download.
    base=${file%.m4a}
    name=${base##*/}
    # The video id is whatever follows the YYYY-MM-DD- prefix of the file name.
    video_id=""
    if [[ "$name" =~ $m4a_name_regex ]]
    then
      video_id="${BASH_REMATCH[1]}"
    fi
    echo "The audio file $file is long ($duration seconds). The youtube video at url: https://www.youtube.com/watch?v=$video_id may need trimming."
    # Delete the m4a, mp3, info.json, en.vtt, en.txt and .au files for this video.
    rm -f -- "$file" "$base.mp3" "$base.info.json" "$base.en.vtt" "$base.en.txt" "$base.au"
    # Remove the entry from ./data/archive.txt so that it will be re-downloaded.
    # Skipped when no id could be parsed, because an arbitrary string is not a safe sed pattern.
    if [ -n "$video_id" ] && [ -f ./data/archive.txt ]
    then
      gsed -i "/$video_id/d" ./data/archive.txt
    fi
    exit 1
  fi
done
# Remove duplicate downloads of the same video for one file extension.
#
# Files in ./data are expected to be named YYYY-MM-DD-<video id>.<extension>. When several
# files share a video id, the one with the oldest date is kept and the others are deleted.
# Files that do not follow the naming scheme are never touched.
#
# Arguments: $1 - extension without the leading dot; it may contain dots (e.g. "info.json")
# Outputs:   progress messages on stdout
function remove-duplicates-by-extension {
  local EXTENSION="$1"
  local file newfile name video_id file_date newfile_date
  # Captures year, month, day and video id from a file name with its extension stripped.
  local name_regex='^([0-9]{4})-([0-9]{2})-([0-9]{2})-([a-zA-Z0-9_-]+)$'
  echo "Checking for duplicate $EXTENSION files..."
  while IFS= read -r file
  do
    # Strip the directory and the extension as literal text, so that dots in
    # multi-part extensions need no regex escaping.
    name=${file##*/}
    name=${name%".$EXTENSION"}
    if ! [[ "$name" =~ $name_regex ]]
    then
      continue
    fi
    file_date="${BASH_REMATCH[1]}${BASH_REMATCH[2]}${BASH_REMATCH[3]}"
    video_id="${BASH_REMATCH[4]}"
    # echo "Checking for duplicates of $video_id..."
    while IFS= read -r newfile
    do
      # Make sure it's not the current file
      if [ "$file" == "$newfile" ]
      then
        continue
      fi
      # Only a dated file with exactly the same video id counts as a duplicate.
      name=${newfile##*/}
      name=${name%".$EXTENSION"}
      if ! [[ "$name" =~ $name_regex ]] || [ "${BASH_REMATCH[4]}" != "$video_id" ]
      then
        continue
      fi
      # We want to keep the oldest file, so only look for files newer than the current file.
      newfile_date="${BASH_REMATCH[1]}${BASH_REMATCH[2]}${BASH_REMATCH[3]}"
      if [ "$newfile_date" -lt "$file_date" ]
      then
        continue
      fi
      echo "Removing $newfile as a duplicate of $file."
      rm -f -- "$newfile"
    done < <(find ./data -maxdepth 1 -type f -name "*-$video_id.$EXTENSION" | sort -r)
  done < <(find ./data -maxdepth 1 -type f -name "*.$EXTENSION" | sort)
}
# Clean up duplicates left behind by earlier downloads, one file type at a time.
# Probably not needed on every run, but it repairs a data directory where the same
# video was inadvertently downloaded more than once.
for extension in au mp3 m4a en.vtt en.txt info.json
do
  remove-duplicates-by-extension "$extension"
done
echo "Generating minimalist transcriptions"

# Print the plain text of the WebVTT caption file $1 on stdout.
#
# In one pass over the file this:
#   - drops the header lines "WEBVTT", "Kind: captions" and "Language: en";
#   - blanks cue timing lines such as "00:00:00.000 --> 00:00:02.000 ...";
#   - strips <...> tags;
#   - strips bracketed cues such as [Music], one bracket pair at a time so that
#     speech between two cues on the same line is kept;
#   - drops lines that are empty or whitespace only.
# Repeated contiguous lines are then collapsed; repeated lines that are not contiguous are kept.
function vtt-to-text {
  gsed \
    -e '/^WEBVTT$/d' \
    -e '/^Kind: captions$/d' \
    -e '/^Language: en$/d' \
    -e 's/[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}\.[0-9]\{3\} --> [0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}\.[0-9]\{3\}.*//g' \
    -e 's/<[^>]*>//g' \
    -e 's/\[[^]]*\]//g' \
    -e '/^[[:space:]]*$/d' \
    "$1" | uniq
}

# Write <name>.txt next to every <name>.vtt that does not have one yet.
while IFS= read -r file
do
  newfile="${file%.vtt}.txt"
  if [ -f "$newfile" ]
  then
    continue
  fi
  vtt-to-text "$file" > "$newfile"
done < <(find ./data -maxdepth 1 -type f -name "*.vtt" | sort -r)
# Build podcast.xml (the RSS feed) and index.html (a plain episode list) from the
# ./data/*.info.json metadata files, newest file name first.
echo "Generating RSS feed and index.html..."

# Escape the five XML special characters in the text read from stdin.
# "&" is replaced first so that the entities written by the later rules are not escaped again.
function xml-escape {
  gsed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g' -e "s/'/\&apos;/g"
}

cat > index.html <<'EOF'
<html><head><title>Creekside Church of Christ podcast</title></head><body>
<h1>Creekside Church of Christ podcast</h1>
<ul>
EOF

# Channel-level metadata. The "&" in the category name must be written as an entity,
# otherwise the feed is not well-formed XML.
# NOTE(review): the itunes:email value below looks like an obfuscated placeholder - confirm
# the real address. A previously used alternative was kept commented out:
#echo ' <itunes:email>[email protected]</itunes:email>' >> podcast.xml
cat > podcast.xml <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" version="2.0">
 <channel>
 <title>Creekside</title>
 <link>https://creekside.life</link>
 <language>en-us</language>
 <itunes:subtitle>Weekly sermons and other content from Creekside Church of Christ in Valrico, Florida</itunes:subtitle>
 <itunes:author>Church of Christ in Valrico FL</itunes:author>
 <itunes:summary>Weekly sermons and other content from Creekside Church of Christ in Valrico, Florida</itunes:summary>
 <description>Weekly sermons and other content from Creekside Church of Christ in Valrico, Florida</description>
 <itunes:owner>
 <itunes:name>Church of Christ in Valrico FL</itunes:name>
 <itunes:email>[email protected]</itunes:email>
 </itunes:owner>
 <itunes:image href="https://creekside-life.github.io/podcast/cover-square-lg.jpg" />
 <itunes:category text="Religion &amp; Spirituality">
 <itunes:category text="Christianity" />
 </itunes:category>
 <itunes:explicit>no</itunes:explicit>
 <itunes:type>episodic</itunes:type>
 <itunes:complete>no</itunes:complete>
 <itunes:new-feed-url>https://creekside-life.github.io/podcast/podcast.xml</itunes:new-feed-url>
EOF

# Read the list into an array first: paths are never word-split, and commands run inside the
# loop cannot consume the list from stdin. The leading "./" is dropped because the
# paths are appended to the site URL below.
info_files=()
while IFS= read -r info_path
do
  info_files+=("${info_path#./}")
done < <(find ./data -maxdepth 1 -type f -name "*.info.json" | sort -r)

number_regex='^[0-9]+(\.[0-9]+)?$'
for file in "${info_files[@]}"
do
  # Path without ".info.json"; every other file of this episode shares it.
  base="${file%.info.json}"
  # The raw title goes to auphonic.sh, the escaped one into the XML and HTML output.
  title=$(jq -r '.title' "$file" | gsed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
  title_xml=$(printf '%s\n' "$title" | xml-escape)
  thumbnail=$(jq -r '.thumbnail' "$file" | xml-escape)
  transcript="$base.en.txt"
  # Prefer upload_date and fall back to release_date; skip entries that have neither.
  raw_date=$(jq -r '.upload_date' "$file")
  if [[ -z "$raw_date" || "$raw_date" == "null" ]]
  then
    raw_date=$(jq -r '.release_date' "$file")
    if [[ -z "$raw_date" || "$raw_date" == "null" ]]
    then
      continue
    fi
  fi
  md_date=$(date -j -f "%Y%m%d" "$raw_date" "+%Y-%m-%d")
  pub_date=$(date -j -f "%Y%m%d" "$raw_date" "+%a, %d %b %Y %T %z")
  m4a_file="$base.m4a"
  mp3_file="$base.mp3"
  # NOTE(review): auphonic.sh is not visible from here; presumably it produces the mp3 and the
  # .au id file read below - confirm.
  bash auphonic.sh "$m4a_file" "$title" "$mp3_file"
  # Without the mp3 the enclosure would have no length and a dead URL, so leave the episode out.
  if [ ! -f "$mp3_file" ]
  then
    echo "The audio file $mp3_file does not exist; skipping $file." >&2
    continue
  fi
  length=$(stat -f%z "$mp3_file")
  duration=$(ffprobe -i "$mp3_file" -show_entries format=duration -v quiet -of csv="p=0")
  video_id=$(jq -r '.id' "$file")
  # Stop when an episode is longer than 3300 seconds (55 minutes), for debugging.
  if [[ "$duration" =~ $number_regex ]] && [ "$(echo "$duration > 3300" | bc)" -eq 1 ]
  then
    echo "The audio file $mp3_file is long ($duration seconds). The youtube video at url: https://www.youtube.com/watch?v=$video_id may be too long."
    exit 1
  fi
  # Remove the following lines from the description, then escape what is left:
  # 813-685-0750
  # 2908 Bell Shoals Rd
  # Brandon, FL 33511
  # http://creekside.life
  # https://creekside.life
  # Creekside Church of Christ
  # 3949 Lithia Pinecrest Rd, Valrico, FL 33596
  description=$(jq -r '.description' "$file" \
    | gsed -e '/Originally aired:/d' -e '/813-685-0750/d' -e '/2908 Bell Shoals Rd/d' \
      -e '/Brandon, FL 33511/d' -e '/http:\/\/creekside.life/d' -e '/https:\/\/creekside.life/d' \
      -e '/Creekside Church of Christ/d' -e '/3949 Lithia Pinecrest Rd, Valrico, FL 33596/d' \
      -e '/^[[:space:]]*$/d' \
    | xml-escape)
  au_id_file="$base.au"
  video_url="https://www.youtube.com/watch?v=$video_id"
  # The content of the .au file is used as the episode guid.
  au_id=$(cat "$au_id_file")
  {
    echo " <item>"
    echo " <title>$title_xml</title>"
    echo " <description>$description</description>"
    echo " <pubDate>$pub_date</pubDate>"
    echo " <enclosure url=\"https://creekside-life.github.io/podcast/$mp3_file\" length=\"$length\" type=\"audio/mpeg\" />"
    echo " <guid>$au_id</guid>"
    echo " <itunes:image href=\"$thumbnail\" />"
    echo " <itunes:author>Creekside Church of Christ</itunes:author>"
    echo " <itunes:summary>$description</itunes:summary>"
    echo " <itunes:duration>$duration</itunes:duration>"
    echo " <itunes:transcript src=\"https://creekside-life.github.io/podcast/$transcript\" />"
    echo " <link>$video_url</link>"
    echo " </item>"
  } >> podcast.xml
  echo "<li><a href=\"https://creekside-life.github.io/podcast/$mp3_file\">$title_xml</a> $md_date <a href=\"$transcript\">txt</a> <a href=\"$video_url\">vid</a></li>" >> index.html
done
echo ' </channel>' >> podcast.xml
echo '</rss>' >> podcast.xml
echo '</ul><br/><a href="podcast.xml">RSS feed</a></body></html>' >> index.html
# Deploy the podcast to the website
# For a large initial push:
# with git remove all history, so that this is the first commit, use the main branch.
#rm -rf .git
#git init
#git config --global init.defaultBranch main
#git branch -M main
#git remote add origin https://github.com/creekside-life/podcast.git
#git add build podcast.xml auphonic.sh speed_up.sh
#git commit -m "$(date -u "+%Y-%m-%d %H:%M:%S")"
#git push origin main --force
# Push one mp3 at a time to avoid disconnects.
#for file in $(find data -maxdepth 1 -type f -name "*.mp3" | sort -r)
#do
# git add "$file"
# git commit -m "$(date -u "+%Y-%m-%d %H:%M:%S")"
# git push origin main
#done
# For subsequent pushes:
# Let git buffer up to 524288000 bytes (500 MiB) per HTTP request - presumably so that pushes
# containing large audio files are not cut off; confirm.
git config http.postBuffer 524288000
# NOTE(review): "git add ." stages everything in the working tree. Unless .gitignore excludes
# it, that includes ./all_cookies.txt, which the download step reads - confirm.
git add .
# The commit message is the current UTC timestamp.
commit_message="$(date -u "+%Y-%m-%d %H:%M:%S")"
git commit -m "$commit_message"
git push origin main