-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubTranslate.py
310 lines (290 loc) · 13.8 KB
/
subTranslate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# Subtitle Translation
# requires googletrans and pysub-parser
# for language codes, please view https://py-googletrans.readthedocs.io/en/latest/#googletrans-languages1
# run "sudo pip3 install pysub-parser googletrans==4.0.0rc1" before starting this script
# due to some weird bugs in the googletrans stable build, the release candidate build is more stable than the stable build itself
# subTranslate
# how it works:
# method 1 (default): parse subtitle --> send to google to translate --> generate a new subtitle file
# method 2: parse subtitle --> send to google to translate --> duplicate original subtitle file --> find and replace text
#
# how "parse subtitle" works:
# this script uses pysubparser as an easy solution for reading subtitles. The script then uses the parsed start and end timings to generate a new subtitle file
#
# how "send to google to translate" works:
# this script uses the google translate API for translation. It is common for google to block connections to the API that look suspicious.
# as a counter, this script sends short bursts of requests every few seconds, which in theory looks less like a DDoS attack to google
# however, this makes the script a lot slower (up to 300% slower), but keeps it working in the long run
# if you choose to risk getting an IP ban from google, you may set "segments = 0" down below, which will send all the lines to google quickly without mercy
#
# how preserveOriginalCodec works:
# based on your input subtitle, the script will generate a new file with the same codec as the input subtitle. however, special edits done to the subtitle will be gone.
# to preserve the original edits, set preserveOriginalEdits = 1, which will use method 2 instead
import platform
import os
from os import system, name # clearScreen
from pathlib import Path
from os import listdir
from os.path import isfile, join
import subprocess
import shutil
import re
import datetime
import time
import sys
import glob
import re
import random
import string
import shutil
from pysubparser import parser
from googletrans import Translator
arrayOfExtentions = [".ass", ".srt"]  # which file extensions to search for
identifyingCharacteristic = ".english.default"  # define what the input should have in its name. Leave empty for none
extraIdentifyingCharacteristic = ".translated"  # creates a marking characteristic in the output file name. Leave empty for none
cwd = os.getcwd()  # sets current location. Specify a specific location if you like
language = "zh-cn"
sleep_duration = 2  # sleeps between each interval to avoid being banned. Recommended = 10
illegalList = ["-", ">", ":", " "]  # lines consisting only of one of these entries are blanked out. Only used when preserveOriginalEdits is enabled
preserveOriginalEdits = 0  # by default, this script will write a clean new srt file. But if you want the script to use the original file as a base, then this script will try to preserve it.
# but this function has an issue where it will skip some lines due to how python works. It is recommended that you leave this at 0
#
preserveOriginalCodec = 1  # by default, this value will be 1. if the subtitle codec is supported, this script will attempt to create a new subtitle with the original codec.
# if disabled, the script will always generate a srt file regardless of the input.
# but please note that this function only matters if preserveOriginalEdits is disabled
#
# segments ==
# segments separates the subtitles into smaller chunks, which enables the script to delay between chunks to avoid an IP ban
# so if you set segments = 20, the script will send 20 requests to the google API in rapid succession.
# and so if you disable segments, the script will send all of the requests immediately without waiting
# specifying a higher value means a higher risk of a ban
# if you meet any issues, you can try specifying a smaller chunk
# set to 0 to disable segments [not recommended]
# recommended = 5
segments = 20
# Bare numbers "0".."99" are also treated as illegal lines. They are stored as
# strings because they are compared against subtitle text; the int values
# appended previously could never compare equal to a str.
illegalList.extend(str(number) for number in range(100))
def findFiles(arrayOfNames, workingDir):
    """Recursively collect files under *workingDir* whose name contains an extension.

    Args:
        arrayOfNames: iterable of extension strings (e.g. ``[".ass", ".srt"]``).
            A file matches when the extension occurs anywhere in its file name.
        workingDir: directory that is walked recursively.

    Returns:
        A flat list of full paths, grouped by extension in the order given.
        A path that matches several extensions is listed only once.
    """
    found = []
    seen = set()
    for extension in arrayOfNames:
        # walk the directory that was passed in, not a module-level default
        for root, _dirs, names in os.walk(workingDir):
            for name in names:
                if extension not in name:
                    continue
                fullPath = os.path.join(root, name)
                if fullPath not in seen:  # avoid queuing the same file twice
                    seen.add(fullPath)
                    found.append(fullPath)
    return found
def combineArray(inputx):
    """Flatten a two-dimensional sequence into a one-dimensional list.

    Only one level of nesting is removed; the order of elements is kept.
    """
    return [element for group in inputx for element in group]
def timeConvert(seconds):
    """Format a duration in seconds as ``H:MM:SS``.

    The value wraps around at 24 hours, so 86400 seconds renders as ``0:00:00``.
    """
    withinDay = seconds % (24 * 3600)
    hours, leftover = divmod(withinDay, 3600)
    mins, secs = divmod(leftover, 60)
    return "%d:%02d:%02d" % (hours, mins, secs)
def parseSubtitles(inputx):
    """Parse the subtitle file at path *inputx* and return the parser's result.

    This is a thin wrapper around ``parser.parse``; whatever that call
    returns is handed back unchanged.
    """
    return parser.parse(inputx)
def rapidTranslate(inputx, languagey):
    """Translate every entry of *inputx* into *languagey*, one request each.

    Args:
        inputx: iterable of texts to translate.
        languagey: destination language code passed as ``dest``.

    Returns:
        A list with one ``Translator.translate`` result per input entry,
        in input order.
    """
    client = Translator()
    responses = []
    for text in inputx:
        response = client.translate(text, dest=languagey)
        responses.append(response)
        # short pause after each request (presumably to throttle the burst)
        time.sleep(0.2)
    return responses
def translateSubtitles(inputVariable, languagex):
    """Translate the text of every parsed subtitle and return the translations.

    The lines are sent in chunks of at most ``segments`` lines, pausing
    ``sleep_duration`` seconds between chunks. With ``segments`` set to 0
    everything is sent as one chunk. On ``OSError`` the whole translation is
    retried from scratch after five minutes, indefinitely.

    NOTE: this might not work since google might limit their API.

    Args:
        inputVariable: iterable of parsed subtitles; each item must expose
            a ``text`` attribute.
        languagex: destination language code.

    Returns:
        A list of translated strings, one per input subtitle, in order.
    """
    lines = [str(subtitle.text) for subtitle in inputVariable]
    print("attempting conversion, this might take awhile..")
    print("target language:", languagex)
    print("total lines:", len(lines))
    if segments > 0:
        # Slice into chunks of at most `segments` lines. Deriving a chunk
        # count from len(lines) // segments yields zero chunks (and therefore
        # an empty translation) whenever a file has fewer lines than `segments`.
        chunks = [lines[start:start + segments] for start in range(0, len(lines), segments)]
        print("total segments:", len(chunks))
        print("total lines in each segments:", segments)
        if segments > 50:
            print("[WARNING] segments is greater than 50, which risks the chances of an IP ban")
            print("[WARNING] you can ignore this warning if you know what you are doing")
    else:
        chunks = [lines]
        print("")
        print("[WARNING] script is attempting to send the full", len(lines), "lines to the google API")
        print("[WARNING] this is not recommended as it might create an issue with the chunk being too big to process")
        print("[WARNING] set segments to something higher than 0 to avoid this issue")
        print("[WARNING] this script might look unresponsive for a long time")
    if sleep_duration < 2:
        print("")
        print("[WARNING] sleep_duration is less than 2, which risks the chances of an IP ban")
        print("[WARNING] you can ignore this warning if you know what you are doing")
    elif segments > 0:
        print("sleep between translations:", sleep_duration, "seconds")
    while True:
        try:
            # reset per attempt so a retry starts with clean results and
            # the progress report does not run past 100 percent
            translated = []
            count = 0
            for chunk in chunks:
                timeStart = time.time()
                for result in rapidTranslate(chunk, languagex):
                    translated.append(str(result.text))
                count = count + 1
                timeTaken = time.time() - timeStart
                remaining = len(chunks) - count
                timeLeft = (remaining * timeTaken) + (remaining * sleep_duration)
                percentageCompleted = (count / len(chunks)) * 100
                if count == len(chunks):
                    print("progress:", int(percentageCompleted), "percent completed. Estimated time left to completion (hh:mm:ss):", timeConvert(int(timeLeft)))
                    print("translation process has been completed")
                else:
                    print("progress:", int(percentageCompleted), "percent completed. Estimated time left to completion (hh:mm:ss):", timeConvert(int(timeLeft)), end="\r")
                    # only pause between chunks; nothing follows the last one
                    time.sleep(sleep_duration)
            return translated
        except OSError as error:
            print(error)
            print("")
            print("[WARNING] API failure? Retrying in 5 minutes")
            if not segments > 0:
                print("[WARNING] detected that segments is not enabled. please consider enabling it instead and try again")
            else:
                print("[WARNING] script thinks that its a possible IP ban issue")
            print("[NOTE] ctrl + c again to exit")
            time.sleep(300)
            print("[RETRY] wait time passed. trying again now..")
def _replaceInLines(content, source, translation):
    """Strip every line of *content* and replace *source* with *translation*."""
    lines = content.split("\n")
    if lines and lines[-1] == "":
        lines.pop()  # a trailing newline does not start another line
    rebuilt = []
    for line in lines:
        line = line.strip()
        if source:  # str.replace("", x) would insert x between every character
            line = line.replace(source, translation)
        rebuilt.append(line + "\n")
    return "".join(rebuilt)


def copyAndEditSub(inputx, inputBase, inputVariable):
    """Copy the subtitle file and replace each original text with its translation.

    The copy keeps whatever else the original file contains, so existing
    edits are preserved. Every line of the copy is stripped of surrounding
    whitespace as a side effect of the replacement pass.

    Args:
        inputx: path of the original subtitle file.
        inputBase: iterable of parsed subtitles; each item must expose a
            ``text`` attribute.
        inputVariable: iterable of translated strings, parallel to *inputBase*.
            Surplus entries on either side are ignored.
    """
    # derive the output name from the path that was passed in
    final = inputx.replace(identifyingCharacteristic, "." + language + extraIdentifyingCharacteristic)
    shutil.copyfile(inputx, final)  # duplicate original file to a new file

    # entries are compared as text, so normalise them to strings once
    illegal = {str(entry) for entry in illegalList}
    pairs = []
    for subtitle, translation in zip(inputBase, inputVariable):
        # use the parsed text directly instead of stripping an "<index> > "
        # prefix from str(subtitle), which depends on the parser's numbering
        source = str(subtitle.text)
        translation = str(translation)
        if source in illegal:
            # blank out lines that consist only of an illegal entry
            source = ""
            translation = ""
        pairs.append((source, translation))
    for source, translation in pairs:
        print(source, " --> ", translation)

    # NOTE(review): assumes the subtitle file is UTF-8 encoded - confirm
    # against the encoding the parser reads with.
    with open(final, "r", encoding="utf-8") as reading_file:
        content = reading_file.read()
    # replacements are applied sequentially in memory; the file is read and
    # written once instead of once per subtitle
    for source, translation in pairs:
        content = _replaceInLines(content, source, translation)
    with open(final, "w", encoding="utf-8") as writing_file:
        writing_file.write(content)

    print("wrote translation into", final)
    print("please note that this method has an issue of being unable to properly replace lines with a line break.")
    print("this issue has been solved by actually generating a new srt file. however, this method will not preserve existing edits.")
def _formatTimestamp(value):
    """Render a subtitle timestamp as ``HH:MM:SS.mmm``."""
    try:
        millis = value.microsecond // 1000
        return "%02d:%02d:%02d.%03d" % (value.hour, value.minute, value.second, millis)
    except AttributeError:
        # unknown timestamp type: fall back to the legacy clean-up, which
        # drops the padding zeros from the string form
        return str(value).replace("000", "")


def writeNewSubtitle(inputx, inputBase, inputVariable):
    """Write the translations into a freshly generated subtitle file.

    An ``.ass`` input is written as a minimal ASS file when
    ``preserveOriginalCodec`` is 1; anything else is written as SRT and the
    output name gets an ``.srt`` suffix.

    Args:
        inputx: path of the original subtitle file.
        inputBase: iterable of parsed subtitles; ``start`` and ``end`` are
            read from each item.
        inputVariable: iterable of translated strings, parallel to *inputBase*.
            Surplus entries on either side are ignored.
    """
    print("writing new srt file")
    # derive the output name from the path that was passed in
    final = inputx.replace(identifyingCharacteristic, "." + language + extraIdentifyingCharacteristic)
    cues = [
        (_formatTimestamp(subtitle.start), _formatTimestamp(subtitle.end), str(text))
        for subtitle, text in zip(inputBase, inputVariable)
    ]
    # test the suffix only: ".ass" elsewhere in the path (for example in a
    # directory name) must not select the ASS writer
    if inputx.endswith(".ass") and preserveOriginalCodec == 1:
        # ass file support
        with open(final, "w", encoding="utf-8") as f:
            f.write("[Script Info]\n")
            f.write("; script generated and translated by subTranslate.py\n")
            f.write("WrapStyle: 1\n")
            f.write("\n")
            f.write("[Events]\n")
            f.write("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n")
            for start, end, text in cues:
                f.write("Dialogue: 0," + start + "," + end + ",Default,,0000,0000,0000,," + text + "\n")
    else:
        # srt file support or converts anything unknown to srt anyways
        root, suffix = os.path.splitext(final)
        if suffix in arrayOfExtentions:
            # swap only the real suffix; a plain str.replace would also
            # rewrite matching text inside directory names
            final = root + ".srt"
        with open(final, "w", encoding="utf-8") as f:
            f.write("1\n")
            f.write("00:00:00.000 --> 00:00:00.000\n")
            f.write("script generated and translated by subTranslate.py\n")
            f.write("\n")
            # numbering starts at 2 because cue 1 is the credit line above
            for number, (start, end, text) in enumerate(cues, start=2):
                f.write(str(number) + "\n")
                f.write(start + " --> " + end + "\n")
                f.write(text + "\n")
                f.write("\n")
    print("finished writing subtitle into", final)
def split_list(alist, wanted_parts):
    """Split *alist* into *wanted_parts* consecutive slices of near-equal size.

    Args:
        alist: the sequence to split.
        wanted_parts: number of slices. Values below 1 are treated as 1 so
            that the input is never silently dropped.

    Returns:
        A list of slices that together contain every element of *alist* in
        order. Slices may be empty when *wanted_parts* exceeds ``len(alist)``.
    """
    parts = max(1, wanted_parts)
    length = len(alist)
    return [alist[i * length // parts:(i + 1) * length // parts] for i in range(parts)]
# Driver: find candidate subtitle files, skip the ones that are not inputs or
# that already have a translation, and translate the rest one by one.
targetFiles = findFiles(arrayOfExtentions, cwd)  # search for targets
print("job started")
for x in targetFiles:
    print(x)
if not identifyingCharacteristic == "":
    print("only files with", identifyingCharacteristic, "will be processed. you may safely ignore the rest.")
# NOTE: keep this loop at module level with the variable named `target`;
# helper functions in this file may read `target` as a module-level global.
for target in targetFiles:
    # handle each path once, even if it matches more than one extension
    if not any(extension in target for extension in arrayOfExtentions):
        continue
    if identifyingCharacteristic not in target:
        continue
    # an empty marker means "no marker"; "" is contained in every string and
    # would otherwise cause every file to be skipped
    if extraIdentifyingCharacteristic and extraIdentifyingCharacteristic in target:
        continue
    if "._" in target:
        continue
    if os.path.isfile(target.replace(identifyingCharacteristic, extraIdentifyingCharacteristic)):
        continue
    # same naming scheme the writer functions use for their output
    targetConverted = target.replace(identifyingCharacteristic, "." + language + extraIdentifyingCharacteristic)
    if os.path.isfile(targetConverted):
        continue

    TotalTimeStart = time.time()
    print("")
    print("processing", target)
    if preserveOriginalEdits == 1:
        print("[WARNING] preserveOriginalEdits has been enabled. this function will work terribily with frames that has more than 1 lines")
        print("[WARNING] to avoid facing this issue, disable preserveOriginalEdits")
        writer = copyAndEditSub
    else:
        writer = writeNewSubtitle
    try:
        # the file is parsed twice because each consumer iterates its own copy
        writer(target, parseSubtitles(target), translateSubtitles(parseSubtitles(target), language))
    except Exception as error:
        # Exception (not a bare except) lets ctrl + c abort the whole job
        print("[WARNING] failed to convert", target)
        print("[WARNING] reason:", error)
    TotalTimeTaken = time.time() - TotalTimeStart
    print("Time taken for this subtitle (hh:mm:ss):", timeConvert(int(TotalTimeTaken)))
print("job done")