# 3-translate.py
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("lang_code", help="lang code used in the file names")
    parser.add_argument("opus_lang_code", help="lang code used by the MT system")
    args = parser.parse_args()
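    # Illustrative invocation (the placeholders are not verified values; the actual codes
    # depend on the corpus being processed and on the chosen OPUS-MT model):
    #   python 3-translate.py <lang_code> <opus_lang_code>
    # e.g., with the commented-out default below, something like: python 3-translate.py CZ cs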
# Define the language code, used in the file names
#lang_code = "CZ"
lang_code = args.lang_code
opus_lang_code = args.opus_lang_code
# Main path
main_path = "/home/tajak/Parlamint-translation"
# Define other paths
extracted_dataframe_path = "{}/results/{}/ParlaMint-{}-extracted-source-data.csv".format(main_path, lang_code, lang_code)
translated_dataframe_path = "{}/results/{}/ParlaMint-{}-translated.csv".format(main_path, lang_code, lang_code)
# -------------NO CHANGING OF THE CODE NEEDED FROM NOW ONWARDS------------------
from knockknock import discord_sender

# Get notified once the code ends
with open("/home/tajak/Parlamint-translation/discord_key.txt", "r") as f:
    webhook_url = f.read().strip()
@discord_sender(webhook_url=webhook_url)
def translate(lang_code, opus_lang_code, extracted_dataframe_path, translated_dataframe_path):
    """
    Translate the text from the dataframe created with the extract_text() function,
    using OPUS-MT models via EasyNMT. Returns a dataframe with the translations.

    Args:
    - lang_code: the lang code used in the file names
    - opus_lang_code: the lang code to be used in the OPUS-MT model - use the one that
      performed the best in the comparison (see function choose_model())
    - extracted_dataframe_path: path to the dataframe created in the extraction step
    - translated_dataframe_path: path under which the translated dataframe is saved
    """
    import pandas as pd
    import numpy as np
    import regex as re
    from easynmt import EasyNMT
    from IPython.display import display
    import time

    # Open the file, created in the previous step
    df = pd.read_csv(extracted_dataframe_path, sep="\t", index_col=0, na_filter=False)
    # Define the model
    model = EasyNMT('opus-mt')

    print("Entire corpus has {} sentences and {} words.".format(df["text"].count(), df["length"].sum()))

    # Create a list of sentences from the df
    sentence_list = df.text.to_list()
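    # Optional sanity check (illustrative, not part of the original pipeline): translate a single
    # sentence first to confirm that opus_lang_code matches an available OPUS-MT model before
    # committing to the full, potentially hours-long batch run, e.g.:
    # print(model.translate(sentence_list[0], source_lang=opus_lang_code, target_lang='en'))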
print("Translation started.")
start_time = time.time()
#Translate the list of sentences - you need to provide the source language as it is in the name of the model - the opus_lang_code
#for opus_lang_code in lang_models_dict[lang_code]:
translation_list = model.translate(sentence_list, source_lang = "{}".format(opus_lang_code), target_lang='en')
translation_time = round((time.time() - start_time)/60,2)
print("Translation completed. It took {} minutes for {} instances - {} minutes per one sentence.".format(translation_time, len(sentence_list), translation_time/len(sentence_list)))
    # Add the translations to the df
    df["translation"] = translation_list

    # Shorten long MT repetition errors: if the English translation has more than 4x as many
    # words as the original sentence, truncate it to 4x the original length (to the nearest word).
    # This is only applied when the translation is longer than 6 words, to avoid cases where a
    # couple of source words are correctly translated with a longer English sequence.

    # Add translation length
    df["tran_length"] = df["translation"].str.split().str.len()

    # Flag rows where the translation is longer than 6 words and more than 4 times longer than the original
    df["shorten"] = np.where((df["tran_length"] > 6) & (df["tran_length"] > 4*df["length"]), df["translation"], "no")
    # Shorten the flagged translations and substitute them for the original ones
    shorten_translation = []

    for i in list(zip(df["translation"], df["shorten"], df["length"])):
        if i[1] == "no":
            shorten_translation.append(i[0])
        else:
            # Shorten too long translations to four times the length of the original sentence
            new_length = 4*i[2]
            new_translation_list = i[0].split()[:new_length]
            new_translation = " ".join(new_translation_list)
            shorten_translation.append(new_translation)

    # Substitute previous translations with the new list
    df["translation"] = shorten_translation

    # Drop the "shorten" and "tran_length" columns
    df = df.drop(columns=["shorten", "tran_length"])
    # Display the first rows of the df
    print(df[:3].to_markdown())
    print("\n\n\n")

    # Save the df
    df.to_csv(translated_dataframe_path, sep="\t")
    print("The file is saved as {}".format(translated_dataframe_path))

    return df

df = translate(lang_code, opus_lang_code, extracted_dataframe_path, translated_dataframe_path)