Skip to content

Commit

Permalink
Merge branch 'master' into exp2
Browse files Browse the repository at this point in the history
  • Loading branch information
jordimas committed Aug 21, 2024
2 parents 125822b + f840f7d commit 717f958
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 10 deletions.
1 change: 1 addition & 0 deletions data-processing-tools/join-single-file.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def _remove_special_newlines(result):
mapping = {
u"\u2028" : '',
u"\u2029" : '',
u"\u0085" : '',
}

for char in mapping.keys():
Expand Down
15 changes: 10 additions & 5 deletions data-processing-tools/tests/testjoin-single-file.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,12 @@ def test_has_dot_or_equivalent_false(self):
self.assertFalse(join_single_file._has_dot_or_equivalent("num1"))
self.assertFalse(join_single_file._has_dot_or_equivalent("Hola"))

def test__is_sentence_len_good_len_zero(self):
def test_is_sentence_len_good_len_zero(self):
self.assertFalse(join_single_file._is_sentence_len_good("", ""))
self.assertFalse(join_single_file._is_sentence_len_good("A", ""))
self.assertFalse(join_single_file._is_sentence_len_good("", "B"))

def test__is_sentence_len_good_diff(self):
def test_is_sentence_len_good_diff(self):
src = "Mai"
trg = "localized lexeme inflections - short month form||Jun"

Expand All @@ -104,7 +104,7 @@ def test__is_sentence_len_good_diff(self):
trg = "Tots els contactes"
self.assertFalse(join_single_file._is_sentence_len_good(src, trg))

def test__is_sentence_len_good_true(self):
def test_is_sentence_len_good_true(self):
src = "May"
trg = "Maig"
self.assertTrue(join_single_file._is_sentence_len_good(src, trg))
Expand All @@ -113,15 +113,20 @@ def test__is_sentence_len_good_true(self):
trg = "Tots els contactes"
self.assertTrue(join_single_file._is_sentence_len_good(src, trg))

def test__get_val_test_split_lines(self):
def test_get_val_test_split_lines(self):
steps_val, steps_test = join_single_file._get_val_test_split_steps(lines = 1000000, per_mille_val = 1, per_mille_test = 2)
self.assertEquals(1000, steps_val)
self.assertEquals(500, steps_test)

def test___clean_for_dup_detection(self):
def test_clean_for_dup_detection(self):
self.assertEquals("Word1Word2", join_single_file._clean_for_dup_detection("Word1 Word2\n"))
self.assertEquals("Word1Word2", join_single_file._clean_for_dup_detection("Word1\tWord2\r"))
self.assertEquals("Word1Word2.", join_single_file._clean_for_dup_detection("Word1 Word2.\r"))

def test_remove_special_newlines(self):
self.assertEquals(("Holamón", True), join_single_file._remove_special_newlines("Hola\u2028món"))
self.assertEquals(("Holamón", True), join_single_file._remove_special_newlines("Hola\u2029món"))
self.assertEquals(("Holamón", True), join_single_file._remove_special_newlines("Hola\u0085món"))

if __name__ == '__main__':
unittest.main()
132 changes: 132 additions & 0 deletions evaluate/sc-translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2024 Jordi Mas i Hernandez <[email protected]>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

import os
import datetime
import json

def file_len(fname):
    """Return the number of lines in *fname*, or 0 if the file does not exist.

    Fixes an off-by-one in the previous implementation: `i = 0` followed by
    `return i + 1` reported 1 line for an existing but *empty* file.
    """
    if not os.path.isfile(fname):
        return 0

    with open(fname) as f:
        # Count lazily; no need to hold the lines in memory.
        return sum(1 for _ in f)

def get_sacrebleu(reference_file, hypotesis_file):
    """Score *hypotesis_file* against *reference_file* with the sacrebleu CLI.

    Shells out to ``sacrebleu``, which writes a JSON report redirected to a
    scratch file, and returns the BLEU score formatted to one decimal place
    as a string (e.g. ``"42.3"``).
    """
    JSON_FILE = 'bleu.json'

    cmd = f'sacrebleu {reference_file} -i {hypotesis_file} -m bleu > {JSON_FILE}'
    # Log the command *before* running it so a hang or failure is attributable
    # (the original printed it only after execution).
    print(cmd)
    os.system(cmd)

    with open(JSON_FILE) as f:
        data = json.load(f)

    return f"{data['score']:0.1f}"

def save_json(scores):
    """Persist the accumulated BLEU *scores* mapping as pretty-printed JSON."""
    output_name = "sc-translate-bleu.json"
    with open(output_name, "w") as handle:
        json.dump(scores, handle, indent=4)

import ctranslate2
import pyonmttok
from huggingface_hub import snapshot_download


def _translate(tokenizer, translator, src):
tokenized=tokenizer.tokenize(src)

translated = translator.translate_batch([tokenized[0]])
translated = tokenizer.detokenize(translated[0][0]['tokens'])
return translated

def _translate_file(tokenizer, translator, input_file, hypotesis_file):
    """Translate every line of *input_file*, writing one line per sentence
    into *hypotesis_file*; prints a progress count every 100 sentences."""
    cnt = 0
    with open(input_file, "r") as source, open(hypotesis_file, "w") as target:
        for src in source:
            target.write(_translate(tokenizer, translator, src) + "\n")
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)


def main():
    """Translate the flores200 dataset with each Softcatalà HuggingFace model
    and record SacreBLEU scores, saved incrementally to sc-translate-bleu.json.
    """
    print("Translates flores200 datasets using HuggingFace Softcatalà models")

    pair_languages = [
        "cat-eng",
        "eng-cat",
        "cat-deu",
        "deu-cat",
        "cat-fra",
        "fra-cat",
        "cat-spa",
        "spa-cat",
        "cat-eus",
        "eus-cat",
        "cat-nld",
        "nld-cat",
        "ita-cat",
        "cat-ita",
        "glg-cat",
        "cat-glg",
        "jpn-cat",
        "cat-jpn",
        "oci-cat",
        "cat-oci",
    ]

    # Sentences in the flores200 split; a cached hypothesis file with a
    # different line count is considered incomplete and is regenerated.
    LINES_IN_DATA_SET = 1012
    bleu_scores = {}

    for pair_language in pair_languages:
        source_language, target_language = pair_language.split("-")
        model_name = f"translate-{pair_language}"

        hypotesis_file = f"sc-translate/sc-flores200-{model_name}.{target_language}"
        print(hypotesis_file)
        input_file = f"flores200.{source_language}"

        model_dir = snapshot_download(repo_id=f"softcatala/{model_name}", revision="main")
        tokenizer = pyonmttok.Tokenizer(mode="none", sp_model_path=model_dir + "/sp_m.model")
        translator = ctranslate2.Translator(model_dir)

        start_time = datetime.datetime.now()
        # Skip translation when a complete hypothesis file is already cached.
        if file_len(hypotesis_file) != LINES_IN_DATA_SET:
            _translate_file(tokenizer, translator, input_file, hypotesis_file)

        reference_file = f"flores200.{target_language}"
        sacrebleu = get_sacrebleu(reference_file, hypotesis_file)
        bleu_scores[pair_language] = sacrebleu
        print(f"'{source_language}-{target_language}', BLEU: '{sacrebleu}'")
        s = 'Time used: {0}'.format(datetime.datetime.now() - start_time)
        print(s)
        # Save after every pair so partial results survive an interruption.
        save_json(bleu_scores)

if __name__ == "__main__":
main()
40 changes: 35 additions & 5 deletions models-hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,53 @@
import os
from remotemodels import RemoteModels


def main():
models = RemoteModels().get_list_of_models()
for model_url in models:
for language_pair in ["eng-cat", "cat-eng" "deu-cat", "cat-deu"]:
for language_pair in [
"eng-cat",
"cat-eng",
"deu-cat",
"cat-deu",
"fra-cat",
"cat-fra",
"spa-cat",
"cat-spa",
"eus-cat",
"cat-eus",
"nld-cat",
"cat-nld",
"ita-cat",
"cat-ita",
"glg-cat",
"cat-glg",
"jpn-cat",
"cat-jpn",
"oci-cat",
"cat-oci",
]:

if language_pair in model_url:
print(f"model: {model_url}")
PATH = (f"translate-{language_pair}")
PATH = f"translate-{language_pair}"
GIT_URL = f"https://huggingface.co/softcatala/{PATH}"
cmd = f'git clone {GIT_URL}'
if os.path.exists(PATH):
cmd = f"cd {PATH} && git pull && cd .."
else:
cmd = f"git clone {GIT_URL}"
os.system(cmd)

ZIP_FILE = os.path.join(PATH, "model.zip")
RemoteModels().download_file(model_url, ZIP_FILE)

for subdir in ["ctranslate2", "tokenizer"]:
cmd = f'unzip -d {PATH} -o -j {ZIP_FILE} "{language_pair}/{subdir}/*"'
for subdir in ["ctranslate2", "tokenizer", "metadata"]:
files = "model_description.txt" if subdir == "metadata" else "*"
cmd = f'unzip -d {PATH} -o -j {ZIP_FILE} "{language_pair}/{subdir}/{files}"'
os.system(cmd)

os.remove(ZIP_FILE)


if __name__ == "__main__":
main()

0 comments on commit 717f958

Please sign in to comment.