Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,10 +4,12 @@ import torch
|
|
4 |
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, pipeline, logging
|
5 |
import languagecodes
|
6 |
import requests, os
|
|
|
7 |
|
8 |
logging.set_verbosity_error()
|
9 |
favourite_langs = {"German": "de", "Romanian": "ro", "English": "en", "-----": "-----"}
|
10 |
all_langs = languagecodes.iso_languages
|
|
|
11 |
|
12 |
# Language options as list, add favourite languages first
|
13 |
options = list(favourite_langs.keys())
|
@@ -21,7 +23,7 @@ models = ["Helsinki-NLP",
|
|
21 |
"t5-small", "t5-base", "t5-large",
|
22 |
"google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
|
23 |
"Argos", "Google",
|
24 |
-
"HuggingFaceTB/SmolLM3-3B",
|
25 |
"utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
|
26 |
"Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
|
27 |
"openGPT-X/Teuken-7B-instruct-commercial-v0.4", "openGPT-X/Teuken-7B-instruct-v0.6"
|
@@ -36,35 +38,7 @@ def model_to_cuda(model):
|
|
36 |
print("CUDA not available! Using CPU.")
|
37 |
return model
|
38 |
|
39 |
-
def
|
40 |
-
import argostranslate.package
|
41 |
-
print('Downloading model', from_code, to_code)
|
42 |
-
# Download and install Argos Translate package
|
43 |
-
argostranslate.package.update_package_index()
|
44 |
-
available_packages = argostranslate.package.get_available_packages()
|
45 |
-
package_to_install = next(
|
46 |
-
filter(
|
47 |
-
lambda x: x.from_code == from_code and x.to_code == to_code, available_packages
|
48 |
-
)
|
49 |
-
)
|
50 |
-
argostranslate.package.install_from_path(package_to_install.download())
|
51 |
-
|
52 |
-
def argos(sl, tl, input_text):
|
53 |
-
import argostranslate.translate, argostranslate.package
|
54 |
-
# Translate
|
55 |
-
try:
|
56 |
-
download_argos_model(sl, tl)
|
57 |
-
translated_text = argostranslate.translate.translate(input_text, sl, tl)
|
58 |
-
except StopIteration:
|
59 |
-
# packages_info = ', '.join(f"{pkg.get_description()}->{str(pkg.links)} {str(pkg.source_languages)}" for pkg in argostranslate.package.get_available_packages())
|
60 |
-
packages_info = ', '.join(f"{pkg.from_name} ({pkg.from_code}) -> {pkg.to_name} ({pkg.to_code})" for pkg in argostranslate.package.get_available_packages())
|
61 |
-
translated_text = f"No Argos model for {sl} to {tl}. Try other model or languages combination from the available Argos models: {packages_info}."
|
62 |
-
except Exception as error:
|
63 |
-
translated_text = error
|
64 |
-
print(error)
|
65 |
-
return translated_text
|
66 |
-
|
67 |
-
def HelsinkiNLPAutoTokenizer(sl, tl, input_text):
|
68 |
if model_name == "Helsinki-NLP":
|
69 |
message_text = f'Translated from {sl} to {tl} with {model_name}.'
|
70 |
try:
|
@@ -82,31 +56,7 @@ def HelsinkiNLPAutoTokenizer(sl, tl, input_text):
|
|
82 |
return translated_text, message_text
|
83 |
except EnvironmentError as error:
|
84 |
return f"Error finding model: {model_name}! Try other available language combination.", error
|
85 |
-
|
86 |
-
def HelsinkiNLP(sl, tl, input_text):
|
87 |
-
try: # Standard bilingual model
|
88 |
-
model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
|
89 |
-
pipe = pipeline("translation", model=model_name, device=-1)
|
90 |
-
translation = pipe(input_text)
|
91 |
-
return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
|
92 |
-
except EnvironmentError:
|
93 |
-
try: # Tatoeba models
|
94 |
-
model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
|
95 |
-
pipe = pipeline("translation", model=model_name, device=-1)
|
96 |
-
translation = pipe(input_text)
|
97 |
-
return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
|
98 |
-
except EnvironmentError as error:
|
99 |
-
try: # Last resort: multi to multi
|
100 |
-
model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"
|
101 |
-
pipe = pipeline("translation", model=model_name)
|
102 |
-
tl = 'deu' # Hard coded for now for testing
|
103 |
-
translation = pipe(f'>>{tl}<< {input_text}')
|
104 |
-
return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
|
105 |
-
except Exception as error:
|
106 |
-
return f"Error translating with model: {model_name}! Try other available language combination.", error
|
107 |
-
except KeyError as error:
|
108 |
-
return f"Error: Translation direction {sl} to {tl} is not supported by Helsinki Translation Models", error
|
109 |
-
|
110 |
class Translators:
|
111 |
def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
|
112 |
self.model_name = model_name
|
@@ -118,6 +68,60 @@ class Translators:
|
|
118 |
response = requests.get(url)
|
119 |
return response.json()[0][0][0]
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
def smollm(self):
|
122 |
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
123 |
model = AutoModelForCausalLM.from_pretrained(self.model_name)
|
@@ -221,87 +225,121 @@ class Translators:
|
|
221 |
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
|
222 |
translated_text = translator(self.input_text, max_length=512)
|
223 |
return translated_text[0]['translation_text']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
|
225 |
-
def teuken(
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
def eurollm(model_name, sl, tl, input_text):
|
254 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
255 |
-
model = AutoModelForCausalLM.from_pretrained(model_name)
|
256 |
-
prompt = f"{sl}: {input_text} {tl}:"
|
257 |
-
inputs = tokenizer(prompt, return_tensors="pt")
|
258 |
-
outputs = model.generate(**inputs, max_new_tokens=512)
|
259 |
-
output = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
260 |
-
result = output.rsplit(f'{tl}:')[-1].strip()
|
261 |
-
return result
|
262 |
-
|
263 |
-
def eurollm_instruct(model_name, sl, tl, input_text):
|
264 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
265 |
-
model = AutoModelForCausalLM.from_pretrained(model_name)
|
266 |
-
text = f'<|im_start|>system\n<|im_end|>\n<|im_start|>user\nTranslate the following {sl} source text to {tl}:\n{sl}: {input_text} \n{tl}: <|im_end|>\n<|im_start|>assistant\n'
|
267 |
-
inputs = tokenizer(text, return_tensors="pt")
|
268 |
-
outputs = model.generate(**inputs, max_new_tokens=512)
|
269 |
-
output = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
270 |
-
if f'{tl}:' in output:
|
271 |
-
output = output.rsplit(f'{tl}:')[-1].strip().replace('assistant\n', '')
|
272 |
-
return output
|
273 |
|
274 |
-
def unbabel(
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
|
296 |
@spaces.GPU
|
297 |
-
def translate_text(input_text: str,
|
298 |
"""
|
299 |
Translates the input text from the source language to the target language using a specified model.
|
300 |
|
301 |
Parameters:
|
302 |
input_text (str): The source text to be translated
|
303 |
-
|
304 |
-
|
305 |
model_name (str): The selected translation model name
|
306 |
|
307 |
Returns:
|
@@ -314,16 +352,16 @@ def translate_text(input_text: str, sselected_language: str, tselected_language:
|
|
314 |
("Hallo Welt", "Translated from English to German with Helsinki-NLP.")
|
315 |
"""
|
316 |
|
317 |
-
sl = all_langs[
|
318 |
-
tl = all_langs[
|
319 |
-
message_text = f'Translated from {
|
320 |
print(message_text)
|
321 |
try:
|
322 |
if model_name.startswith("Helsinki-NLP"):
|
323 |
-
translated_text, message_text =
|
324 |
|
325 |
elif model_name == 'Argos':
|
326 |
-
translated_text =
|
327 |
|
328 |
elif model_name == 'Google':
|
329 |
translated_text = Translators(model_name, sl, tl, input_text).google()
|
@@ -332,44 +370,47 @@ def translate_text(input_text: str, sselected_language: str, tselected_language:
|
|
332 |
translated_text = Translators(model_name, sl, tl, input_text).mtom()
|
333 |
|
334 |
elif model_name.startswith('t5'):
|
335 |
-
translated_text = Translators(model_name,
|
336 |
|
337 |
elif 'flan' in model_name.lower():
|
338 |
-
translated_text = Translators(model_name,
|
339 |
|
340 |
elif 'mt0' in model_name.lower():
|
341 |
-
translated_text = Translators(model_name,
|
342 |
|
343 |
elif 'bloomz' in model_name.lower():
|
344 |
-
translated_text = Translators(model_name,
|
345 |
|
346 |
elif 'nllb' in model_name.lower():
|
347 |
-
nnlbsl, nnlbtl = languagecodes.nllb_language_codes[
|
348 |
translated_text = Translators(model_name, nnlbsl, nnlbtl, input_text).nllb()
|
349 |
|
350 |
elif model_name == "facebook/mbart-large-50-many-to-many-mmt":
|
351 |
-
translated_text = Translators(model_name,
|
352 |
|
353 |
elif model_name == "facebook/mbart-large-50-one-to-many-mmt":
|
354 |
-
translated_text = Translators(model_name,
|
355 |
|
356 |
elif model_name == "facebook/mbart-large-50-many-to-one-mmt":
|
357 |
-
translated_text = Translators(model_name,
|
358 |
|
359 |
elif 'teuken' in model_name.lower():
|
360 |
-
translated_text =
|
361 |
|
362 |
elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
|
363 |
-
translated_text =
|
364 |
|
365 |
elif model_name == "utter-project/EuroLLM-1.7B":
|
366 |
-
translated_text =
|
367 |
|
368 |
elif 'Unbabel' in model_name:
|
369 |
-
translated_text =
|
370 |
|
371 |
elif model_name == "HuggingFaceTB/SmolLM3-3B":
|
372 |
-
translated_text = Translators(model_name,
|
|
|
|
|
|
|
373 |
|
374 |
except Exception as error:
|
375 |
translated_text = error
|
@@ -389,22 +430,22 @@ def create_interface():
|
|
389 |
input_text = gr.Textbox(label="Enter text to translate:", placeholder="Type your text here, maximum 512 tokens")
|
390 |
|
391 |
with gr.Row():
|
392 |
-
|
393 |
-
|
394 |
swap_button = gr.Button("Swap Languages", size="md")
|
395 |
-
swap_button.click(fn=swap_languages, inputs=[
|
396 |
|
397 |
model_name = gr.Dropdown(choices=models, label=f"Select a model. Default is {models[0]}.", value = models[0], interactive=True)
|
398 |
translate_button = gr.Button("Translate")
|
399 |
|
400 |
translated_text = gr.Textbox(label="Translated text:", placeholder="Display field for translation", interactive=False, show_copy_button=True)
|
401 |
message_text = gr.Textbox(label="Messages:", placeholder="Display field for status and error messages", interactive=False,
|
402 |
-
value=f'Default translation settings: from {
|
403 |
allmodels = gr.HTML(label="Model links:", value=', '.join([f'<a href="https://huggingface.co/{model}">{model}</a>' for model in models]))
|
404 |
|
405 |
translate_button.click(
|
406 |
fn=translate_text,
|
407 |
-
inputs=[input_text,
|
408 |
outputs=[translated_text, message_text]
|
409 |
)
|
410 |
|
|
|
4 |
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, pipeline, logging
|
5 |
import languagecodes
|
6 |
import requests, os
|
7 |
+
import polars as pl
|
8 |
|
9 |
logging.set_verbosity_error()
|
10 |
favourite_langs = {"German": "de", "Romanian": "ro", "English": "en", "-----": "-----"}
|
11 |
all_langs = languagecodes.iso_languages
|
12 |
+
df = pl.read_parquet("isolanguages.parquet")
|
13 |
|
14 |
# Language options as list, add favourite languages first
|
15 |
options = list(favourite_langs.keys())
|
|
|
23 |
"t5-small", "t5-base", "t5-large",
|
24 |
"google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
|
25 |
"Argos", "Google",
|
26 |
+
"HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
|
27 |
"utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
|
28 |
"Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
|
29 |
"openGPT-X/Teuken-7B-instruct-commercial-v0.4", "openGPT-X/Teuken-7B-instruct-v0.6"
|
|
|
38 |
print("CUDA not available! Using CPU.")
|
39 |
return model
|
40 |
|
41 |
+
def HelsinkiNLPAutoTokenizer(sl, tl, input_text): # deprecated
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
if model_name == "Helsinki-NLP":
|
43 |
message_text = f'Translated from {sl} to {tl} with {model_name}.'
|
44 |
try:
|
|
|
56 |
return translated_text, message_text
|
57 |
except EnvironmentError as error:
|
58 |
return f"Error finding model: {model_name}! Try other available language combination.", error
|
59 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
class Translators:
|
61 |
def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
|
62 |
self.model_name = model_name
|
|
|
68 |
response = requests.get(url)
|
69 |
return response.json()[0][0][0]
|
70 |
|
71 |
+
@staticmethod
def download_argos_model(from_code, to_code):
    """Download and install the Argos Translate package for one language pair.

    Declared as a static method because it takes no ``self`` but is defined
    in the class body next to the translator methods.

    Parameters:
        from_code (str): Source language code, matched against each package's ``from_code``.
        to_code (str): Target language code, matched against each package's ``to_code``.

    Raises:
        StopIteration: If no available package matches the requested pair.
            The ``argos`` method catches this to report the unsupported pair.
    """
    import argostranslate.package
    print('Downloading model', from_code, to_code)
    # Refresh the package index before listing what can be installed.
    argostranslate.package.update_package_index()
    available_packages = argostranslate.package.get_available_packages()
    # next() without a default raises StopIteration when nothing matches.
    package_to_install = next(
        package for package in available_packages
        if package.from_code == from_code and package.to_code == to_code
    )
    argostranslate.package.install_from_path(package_to_install.download())
|
83 |
+
|
84 |
+
def argos(self):
    """Translate ``self.input_text`` from ``self.sl`` to ``self.tl`` with Argos Translate.

    Returns:
        The translated text on success; a message listing the available
        Argos language pairs when no package matches the requested pair;
        or the caught exception object for any other failure.
    """
    import argostranslate.package
    import argostranslate.translate
    try:
        # download_argos_model is defined in the class body, so a bare-name call
        # from inside a method raises NameError; look it up on the class instead.
        type(self).download_argos_model(self.sl, self.tl)
        translated_text = argostranslate.translate.translate(self.input_text, self.sl, self.tl)
    except StopIteration:
        # Raised by download_argos_model when no package matches the pair.
        packages_info = ', '.join(f"{pkg.from_name} ({pkg.from_code}) -> {pkg.to_name} ({pkg.to_code})" for pkg in argostranslate.package.get_available_packages())
        translated_text = f"No Argos model for {self.sl} to {self.tl}. Try other model or languages combination from the available Argos models: {packages_info}."
    except Exception as error:
        # Best effort: hand the exception back to the caller instead of raising.
        translated_text = error
        print(error)
    return translated_text
|
98 |
+
|
99 |
+
def HelsinkiNLP(self):
    """Translate ``self.input_text`` with a Helsinki-NLP OPUS checkpoint.

    Checkpoints are tried in this order:
      1. ``Helsinki-NLP/opus-mt-{sl}-{tl}`` (bilingual),
      2. ``Helsinki-NLP/opus-tatoeba-{sl}-{tl}``,
      3. ``Helsinki-NLP/opus-mt-tc-bible-big-mul-mul``, with the input
         prefixed by ``>>xxx<<`` where ``xxx`` is looked up from ``self.tl``
         in the module-level ``df`` table.

    Returns:
        tuple: ``(translated_text, message)`` on success. On failure the
        first element is an error description and the second is the caught
        exception object.
    """
    def run(model_name, text, **pipeline_kwargs):
        # Load the checkpoint that was actually resolved. self.model_name only
        # holds the value the caller selected, not a concrete checkpoint id.
        translator = pipeline("translation", model=model_name, **pipeline_kwargs)
        translation = translator(text)
        return translation[0]['translation_text'], f'Translated from {self.sl} to {self.tl} with {model_name}.'

    try:  # Standard bilingual model
        return run(f"Helsinki-NLP/opus-mt-{self.sl}-{self.tl}", self.input_text, device=-1)
    except EnvironmentError:
        try:  # Tatoeba models
            return run(f"Helsinki-NLP/opus-tatoeba-{self.sl}-{self.tl}", self.input_text, device=-1)
        except EnvironmentError:
            model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"
            try:  # Last resort: multi to multi
                # Resolve the target code before loading the large model so an
                # unknown language fails fast (KeyError, handled just below).
                # NOTE(review): slice(1) skips the first row of the table; presumably a header-like row - confirm.
                non_empty_iso = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
                # Keyed by the second column; the fourth column is the code used in the prefix, e.g. 'deu'.
                iso1_dict = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_iso}
                iso3tl = iso1_dict[self.tl][2]
                return run(model_name, f'>>{iso3tl}<< {self.input_text}')
            except Exception as error:
                return f"Error translating with model: {model_name}! Try other available language combination.", error
    # NOTE(review): indentation was lost in the view this was rebuilt from; this
    # handler is assumed to belong to the outermost try - confirm.
    except KeyError as error:
        return f"Error: Translation direction {self.sl} to {self.tl} is not supported by Helsinki Translation Models", error
|
124 |
+
|
125 |
def smollm(self):
|
126 |
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
127 |
model = AutoModelForCausalLM.from_pretrained(self.model_name)
|
|
|
225 |
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
|
226 |
translated_text = translator(self.input_text, max_length=512)
|
227 |
return translated_text[0]['translation_text']
|
228 |
+
|
229 |
+
def wingpt(self):
    """Translate ``self.input_text`` into ``self.tl`` with a chat-template causal LM.

    The request is sent as a system instruction plus the user text, the
    prompt tokens are cut off the generated sequence, and the decoded reply
    is printed for debugging.

    Returns:
        str: The last line of the decoded reply, stripped of surrounding whitespace.
    """
    llm = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", device_map="auto")
    tok = AutoTokenizer.from_pretrained(self.model_name)
    chat = [
        {"role": "system", "content": f"Translate this to {self.tl} language"},
        {"role": "user", "content": self.input_text},
    ]
    prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tok([prompt], return_tensors="pt").to(llm.device)
    sequences = llm.generate(**encoded, max_new_tokens=512, temperature=0.1)
    # Keep only the tokens that follow each prompt.
    completions = [full[len(given):] for given, full in zip(encoded.input_ids, sequences)]
    decoded = tok.batch_decode(completions, skip_special_tokens=True)
    print(decoded)
    # With no newline present, split('\n') yields the whole reply as its only piece.
    return decoded[0].split('\n')[-1].strip()
|
262 |
+
|
263 |
+
def eurollm(self):
    """Translate ``self.input_text`` with a base (non-instruct) EuroLLM model.

    The model is prompted with ``"{sl}: {text} {tl}:"`` and asked to continue.

    Returns:
        str: The decoded text after the last ``"{tl}:"`` marker, stripped.
    """
    tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    model = AutoModelForCausalLM.from_pretrained(self.model_name)
    marker = f'{self.tl}:'
    prompt = f"{self.sl}: {self.input_text} {marker}"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=512)
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Always cut at the target-language marker. The prompt itself has no
    # newline, so making the cut depend on '\n' being in the output returned
    # the prompt together with the translation for single-line completions.
    return output.rsplit(marker)[-1].strip()
|
272 |
+
|
273 |
+
def eurollm_instruct(self):
    """Translate ``self.input_text`` with the instruct EuroLLM model.

    The chat prompt is written out by hand with ``<|im_start|>`` / ``<|im_end|>``
    tags, then the decoded output is trimmed.

    Returns:
        str: The text after the last ``"{tl}:"`` marker with the ``assistant``
        line removed, or the full decoded output if the marker is absent.
    """
    tok = AutoTokenizer.from_pretrained(self.model_name)
    llm = AutoModelForCausalLM.from_pretrained(self.model_name)
    chat_prompt = f'<|im_start|>system\n<|im_end|>\n<|im_start|>user\nTranslate the following {self.sl} source text to {self.tl}:\n{self.sl}: {self.input_text} \n{self.tl}: <|im_end|>\n<|im_start|>assistant\n'
    encoded = tok(chat_prompt, return_tensors="pt")
    generated = llm.generate(**encoded, max_new_tokens=512)
    decoded = tok.decode(generated[0], skip_special_tokens=True)
    marker = f'{self.tl}:'
    if marker not in decoded:
        return decoded
    # Keep what follows the last marker, then drop the leftover role line.
    tail = decoded.rsplit(marker)[-1].strip()
    return tail.replace('assistant\n', '').strip()
|
283 |
|
284 |
+
def teuken(self):
    """Translate ``self.input_text`` from ``self.sl`` into ``self.tl`` with a Teuken model.

    The model is loaded in bfloat16 with ``trust_remote_code=True``, moved to
    CUDA when available (CPU otherwise) and run with sampling enabled.

    Returns:
        str: The first generated sequence, decoded without skipping special tokens.
    """
    if torch.cuda.is_available():
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")
    llm = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True, torch_dtype=torch.bfloat16)
    llm = llm.to(target).eval()
    tok = AutoTokenizer.from_pretrained(self.model_name, use_fast=False, trust_remote_code=True)
    request = f"Translate the following text from {self.sl} into {self.tl}: {self.input_text}"
    chat = [{"role": "User", "content": request}]
    prompt_ids = tok.apply_chat_template(chat, chat_template="EN", tokenize=True, add_generation_prompt=False, return_tensors="pt")
    # Sampling settings are collected in one place and passed through unchanged.
    sampling = {
        "max_length": 512,
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
        "temperature": 0.7,
        "num_return_sequences": 1,
    }
    generated = llm.generate(prompt_ids.to(llm.device), **sampling)
    return tok.decode(generated[0].tolist())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
|
312 |
+
def unbabel(self):
    """Translate ``self.input_text`` from ``self.sl`` into ``self.tl`` with an Unbabel Tower model.

    The generation budget is the token count of the input plus 25 %. The raw
    pipeline output is then cleaned: everything up to a known chat marker is
    dropped, a leading ``Answer:`` and anything from ``Translated text:``
    onwards are removed, and the result is cut to as many lines as the input has.

    Returns:
        str: The cleaned translation.
    """
    pipe = pipeline("text-generation", model=self.model_name, torch_dtype=torch.bfloat16, device_map="auto")
    messages = [{"role": "user",
                 "content": f"Translate the following text from {self.sl} into {self.tl}.\n{self.sl}: {self.input_text}.\n{self.tl}:"}]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    tokenized_input = pipe.tokenizer(self.input_text, return_tensors="pt")
    num_input_tokens = len(tokenized_input["input_ids"][0])
    max_new_tokens = round(num_input_tokens + 0.25 * num_input_tokens)
    outputs = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
    translated_text = outputs[0]["generated_text"]
    # The text lives on the instance; the bare name input_text is not defined
    # inside this method. Guard the debug ratio against a zero token count.
    chars_per_token = round(len(self.input_text) / num_input_tokens, 2) if num_input_tokens else 0
    print(f"Input chars: {len(self.input_text)}", f"Input tokens: {num_input_tokens}", f"max_new_tokens: {max_new_tokens}",
          "Chars to tokens ratio:", chars_per_token, f"Raw translation: {translated_text}")
    markers = ["<end_of_turn>", "<|im_end|>", "<|im_start|>assistant"]
    for marker in markers:
        if marker in translated_text:
            # Keep the segment right after the first occurrence of the marker.
            translated_text = translated_text.split(marker)[1].strip()
    if translated_text.startswith('Answer:'):
        translated_text = translated_text.replace('Answer:', '', 1).strip()
    if "Translated text:" in translated_text:
        translated_text = translated_text.split("Translated text:")[0].strip()
    # Keep no more lines than the input had.
    max_lines = self.input_text.count('\n') + 1
    translated_text = '\n'.join(translated_text.split('\n')[:max_lines])
    return translated_text
|
333 |
|
334 |
@spaces.GPU
|
335 |
+
def translate_text(input_text: str, s_language: str, t_language: str, model_name: str) -> tuple[str, str]:
|
336 |
"""
|
337 |
Translates the input text from the source language to the target language using a specified model.
|
338 |
|
339 |
Parameters:
|
340 |
input_text (str): The source text to be translated
|
341 |
+
s_language (str): The source language of the input text
|
342 |
+
t_language (str): The target language in which the input text is translated
|
343 |
model_name (str): The selected translation model name
|
344 |
|
345 |
Returns:
|
|
|
352 |
("Hallo Welt", "Translated from English to German with Helsinki-NLP.")
|
353 |
"""
|
354 |
|
355 |
+
sl = all_langs[s_language]
|
356 |
+
tl = all_langs[t_language]
|
357 |
+
message_text = f'Translated from {s_language} to {t_language} with {model_name}'
|
358 |
print(message_text)
|
359 |
try:
|
360 |
if model_name.startswith("Helsinki-NLP"):
|
361 |
+
translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP()
|
362 |
|
363 |
elif model_name == 'Argos':
|
364 |
+
translated_text = Translators(model_name, sl, tl, input_text).argos()
|
365 |
|
366 |
elif model_name == 'Google':
|
367 |
translated_text = Translators(model_name, sl, tl, input_text).google()
|
|
|
370 |
translated_text = Translators(model_name, sl, tl, input_text).mtom()
|
371 |
|
372 |
elif model_name.startswith('t5'):
|
373 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).tfive()
|
374 |
|
375 |
elif 'flan' in model_name.lower():
|
376 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).flan()
|
377 |
|
378 |
elif 'mt0' in model_name.lower():
|
379 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).bigscience()
|
380 |
|
381 |
elif 'bloomz' in model_name.lower():
|
382 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).bloomz()
|
383 |
|
384 |
elif 'nllb' in model_name.lower():
|
385 |
+
nnlbsl, nnlbtl = languagecodes.nllb_language_codes[s_language], languagecodes.nllb_language_codes[t_language]
|
386 |
translated_text = Translators(model_name, nnlbsl, nnlbtl, input_text).nllb()
|
387 |
|
388 |
elif model_name == "facebook/mbart-large-50-many-to-many-mmt":
|
389 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_many()
|
390 |
|
391 |
elif model_name == "facebook/mbart-large-50-one-to-many-mmt":
|
392 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).mbart_one_to_many()
|
393 |
|
394 |
elif model_name == "facebook/mbart-large-50-many-to-one-mmt":
|
395 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_one()
|
396 |
|
397 |
elif 'teuken' in model_name.lower():
|
398 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).teuken()
|
399 |
|
400 |
elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
|
401 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).eurollm_instruct()
|
402 |
|
403 |
elif model_name == "utter-project/EuroLLM-1.7B":
|
404 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).eurollm()
|
405 |
|
406 |
elif 'Unbabel' in model_name:
|
407 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).unbabel()
|
408 |
|
409 |
elif model_name == "HuggingFaceTB/SmolLM3-3B":
|
410 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).smollm()
|
411 |
+
|
412 |
+
elif model_name == "winninghealth/WiNGPT-Babel-2":
|
413 |
+
translated_text = Translators(model_name, s_language, t_language, input_text).wingpt()
|
414 |
|
415 |
except Exception as error:
|
416 |
translated_text = error
|
|
|
430 |
input_text = gr.Textbox(label="Enter text to translate:", placeholder="Type your text here, maximum 512 tokens")
|
431 |
|
432 |
with gr.Row():
|
433 |
+
s_language = gr.Dropdown(choices=options, value = options[0], label="Source language", interactive=True)
|
434 |
+
t_language = gr.Dropdown(choices=options, value = options[1], label="Target language", interactive=True)
|
435 |
swap_button = gr.Button("Swap Languages", size="md")
|
436 |
+
swap_button.click(fn=swap_languages, inputs=[s_language, t_language], outputs=[s_language, t_language], api_name=False, show_api=False)
|
437 |
|
438 |
model_name = gr.Dropdown(choices=models, label=f"Select a model. Default is {models[0]}.", value = models[0], interactive=True)
|
439 |
translate_button = gr.Button("Translate")
|
440 |
|
441 |
translated_text = gr.Textbox(label="Translated text:", placeholder="Display field for translation", interactive=False, show_copy_button=True)
|
442 |
message_text = gr.Textbox(label="Messages:", placeholder="Display field for status and error messages", interactive=False,
|
443 |
+
value=f'Default translation settings: from {s_language.value} to {t_language.value} with {model_name.value}.')
|
444 |
allmodels = gr.HTML(label="Model links:", value=', '.join([f'<a href="https://huggingface.co/{model}">{model}</a>' for model in models]))
|
445 |
|
446 |
translate_button.click(
|
447 |
fn=translate_text,
|
448 |
+
inputs=[input_text, s_language, t_language, model_name],
|
449 |
outputs=[translated_text, message_text]
|
450 |
)
|
451 |
|