TiberiuCristianLeon committed
Commit f434999 · verified · 1 Parent(s): 4831ca2

Update app.py

Files changed (1): app.py +191 -150
app.py CHANGED
@@ -4,10 +4,12 @@ import torch
 from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, pipeline, logging
 import languagecodes
 import requests, os
+import polars as pl
 
 logging.set_verbosity_error()
 favourite_langs = {"German": "de", "Romanian": "ro", "English": "en", "-----": "-----"}
 all_langs = languagecodes.iso_languages
+df = pl.read_parquet("isolanguages.parquet")
 
 # Language options as list, add favourite languages first
 options = list(favourite_langs.keys())
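The new polars import backs an ISO-code lookup used by the Helsinki-NLP fallback later in this diff. A minimal sketch of that lookup, with a hypothetical in-memory stand-in for isolanguages.parquet (the column names are inferred from the code further down):

```python
import polars as pl

# Hypothetical rows standing in for isolanguages.parquet; the app skips the
# first row and drops entries without an ISO 639-1 code, as below.
df = pl.DataFrame({
    "Language": ["Header", "German", "Romanian"],
    "ISO639-1": ["", "de", "ro"],
    "ISO639-2": ["", "ger", "rum"],
    "ISO639-3": ["", "deu", "ron"],
})
non_empty_iso = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
iso1_dict = {row[1]: (row[0], row[2], row[3]) for row in non_empty_iso}
print(iso1_dict["de"])  # ('German', 'ger', 'deu')
```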
@@ -21,7 +23,7 @@ models = ["Helsinki-NLP",
           "t5-small", "t5-base", "t5-large",
           "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
           "Argos", "Google",
-          "HuggingFaceTB/SmolLM3-3B",
+          "HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
           "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
           "Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
           "openGPT-X/Teuken-7B-instruct-commercial-v0.4", "openGPT-X/Teuken-7B-instruct-v0.6"
@@ -36,35 +38,7 @@ def model_to_cuda(model):
         print("CUDA not available! Using CPU.")
     return model
 
-def download_argos_model(from_code, to_code):
-    import argostranslate.package
-    print('Downloading model', from_code, to_code)
-    # Download and install Argos Translate package
-    argostranslate.package.update_package_index()
-    available_packages = argostranslate.package.get_available_packages()
-    package_to_install = next(
-        filter(
-            lambda x: x.from_code == from_code and x.to_code == to_code, available_packages
-        )
-    )
-    argostranslate.package.install_from_path(package_to_install.download())
-
-def argos(sl, tl, input_text):
-    import argostranslate.translate, argostranslate.package
-    # Translate
-    try:
-        download_argos_model(sl, tl)
-        translated_text = argostranslate.translate.translate(input_text, sl, tl)
-    except StopIteration:
-        # packages_info = ', '.join(f"{pkg.get_description()}->{str(pkg.links)} {str(pkg.source_languages)}" for pkg in argostranslate.package.get_available_packages())
-        packages_info = ', '.join(f"{pkg.from_name} ({pkg.from_code}) -> {pkg.to_name} ({pkg.to_code})" for pkg in argostranslate.package.get_available_packages())
-        translated_text = f"No Argos model for {sl} to {tl}. Try other model or languages combination from the available Argos models: {packages_info}."
-    except Exception as error:
-        translated_text = error
-        print(error)
-    return translated_text
-
-def HelsinkiNLPAutoTokenizer(sl, tl, input_text):
+def HelsinkiNLPAutoTokenizer(sl, tl, input_text):  # deprecated
     if model_name == "Helsinki-NLP":
         message_text = f'Translated from {sl} to {tl} with {model_name}.'
         try:
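Context for the argos() error handling that moves in this commit: next() over an empty filter raises StopIteration, which is why a missing language pair is caught and turned into the "No Argos model" message. A toy sketch with stand-in packages:

```python
# Stand-ins for argostranslate package objects; only the codes matter here.
available_packages = [("en", "de"), ("en", "ro")]

def find_package(from_code: str, to_code: str):
    # Same next(filter(...)) shape as download_argos_model() above.
    return next(filter(lambda p: p == (from_code, to_code), available_packages))

find_package("en", "de")      # -> ('en', 'de')
try:
    find_package("de", "zz")  # no matching package
except StopIteration:
    print("caught: argos() turns this into the 'No Argos model' message")
```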
@@ -82,31 +56,7 @@ def HelsinkiNLPAutoTokenizer(sl, tl, input_text):
             return translated_text, message_text
         except EnvironmentError as error:
             return f"Error finding model: {model_name}! Try other available language combination.", error
-
-def HelsinkiNLP(sl, tl, input_text):
-    try: # Standard bilingual model
-        model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
-        pipe = pipeline("translation", model=model_name, device=-1)
-        translation = pipe(input_text)
-        return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
-    except EnvironmentError:
-        try: # Tatoeba models
-            model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
-            pipe = pipeline("translation", model=model_name, device=-1)
-            translation = pipe(input_text)
-            return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
-        except EnvironmentError as error:
-            try: # Last resort: multi to multi
-                model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"
-                pipe = pipeline("translation", model=model_name)
-                tl = 'deu' # Hard coded for now for testing
-                translation = pipe(f'>>{tl}<< {input_text}')
-                return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
-            except Exception as error:
-                return f"Error translating with model: {model_name}! Try other available language combination.", error
-    except KeyError as error:
-        return f"Error: Translation direction {sl} to {tl} is not supported by Helsinki Translation Models", error
-
+
 class Translators:
     def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
         self.model_name = model_name
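The deleted HelsinkiNLP() function (re-added below as a Translators method) resolves a checkpoint by walking a three-step fallback keyed on EnvironmentError: bilingual opus-mt, then opus-tatoeba, then one big multilingual model. The shape of that chain, with a stand-in loader (the KNOWN set is hypothetical):

```python
# Pretend only these two checkpoints exist; loading anything else fails the
# same way a missing Hub repo does.
KNOWN = {"Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"}

def load(name: str) -> str:
    if name not in KNOWN:
        raise EnvironmentError(name)
    return name

model = None
for candidate in ("Helsinki-NLP/opus-mt-en-zz",                 # bilingual, missing
                  "Helsinki-NLP/opus-tatoeba-en-zz",            # Tatoeba, missing
                  "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"): # last resort
    try:
        model = load(candidate)
        break
    except EnvironmentError:
        continue
print(model)  # the multilingual model catches what the bilingual ones miss
```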
@@ -118,6 +68,60 @@ class Translators:
         response = requests.get(url)
         return response.json()[0][0][0]
 
+    @staticmethod
+    def download_argos_model(from_code, to_code):
+        import argostranslate.package
+        print('Downloading model', from_code, to_code)
+        # Download and install Argos Translate package
+        argostranslate.package.update_package_index()
+        available_packages = argostranslate.package.get_available_packages()
+        package_to_install = next(
+            filter(
+                lambda x: x.from_code == from_code and x.to_code == to_code, available_packages
+            )
+        )
+        argostranslate.package.install_from_path(package_to_install.download())
+
+    def argos(self):
+        import argostranslate.translate, argostranslate.package
+        # Translate
+        try:
+            self.download_argos_model(self.sl, self.tl)
+            translated_text = argostranslate.translate.translate(self.input_text, self.sl, self.tl)
+        except StopIteration:
+            # packages_info = ', '.join(f"{pkg.get_description()}->{str(pkg.links)} {str(pkg.source_languages)}" for pkg in argostranslate.package.get_available_packages())
+            packages_info = ', '.join(f"{pkg.from_name} ({pkg.from_code}) -> {pkg.to_name} ({pkg.to_code})" for pkg in argostranslate.package.get_available_packages())
+            translated_text = f"No Argos model for {self.sl} to {self.tl}. Try other model or languages combination from the available Argos models: {packages_info}."
+        except Exception as error:
+            translated_text = error
+            print(error)
+        return translated_text
+
+    def HelsinkiNLP(self):
+        try: # Standard bilingual model
+            model_name = f"Helsinki-NLP/opus-mt-{self.sl}-{self.tl}"
+            pipe = pipeline("translation", model=model_name, device=-1)
+            translation = pipe(self.input_text)
+            return translation[0]['translation_text'], f'Translated from {self.sl} to {self.tl} with {model_name}.'
+        except EnvironmentError:
+            try: # Tatoeba models
+                model_name = f"Helsinki-NLP/opus-tatoeba-{self.sl}-{self.tl}"
+                pipe = pipeline("translation", model=model_name, device=-1)
+                translation = pipe(self.input_text)
+                return translation[0]['translation_text'], f'Translated from {self.sl} to {self.tl} with {model_name}.'
+            except EnvironmentError as error:
+                try: # Last resort: multi to multi
+                    model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"
+                    pipe = pipeline("translation", model=model_name)
+                    non_empty_iso = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
+                    iso1_dict = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_iso}
+                    iso3tl = iso1_dict.get(self.tl)[2]  # e.g. 'deu'
+                    translation = pipe(f'>>{iso3tl}<< {self.input_text}')
+                    return translation[0]['translation_text'], f'Translated from {self.sl} to {self.tl} with {model_name}.'
+                except Exception as error:
+                    return f"Error translating with model: {model_name}! Try other available language combination.", error
+        except KeyError as error:
+            return f"Error: Translation direction {self.sl} to {self.tl} is not supported by Helsinki Translation Models", error
+
     def smollm(self):
         tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         model = AutoModelForCausalLM.from_pretrained(self.model_name)
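The multilingual last resort selects the target language with an inline >>code<< token rather than a tokenizer argument; the method above assembles the prompt like this (values hard-coded for illustration):

```python
# ISO 639-3 code resolved from iso1_dict in the method above; 'deu' = German.
iso3tl = "deu"
input_text = "Hello world"
prompt = f">>{iso3tl}<< {input_text}"
print(prompt)  # >>deu<< Hello world -> fed to the mul-mul translation pipeline
```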
@@ -221,87 +225,121 @@ class Translators:
         translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
         translated_text = translator(self.input_text, max_length=512)
         return translated_text[0]['translation_text']
 
-def teuken(model_name, sl, tl, input_text):
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        trust_remote_code=True,
-        torch_dtype=torch.bfloat16,
-    )
-    model = model.to(device).eval()
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        use_fast=False,
-        trust_remote_code=True,
-    )
-    translation_prompt = f"Translate the following text from {sl} into {tl}: {input_text}"
-    messages = [{"role": "User", "content": translation_prompt}]
-    prompt_ids = tokenizer.apply_chat_template(messages, chat_template="EN", tokenize=True, add_generation_prompt=False, return_tensors="pt")
-    prediction = model.generate(
-        prompt_ids.to(model.device),
-        max_length=512,
-        do_sample=True,
-        top_k=50,
-        top_p=0.95,
-        temperature=0.7,
-        num_return_sequences=1,
-    )
-    translation = tokenizer.decode(prediction[0].tolist())
-    return translation
-
-def eurollm(model_name, sl, tl, input_text):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    prompt = f"{sl}: {input_text} {tl}:"
-    inputs = tokenizer(prompt, return_tensors="pt")
-    outputs = model.generate(**inputs, max_new_tokens=512)
-    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    result = output.rsplit(f'{tl}:')[-1].strip()
-    return result
-
-def eurollm_instruct(model_name, sl, tl, input_text):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    text = f'<|im_start|>system\n<|im_end|>\n<|im_start|>user\nTranslate the following {sl} source text to {tl}:\n{sl}: {input_text} \n{tl}: <|im_end|>\n<|im_start|>assistant\n'
-    inputs = tokenizer(text, return_tensors="pt")
-    outputs = model.generate(**inputs, max_new_tokens=512)
-    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    if f'{tl}:' in output:
-        output = output.rsplit(f'{tl}:')[-1].strip().replace('assistant\n', '')
-    return output
-
-def unbabel(model_name, sl, tl, input_text):
-    pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")
-    messages = [{"role": "user",
-                 "content": f"Translate the following text from {sl} into {tl}.\n{sl}: {input_text}.\n{tl}:"}]
-    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
-    tokenized_input = pipe.tokenizer(input_text, return_tensors="pt")
-    num_input_tokens = len(tokenized_input["input_ids"][0])
-    max_new_tokens = round(num_input_tokens + 0.25 * num_input_tokens)
-    outputs = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
-    translated_text = outputs[0]["generated_text"]
-    print(f"Input chars: {len(input_text)}", f"Input tokens: {num_input_tokens}", f"max_new_tokens: {max_new_tokens}",
-          "Chars to tokens ratio:", round(len(input_text) / num_input_tokens, 2), f"Raw translation: {translated_text}")
-    markers = ["<end_of_turn>", "<|im_end|>", "<|im_start|>assistant"]  # , "\n"
-    for marker in markers:
-        if marker in translated_text:
-            translated_text = translated_text.split(marker)[1].strip()
-    translated_text = translated_text.replace('Answer:', '', 1).strip() if translated_text.startswith('Answer:') else translated_text
-    translated_text = translated_text.split("Translated text:")[0].strip() if "Translated text:" in translated_text else translated_text
-    split_translated_text = translated_text.split('\n', translated_text.count('\n'))
-    translated_text = '\n'.join(split_translated_text[:input_text.count('\n')+1])
-    return translated_text
+    def wingpt(self):
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        # input_json = '{"input_text": self.input_text}'
+        messages = [
+            {"role": "system", "content": f"Translate this to {self.tl} language"},
+            {"role": "user", "content": self.input_text}
+        ]
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+        generated_ids = model.generate(
+            **model_inputs,
+            max_new_tokens=512,
+            temperature=0.1
+        )
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
+        output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        result = output.split('\n')[-1].strip() if '\n' in output else output.strip()
+        return result
+
+    def eurollm(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        prompt = f"{self.sl}: {self.input_text} {self.tl}:"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        outputs = model.generate(**inputs, max_new_tokens=512)
+        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        result = output.rsplit(f'{self.tl}:')[-1].strip() if '\n' in output else output.strip()
+        return result
+
+    def eurollm_instruct(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        text = f'<|im_start|>system\n<|im_end|>\n<|im_start|>user\nTranslate the following {self.sl} source text to {self.tl}:\n{self.sl}: {self.input_text} \n{self.tl}: <|im_end|>\n<|im_start|>assistant\n'
+        inputs = tokenizer(text, return_tensors="pt")
+        outputs = model.generate(**inputs, max_new_tokens=512)
+        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        if f'{self.tl}:' in output:
+            output = output.rsplit(f'{self.tl}:')[-1].strip().replace('assistant\n', '').strip()
+        return output
+
+    def teuken(self):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+        )
+        model = model.to(device).eval()
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            use_fast=False,
+            trust_remote_code=True,
+        )
+        translation_prompt = f"Translate the following text from {self.sl} into {self.tl}: {self.input_text}"
+        messages = [{"role": "User", "content": translation_prompt}]
+        prompt_ids = tokenizer.apply_chat_template(messages, chat_template="EN", tokenize=True, add_generation_prompt=False, return_tensors="pt")
+        prediction = model.generate(
+            prompt_ids.to(model.device),
+            max_length=512,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95,
+            temperature=0.7,
+            num_return_sequences=1,
+        )
+        translation = tokenizer.decode(prediction[0].tolist())
+        return translation
+
+    def unbabel(self):
+        pipe = pipeline("text-generation", model=self.model_name, torch_dtype=torch.bfloat16, device_map="auto")
+        messages = [{"role": "user",
+                     "content": f"Translate the following text from {self.sl} into {self.tl}.\n{self.sl}: {self.input_text}.\n{self.tl}:"}]
+        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+        tokenized_input = pipe.tokenizer(self.input_text, return_tensors="pt")
+        num_input_tokens = len(tokenized_input["input_ids"][0])
+        max_new_tokens = round(num_input_tokens + 0.25 * num_input_tokens)
+        outputs = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
+        translated_text = outputs[0]["generated_text"]
+        print(f"Input chars: {len(self.input_text)}", f"Input tokens: {num_input_tokens}", f"max_new_tokens: {max_new_tokens}",
+              "Chars to tokens ratio:", round(len(self.input_text) / num_input_tokens, 2), f"Raw translation: {translated_text}")
+        markers = ["<end_of_turn>", "<|im_end|>", "<|im_start|>assistant"]  # , "\n"
+        for marker in markers:
+            if marker in translated_text:
+                translated_text = translated_text.split(marker)[1].strip()
+        translated_text = translated_text.replace('Answer:', '', 1).strip() if translated_text.startswith('Answer:') else translated_text
+        translated_text = translated_text.split("Translated text:")[0].strip() if "Translated text:" in translated_text else translated_text
+        split_translated_text = translated_text.split('\n', translated_text.count('\n'))
+        translated_text = '\n'.join(split_translated_text[:self.input_text.count('\n')+1])
+        return translated_text
 
 @spaces.GPU
-def translate_text(input_text: str, sselected_language: str, tselected_language: str, model_name: str) -> tuple[str, str]:
+def translate_text(input_text: str, s_language: str, t_language: str, model_name: str) -> tuple[str, str]:
     """
     Translates the input text from the source language to the target language using a specified model.
 
     Parameters:
     input_text (str): The source text to be translated
-    sselected_language (str): The source language of the input text
-    tselected_language (str): The target language in which the input text is translated
+    s_language (str): The source language of the input text
+    t_language (str): The target language in which the input text is translated
     model_name (str): The selected translation model name
 
     Returns:
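Two decoding details from the new methods, sketched with toy values so no model is needed: unbabel() budgets max_new_tokens at roughly 1.25x the input token count, and wingpt() keeps only the completion by slicing each output sequence at the prompt length.

```python
# (1) unbabel()'s generation budget: ~25% headroom over the input length.
num_input_tokens = 40
max_new_tokens = round(num_input_tokens + 0.25 * num_input_tokens)
print(max_new_tokens)  # 50

# (2) wingpt()'s prompt stripping: causal LMs echo the prompt ids, so slice
#     each output at len(input_ids) before decoding (toy ids shown).
input_ids = [[101, 2023, 2003]]
generated_ids = [[101, 2023, 2003, 7592, 2088]]
completion = [out[len(inp):] for inp, out in zip(input_ids, generated_ids)]
print(completion)  # [[7592, 2088]] -> batch_decode() yields just the translation
```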
@@ -314,16 +352,16 @@ def translate_text(input_text: str, sselected_language: str, tselected_language:
     ("Hallo Welt", "Translated from English to German with Helsinki-NLP.")
     """
 
-    sl = all_langs[sselected_language]
-    tl = all_langs[tselected_language]
-    message_text = f'Translated from {sselected_language} to {tselected_language} with {model_name}'
+    sl = all_langs[s_language]
+    tl = all_langs[t_language]
+    message_text = f'Translated from {s_language} to {t_language} with {model_name}'
     print(message_text)
     try:
         if model_name.startswith("Helsinki-NLP"):
-            translated_text, message_text = HelsinkiNLP(sl, tl, input_text)
+            translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP()
 
         elif model_name == 'Argos':
-            translated_text = argos(sl, tl, input_text)
+            translated_text = Translators(model_name, sl, tl, input_text).argos()
 
         elif model_name == 'Google':
             translated_text = Translators(model_name, sl, tl, input_text).google()
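translate_text() resolves the dropdowns' display names to ISO 639-1 codes up front; the code-based backends receive sl/tl, while the prompt-based LLM backends below keep the plain language names. A minimal sketch (the stand-in all_langs here is hypothetical):

```python
favourite_langs = {"German": "de", "Romanian": "ro", "English": "en"}
all_langs = {**favourite_langs, "French": "fr"}  # stand-in for languagecodes.iso_languages

s_language, t_language = "English", "German"     # dropdown labels
sl, tl = all_langs[s_language], all_langs[t_language]
print(sl, tl)  # en de -> used by code-based backends like Helsinki-NLP
```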
@@ -332,44 +370,47 @@ def translate_text(input_text: str, sselected_language: str, tselected_language:
             translated_text = Translators(model_name, sl, tl, input_text).mtom()
 
         elif model_name.startswith('t5'):
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).tfive()
+            translated_text = Translators(model_name, s_language, t_language, input_text).tfive()
 
         elif 'flan' in model_name.lower():
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).flan()
+            translated_text = Translators(model_name, s_language, t_language, input_text).flan()
 
         elif 'mt0' in model_name.lower():
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).bigscience()
+            translated_text = Translators(model_name, s_language, t_language, input_text).bigscience()
 
         elif 'bloomz' in model_name.lower():
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).bloomz()
+            translated_text = Translators(model_name, s_language, t_language, input_text).bloomz()
 
         elif 'nllb' in model_name.lower():
-            nnlbsl, nnlbtl = languagecodes.nllb_language_codes[sselected_language], languagecodes.nllb_language_codes[tselected_language]
+            nnlbsl, nnlbtl = languagecodes.nllb_language_codes[s_language], languagecodes.nllb_language_codes[t_language]
             translated_text = Translators(model_name, nnlbsl, nnlbtl, input_text).nllb()
 
         elif model_name == "facebook/mbart-large-50-many-to-many-mmt":
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).mbart_many_to_many()
+            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_many()
 
         elif model_name == "facebook/mbart-large-50-one-to-many-mmt":
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).mbart_one_to_many()
+            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_one_to_many()
 
         elif model_name == "facebook/mbart-large-50-many-to-one-mmt":
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).mbart_many_to_one()
+            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_one()
 
         elif 'teuken' in model_name.lower():
-            translated_text = teuken(model_name, sselected_language, tselected_language, input_text)
+            translated_text = Translators(model_name, s_language, t_language, input_text).teuken()
 
         elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
-            translated_text = eurollm_instruct(model_name, sselected_language, tselected_language, input_text)
+            translated_text = Translators(model_name, s_language, t_language, input_text).eurollm_instruct()
 
         elif model_name == "utter-project/EuroLLM-1.7B":
-            translated_text = eurollm(model_name, sselected_language, tselected_language, input_text)
+            translated_text = Translators(model_name, s_language, t_language, input_text).eurollm()
 
         elif 'Unbabel' in model_name:
-            translated_text = unbabel(model_name, sselected_language, tselected_language, input_text)
+            translated_text = Translators(model_name, s_language, t_language, input_text).unbabel()
 
         elif model_name == "HuggingFaceTB/SmolLM3-3B":
-            translated_text = Translators(model_name, sselected_language, tselected_language, input_text).smollm()
+            translated_text = Translators(model_name, s_language, t_language, input_text).smollm()
+
+        elif model_name == "winninghealth/WiNGPT-Babel-2":
+            translated_text = Translators(model_name, s_language, t_language, input_text).wingpt()
 
     except Exception as error:
         translated_text = error
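The dispatch above mixes prefix, substring, and exact-name tests, so their order matters. A condensed sketch of the pattern (abridged, not the app's full table):

```python
def pick_backend(model_name: str) -> str:
    # Prefix/substring tests run before exact-name tests, mirroring the
    # if/elif chain above.
    if model_name.startswith("Helsinki-NLP"):
        return "HelsinkiNLP"
    if "teuken" in model_name.lower():
        return "teuken"
    if "Unbabel" in model_name:
        return "unbabel"
    if model_name == "winninghealth/WiNGPT-Babel-2":
        return "wingpt"
    raise ValueError(f"unknown model: {model_name}")

print(pick_backend("Unbabel/Tower-Plus-2B"))  # unbabel
```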
@@ -389,22 +430,22 @@ def create_interface():
     input_text = gr.Textbox(label="Enter text to translate:", placeholder="Type your text here, maximum 512 tokens")
 
     with gr.Row():
-        sselected_language = gr.Dropdown(choices=options, value=options[0], label="Source language", interactive=True)
-        tselected_language = gr.Dropdown(choices=options, value=options[1], label="Target language", interactive=True)
+        s_language = gr.Dropdown(choices=options, value=options[0], label="Source language", interactive=True)
+        t_language = gr.Dropdown(choices=options, value=options[1], label="Target language", interactive=True)
         swap_button = gr.Button("Swap Languages", size="md")
-        swap_button.click(fn=swap_languages, inputs=[sselected_language, tselected_language], outputs=[sselected_language, tselected_language], api_name=False, show_api=False)
+        swap_button.click(fn=swap_languages, inputs=[s_language, t_language], outputs=[s_language, t_language], api_name=False, show_api=False)
 
     model_name = gr.Dropdown(choices=models, label=f"Select a model. Default is {models[0]}.", value=models[0], interactive=True)
     translate_button = gr.Button("Translate")
 
     translated_text = gr.Textbox(label="Translated text:", placeholder="Display field for translation", interactive=False, show_copy_button=True)
     message_text = gr.Textbox(label="Messages:", placeholder="Display field for status and error messages", interactive=False,
-                              value=f'Default translation settings: from {sselected_language.value} to {tselected_language.value} with {model_name.value}.')
+                              value=f'Default translation settings: from {s_language.value} to {t_language.value} with {model_name.value}.')
     allmodels = gr.HTML(label="Model links:", value=', '.join([f'<a href="https://huggingface.co/{model}">{model}</a>' for model in models]))
 
     translate_button.click(
         fn=translate_text,
-        inputs=[input_text, sselected_language, tselected_language, model_name],
+        inputs=[input_text, s_language, t_language, model_name],
         outputs=[translated_text, message_text]
     )
 
 