TiberiuCristianLeon committed
Commit 92c9491 · verified · 1 parent: 21077c7

Update app.py

Files changed (1):
  1. app.py +127 -128

app.py CHANGED
@@ -64,6 +64,49 @@ def argos(sl, tl, input_text):
         print(error)
     return translated_text
 
+def HelsinkiNLPAutoTokenizer(sl, tl, input_text):
+    if model_name == "Helsinki-NLP":
+        message_text = f'Translated from {sl} to {tl} with {model_name}.'
+        try:
+            model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = model_to_cuda(AutoModelForSeq2SeqLM.from_pretrained(model_name))
+        except EnvironmentError:
+            try:
+                model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
+                tokenizer = AutoTokenizer.from_pretrained(model_name)
+                model = model_to_cuda(AutoModelForSeq2SeqLM.from_pretrained(model_name))
+                input_ids = tokenizer.encode(input_text, return_tensors="pt")
+                output_ids = model.generate(input_ids, max_length=512)
+                translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+                return translated_text, message_text
+            except EnvironmentError as error:
+                return f"Error finding model: {model_name}! Try other available language combination.", error
+
+def HelsinkiNLP(sl, tl, input_text):
+    try:  # Standard bilingual model
+        model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
+        pipe = pipeline("translation", model=model_name, device=-1)
+        translation = pipe(input_text)
+        return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
+    except EnvironmentError:
+        try:  # Tatoeba models
+            model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
+            pipe = pipeline("translation", model=model_name, device=-1)
+            translation = pipe(input_text)
+            return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
+        except EnvironmentError as error:
+            try:  # Last resort: multi to multi
+                model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"
+                pipe = pipeline("translation", model=model_name)
+                tl = 'deu'  # Hard coded for now for testing
+                translation = pipe(f'>>{tl}<< {input_text}')
+                return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
+            except Exception as error:
+                return f"Error translating with model: {model_name}! Try other available language combination.", error
+    except KeyError as error:
+        return f"Error: Translation direction {sl} to {tl} is not supported by Helsinki Translation Models", error
+
 class Translators:
     def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
         self.model_name = model_name
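The two helpers re-added above are plain module-level functions, so they can be exercised directly. A minimal sketch of a call, assuming the surrounding app.py imports (pipeline, AutoTokenizer) are in scope and that a Helsinki-NLP opus-mt checkpoint exists for the pair; the language codes and sample text are illustrative, not values from the commit:

    # Sketch only: "en"/"de" and the sentence are placeholders.
    text, note = HelsinkiNLP("en", "de", "Hello world!")
    print(text)  # translation from the first model that loads
    print(note)  # e.g. 'Translated from en to de with Helsinki-NLP/opus-mt-en-de.'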
@@ -109,57 +152,75 @@ class Translators:
         translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
         return translated_text
 
-    def mtom(model_name, sl, tl, input_text):
-        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
-        model = M2M100ForConditionalGeneration.from_pretrained(model_name)
-        tokenizer = M2M100Tokenizer.from_pretrained(model_name)
-        tokenizer.src_lang = sl
-        encoded = tokenizer(input_text, return_tensors="pt")
-        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tl))
-        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-
-    def HelsinkiNLPAutoTokenizer(sl, tl, input_text):
-        if model_name == "Helsinki-NLP":
-            message_text = f'Translated from {sl} to {tl} with {model_name}.'
-            try:
-                model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
-                tokenizer = AutoTokenizer.from_pretrained(model_name)
-                model = model_to_cuda(AutoModelForSeq2SeqLM.from_pretrained(model_name))
-            except EnvironmentError:
-                try:
-                    model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
-                    tokenizer = AutoTokenizer.from_pretrained(model_name)
-                    model = model_to_cuda(AutoModelForSeq2SeqLM.from_pretrained(model_name))
-                    input_ids = tokenizer.encode(prompt, return_tensors="pt")
-                    output_ids = model.generate(input_ids, max_length=512)
-                    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-                    return translated_text, message_text
-                except EnvironmentError as error:
-                    return f"Error finding model: {model_name}! Try other available language combination.", error
-
-    def HelsinkiNLP(sl, tl, input_text):
-        try:  # Standard bilingual model
-            model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
-            pipe = pipeline("translation", model=model_name, device=-1)
-            translation = pipe(input_text)
-            return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
-        except EnvironmentError:
-            try:  # Tatoeba models
-                model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
-                pipe = pipeline("translation", model=model_name, device=-1)
-                translation = pipe(input_text)
-                return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
-            except EnvironmentError as error:
-                try:  # Last resort: multi to multi
-                    model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"
-                    pipe = pipeline("translation", model=model_name)
-                    tl = 'deu'  # Hard coded for now for testing
-                    translation = pipe(f'>>{tl}<< {input_text}')
-                    return translation[0]['translation_text'], f'Translated from {sl} to {tl} with {model_name}.'
-                except Exception as error:
-                    return f"Error translating with model: {model_name}! Try other available language combination.", error
-        except KeyError as error:
-            return f"Error: Translation direction {sl} to {tl} is not supported by Helsinki Translation Models", error
-
+    def mbart_many_to_many(self):
+        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
+        # translate source to target
+        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(
+            **encoded,
+            forced_bos_token_id=tokenizer.lang_code_to_id[languagecodes.mbart_large_languages[self.tl]]
+        )
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def mbart_one_to_many(self):
+        # translate from English
+        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name, src_lang="en_XX")
+        model_inputs = tokenizer(self.input_text, return_tensors="pt")
+        langid = languagecodes.mbart_large_languages[self.tl]
+        generated_tokens = model.generate(
+            **model_inputs,
+            forced_bos_token_id=tokenizer.lang_code_to_id[langid]
+        )
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def mbart_many_to_one(self):
+        # translate to English
+        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
+        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded)
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def mtom(self):
+        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
+        tokenizer.src_lang = self.sl
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def bigscience(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
+        inputs = tokenizer.encode(f"Translate to {self.tl}: {self.input_text}.", return_tensors="pt")
+        outputs = model.generate(inputs)
+        translation = tokenizer.decode(outputs[0])
+        translation = translation.replace('<pad> ', '').replace('</s>', '')
+        return translation
+
+    def bloomz(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        inputs = tokenizer.encode(f"Translate from {self.sl} to {self.tl}: {self.input_text}. Translation:", return_tensors="pt")
+        outputs = model.generate(inputs)
+        translation = tokenizer.decode(outputs[0])
+        translation = translation.replace('<pad> ', '').replace('</s>', '')
+        return translation
+
+    def nllb(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, src_lang=self.sl)
+        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name, device_map="auto")
+        translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
+        translated_text = translator(self.input_text, max_length=512)
+        return translated_text[0]['translation_text']
 
 def teuken(model_name, sl, tl, input_text):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
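With this hunk, the mBART, M2M100, mt0, bloomz and NLLB paths all become instance methods, so every call site reduces to constructing a Translators and picking a method. A minimal sketch; the facebook/m2m100_418M checkpoint and the language codes are assumptions for illustration, not values from the commit:

    # Sketch only: checkpoint name and codes are placeholders.
    t = Translators("facebook/m2m100_418M", "en", "fr", "Hello world!")
    print(t.mtom())  # generates with the target language forced as BOS token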
@@ -189,24 +250,6 @@ def teuken(model_name, sl, tl, input_text):
     translation = tokenizer.decode(prediction[0].tolist())
     return translation
 
-def bigscience(model_name, sl, tl, input_text):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-    inputs = tokenizer.encode(f"Translate to {tl}: {input_text}.", return_tensors="pt")
-    outputs = model.generate(inputs)
-    translation = tokenizer.decode(outputs[0])
-    translation = translation.replace('<pad> ', '').replace('</s>', '')
-    return translation
-
-def bloomz(model_name, sl, tl, input_text):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    inputs = tokenizer.encode(f"Translate from {sl} to {tl}: {input_text}. Translation:", return_tensors="pt")
-    outputs = model.generate(inputs)
-    translation = tokenizer.decode(outputs[0])
-    translation = translation.replace('<pad> ', '').replace('</s>', '')
-    return translation
-
 def eurollm(model_name, sl, tl, input_text):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
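The deleted bigscience() and bloomz() survive as Translators methods (added in the earlier hunk). Both are plain prompt-completion calls rather than tagged translation models; the prompts they build have these shapes, with illustrative values:

    # Prompt shapes as built by the methods; the values are placeholders.
    sl, tl, input_text = "English", "German", "Hello world!"
    mt0_prompt = f"Translate to {tl}: {input_text}."
    bloomz_prompt = f"Translate from {sl} to {tl}: {input_text}. Translation:"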
@@ -228,13 +271,6 @@ def eurollm_instruct(model_name, sl, tl, input_text):
     output = output.rsplit(f'{tl}:')[-1].strip().replace('assistant\n', '')
     return output
 
-def nllb(model_name, sl, tl, input_text):
-    tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=sl)
-    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
-    translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=sl, tgt_lang=tl)
-    translated_text = translator(input_text, max_length=512)
-    return translated_text[0]['translation_text']
-
 def unbabel(model_name, sl, tl, input_text):
     pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")
     messages = [{"role": "user",
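The deleted nllb() likewise moves onto Translators; note that the dispatcher below hands it FLORES-200 codes looked up in languagecodes.nllb_language_codes. A minimal sketch, where the facebook/nllb-200-distilled-600M checkpoint and the eng_Latn/deu_Latn codes are assumptions for illustration:

    # Sketch only: checkpoint and FLORES-200 codes are assumptions.
    out = Translators("facebook/nllb-200-distilled-600M",
                      "eng_Latn", "deu_Latn", "Hello world!").nllb()
    print(out)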
@@ -256,43 +292,6 @@ def unbabel(model_name, sl, tl, input_text):
     split_translated_text = translated_text.split('\n', translated_text.count('\n'))
     translated_text = '\n'.join(split_translated_text[:input_text.count('\n')+1])
     return translated_text
-
-def mbart_many_to_many(model_name, sl, tl, input_text):
-    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-    model = MBartForConditionalGeneration.from_pretrained(model_name)
-    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
-    # translate source to target
-    tokenizer.src_lang = languagecodes.mbart_large_languages[sl]
-    encoded = tokenizer(input_text, return_tensors="pt")
-    generated_tokens = model.generate(
-        **encoded,
-        forced_bos_token_id=tokenizer.lang_code_to_id[languagecodes.mbart_large_languages[tl]]
-    )
-    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-
-def mbart_one_to_many(model_name, sl, tl, input_text):
-    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-    article_en = input_text
-    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
-    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
-    model_inputs = tokenizer(article_en, return_tensors="pt")
-    # translate from English
-    langid = languagecodes.mbart_large_languages[tl]
-    generated_tokens = model.generate(
-        **model_inputs,
-        forced_bos_token_id=tokenizer.lang_code_to_id[langid]
-    )
-    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-
-def mbart_many_to_one(model_name, sl, tl, input_text):
-    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
-    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
-    # translate to English
-    tokenizer.src_lang = languagecodes.mbart_large_languages[sl]
-    encoded = tokenizer(input_text, return_tensors="pt")
-    generated_tokens = model.generate(**encoded)
-    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
 @spaces.GPU
 def translate_text(input_text: str, sselected_language: str, tselected_language: str, model_name: str) -> tuple[str, str]:
@@ -330,42 +329,42 @@ def translate_text(input_text: str, sselected_language: str, tselected_language:
         translated_text = Translators(model_name, sl, tl, input_text).google()
 
     elif "m2m" in model_name.lower():
-        translated_text = mtom(model_name, sl, tl, input_text)
-
-    elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
-        translated_text = eurollm_instruct(model_name, sselected_language, tselected_language, input_text)
-
-    elif model_name == "utter-project/EuroLLM-1.7B":
-        translated_text = eurollm(model_name, sselected_language, tselected_language, input_text)
+        translated_text = Translators(model_name, sl, tl, input_text).mtom()
 
     elif model_name.startswith('t5'):
         translated_text = Translators(model_name, sselected_language, tselected_language, input_text).tfive()
 
     elif 'flan' in model_name.lower():
         translated_text = Translators(model_name, sselected_language, tselected_language, input_text).flan()
 
-    elif 'teuken' in model_name.lower():
-        translated_text = teuken(model_name, sselected_language, tselected_language, input_text)
-
     elif 'mt0' in model_name.lower():
-        translated_text = bigscience(model_name, sselected_language, tselected_language, input_text)
+        translated_text = Translators(model_name, sselected_language, tselected_language, input_text).bigscience()
 
     elif 'bloomz' in model_name.lower():
-        translated_text = bloomz(model_name, sselected_language, tselected_language, input_text)
+        translated_text = Translators(model_name, sselected_language, tselected_language, input_text).bloomz()
 
     elif 'nllb' in model_name.lower():
         nnlbsl, nnlbtl = languagecodes.nllb_language_codes[sselected_language], languagecodes.nllb_language_codes[tselected_language]
-        translated_text = nllb(model_name, nnlbsl, nnlbtl, input_text)
+        translated_text = Translators(model_name, nnlbsl, nnlbtl, input_text).nllb()
 
     elif model_name == "facebook/mbart-large-50-many-to-many-mmt":
-        translated_text = mbart_many_to_many(model_name, sselected_language, tselected_language, input_text)
+        translated_text = Translators(model_name, sselected_language, tselected_language, input_text).mbart_many_to_many()
 
     elif model_name == "facebook/mbart-large-50-one-to-many-mmt":
-        translated_text = mbart_one_to_many(model_name, sselected_language, tselected_language, input_text)
+        translated_text = Translators(model_name, sselected_language, tselected_language, input_text).mbart_one_to_many()
 
     elif model_name == "facebook/mbart-large-50-many-to-one-mmt":
-        translated_text = mbart_many_to_one(model_name, sselected_language, tselected_language, input_text)
+        translated_text = Translators(model_name, sselected_language, tselected_language, input_text).mbart_many_to_one()
+
+    elif 'teuken' in model_name.lower():
+        translated_text = teuken(model_name, sselected_language, tselected_language, input_text)
+
+    elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
+        translated_text = eurollm_instruct(model_name, sselected_language, tselected_language, input_text)
+
+    elif model_name == "utter-project/EuroLLM-1.7B":
+        translated_text = eurollm(model_name, sselected_language, tselected_language, input_text)
 
     elif 'Unbabel' in model_name:
         translated_text = unbabel(model_name, sselected_language, tselected_language, input_text)
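After the reshuffle, the single dispatch entry point is unchanged. A minimal sketch of a round trip through it, where the language names and model ID mirror the UI dropdowns and are placeholders rather than confirmed values, and the return shape follows the annotated tuple[str, str]:

    # Sketch only: arguments are illustrative placeholders.
    translated, note = translate_text("Hello world!", "English", "German",
                                      "facebook/mbart-large-50-many-to-many-mmt")
    print(translated)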