from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

from flores200_codes import flores_codes

# Cache of loaded models and tokenizers, keyed as "<model_name>_model" / "<model_name>_tokenizer".
model_dict = {}


def load_models(model_name: str):
    # Return the cached model/tokenizer pair if this checkpoint was already loaded.
    if model_name + "_model" in model_dict:
        return model_dict

    model_name_dict = {
        "nllb-1.3B": "facebook/nllb-200-1.3B",
        "nllb-distilled-1.3B": "facebook/nllb-200-distilled-1.3B",
        "nllb-3.3B": "facebook/nllb-200-3.3B",
    }[model_name]

    print(f"\tLoading model: {model_name}")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name_dict)
    tokenizer = AutoTokenizer.from_pretrained(model_name_dict)

    model_dict[model_name + "_model"] = model
    model_dict[model_name + "_tokenizer"] = tokenizer

    return model_dict


def translation(model_name: str, source: str, target: str, text: str):
    # Load the requested checkpoint (or fetch it from the cache).
    model_dict = load_models(model_name)

    # Map human-readable language names to FLORES-200 codes (e.g. "English" -> "eng_Latn").
    source = flores_codes[source]
    target = flores_codes[target]

    model = model_dict[model_name + "_model"]
    tokenizer = model_dict[model_name + "_tokenizer"]

    translator = pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang=source,
        tgt_lang=target,
    )
    output = translator(text, max_length=400)

    result = {
        "source": source,
        "target": target,
        "result": output[0]["translation_text"],
    }
    return result


NLLB_EXAMPLES = [
    ["nllb-distilled-1.3B", "English", "Shan", "Hello, how are you today?"],
    ["nllb-distilled-1.3B", "Shan", "English", "မႂ်ႇသုင်ၶႃႈ ယူႇလီယူႇၶႃႈၼေႃႈ"],
    [
        "nllb-distilled-1.3B",
        "English",
        "Shan",
        "Forming Myanmar’s New Political System Will Remain an Ideal but Never in Practicality",
    ],
]
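

# Minimal usage sketch (an illustration, not part of the original app): run the first
# NLLB_EXAMPLES row through translation(). This assumes flores_codes maps the names
# "English" and "Shan" to their FLORES-200 codes (e.g. "eng_Latn" / "shn_Mymr").
if __name__ == "__main__":
    demo_model, demo_src, demo_tgt, demo_text = NLLB_EXAMPLES[0]
    print(translation(demo_model, demo_src, demo_tgt, demo_text))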