Thore Andresen committed on
Commit
a0f3ffa
·
1 Parent(s): a1f826f

Create demo

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. README.md +3 -3
  3. app.py +46 -0
  4. inference.py +57 -0
  5. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/*
2
+ venv/*
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: NLLB NorthFrisian
3
  emoji: 🚀
4
- colorFrom: indigo
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 4.31.5
8
  app_file: app.py
 
1
  ---
2
+ title: NLLB North Frisian
3
  emoji: 🚀
4
+ colorFrom: yellow
5
+ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 4.31.5
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gio
2
+ from inference import Translator
3
+
4
+
5
+ class TranslationInterface:
6
+ def __init__(self, languages, create_response) -> None:
7
+ self.interface = gio.Interface(
8
+ create_response,
9
+ [
10
+ gio.Dropdown(
11
+ languages,
12
+ label="Input Language",
13
+ info="The language to translate from",
14
+ value=languages[0]
15
+ ),
16
+ gio.Dropdown(
17
+ languages,
18
+ label="Output Language",
19
+ info="The language to translate to",
20
+ value=languages[-1]
21
+ ),
22
+ gio.Textbox(
23
+ label="Input text",
24
+ info="The text to be translated",
25
+ lines=2,
26
+ value="Momme wohnt in Niebüll."
27
+ )
28
+ ],
29
+ "text"
30
+ )
31
+
32
+ def launch(self):
33
+ self.interface.launch()
34
+
35
+ if __name__ == "__main__":
36
+ translator = Translator.from_pretrained('CmdCody/nllb-deu-moo')
37
+
38
+ def generate_translation(src_lang, tgt_lang, message):
39
+ result = translator.translate(message, src_lang=src_lang, tgt_lang=tgt_lang)
40
+ return result[0]
41
+
42
+ interface = TranslationInterface(
43
+ ["deu_Latn", "eng_Latn", "dan_Latn", "moo_Latn"],
44
+ generate_translation
45
+ )
46
+ interface.launch()
inference.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
2
+
3
+
4
+ def create_tokenizer_with_new_lang(model_id, new_lang):
5
+ """
6
+ Add a new language token to the tokenizer vocabulary
7
+ (this should be done each time after its initialization)
8
+ """
9
+ tokenizer = NllbTokenizer.from_pretrained(model_id)
10
+ old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
11
+ tokenizer.lang_code_to_id[new_lang] = old_len-1
12
+ tokenizer.id_to_lang_code[old_len-1] = new_lang
13
+ # always move "mask" to the last position
14
+ tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
15
+
16
+ tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
17
+ tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
18
+ if new_lang not in tokenizer._additional_special_tokens:
19
+ tokenizer._additional_special_tokens.append(new_lang)
20
+ # clear the added token encoder; otherwise a new token may end up there by mistake
21
+ tokenizer.added_tokens_encoder = {}
22
+ tokenizer.added_tokens_decoder = {}
23
+
24
+ return tokenizer
25
+
26
+
27
+ class Translator:
28
+ @classmethod
29
+ def from_pretrained(cls, path, new_lang='moo_Latn'):
30
+ # Does the model need adaptation or not?
31
+ # model, tokenizer = create_model_with_new_lang(
32
+ # model_id=path,
33
+ # new_lang=new_lang,
34
+ # similar_lang='deu_Latn'
35
+ # )
36
+ tokenizer = create_tokenizer_with_new_lang(path, new_lang)
37
+ model = AutoModelForSeq2SeqLM.from_pretrained(path)
38
+ return Translator(model, tokenizer)
39
+
40
+ def __init__(self, model, tokenizer) -> None:
41
+ self.model = model
42
+ self.tokenizer = tokenizer
43
+
44
+ # self.model.cuda()
45
+
46
+ def translate(self, text, src_lang='moo_Latn', tgt_lang='deu_Latn', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
47
+ self.tokenizer.src_lang = src_lang
48
+ self.tokenizer.tgt_lang = tgt_lang
49
+ inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
50
+ result = self.model.generate(
51
+ **inputs.to(self.model.device),
52
+ forced_bos_token_id=self.tokenizer.convert_tokens_to_ids(tgt_lang),
53
+ max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
54
+ num_beams=num_beams,
55
+ **kwargs
56
+ )
57
+ return self.tokenizer.batch_decode(result, skip_special_tokens=True)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ transformers==4.33