Update README.md
README.md

## How to run this model (batch syntax)

```
import os
import ctranslate2
import transformers

#set defaults
home_path=os.path.expanduser('~')
model_path=home_path+'/Downloads/models/models--facebook--m2m100_418M_ctranslate2'
#model_path=home_path+'/Downloads/models/models--facebook--m2m100_1.2B_ctranslate2'

#available languages list -> https://huggingface.co/facebook/m2m100_1.2B <-
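#(optional) the tokenizer loaded below also exposes the supported codes via its lang_code_to_token mapping, e.g.:
#print(sorted(tokenizer.lang_code_to_token.keys()))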
source_language_code='ja'
target_language_code='en'

device='cpu'
#device='cuda'

#load data
string1='イキリカメラマン'
string2='おかあさん'
string3='人生はチョコレートの箱のようなものです。彼らは皆毒殺されています。'
list_to_translate=[string1,string2,string3]

#load model and tokenizer
translator=ctranslate2.Translator(model_path,device=device)
tokenizer=transformers.AutoTokenizer.from_pretrained(model_path,clean_up_tokenization_spaces=True)

#configure languages
tokenizer.src_lang=source_language_code
target_language_token=[tokenizer.lang_code_to_token[target_language_code]]

#encode
encoded_list=[]
for text in list_to_translate:
    encoded_list.append(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))

#translate
#https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
translated_list=translator.translate_batch(encoded_list, target_prefix=[target_language_token]*len(encoded_list))

#decode
for counter,tokens in enumerate(translated_list):
    translated_list[counter]=tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:]))

#output
for text in translated_list:
    print(text)
```
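
Switching to the commented-out `device='cuda'` line above is all that is needed to run on a GPU. As an optional refinement (a sketch, not part of the example above), CTranslate2's `Translator` also accepts a `compute_type` argument, and `translate_batch` accepts decoding options such as `beam_size` and `max_batch_size`. The snippet below reuses the `model_path`, `encoded_list`, and `target_language_token` variables defined above:

```
#optional variation: GPU execution with reduced precision and explicit decoding options
translator=ctranslate2.Translator(model_path,device='cuda',compute_type='float16')

translated_list=translator.translate_batch(
    encoded_list,
    target_prefix=[target_language_token]*len(encoded_list),
    beam_size=4,        #beam search width
    max_batch_size=32,  #split very large inputs into smaller sub-batches
)
```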

A [functional programming](https://docs.python.org/3/howto/functional.html) version of the same example:

```
import os
import ctranslate2
import transformers

#set defaults
home_path=os.path.expanduser('~')
model_path=home_path+'/Downloads/models/models--facebook--m2m100_418M_ctranslate2'
#model_path=home_path+'/Downloads/models/models--facebook--m2m100_1.2B_ctranslate2'

#available languages list -> https://huggingface.co/facebook/m2m100_1.2B <-
source_language_code='ja'
target_language_code='es'

device='cpu'
#device='cuda'

#load data
string1='イキリカメラマン'
string2='おかあさん'
string3='人生はチョコレートの箱のようなものです。彼らは皆毒殺されています。'
list_to_translate=[string1,string2,string3]

#load model and tokenizer
translator=ctranslate2.Translator(model_path,device=device)
tokenizer=transformers.AutoTokenizer.from_pretrained(model_path,clean_up_tokenization_spaces=True)
tokenizer.src_lang=source_language_code

#invoke witchcraft: encode, translate, and decode in a single expression
translated_list=[tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:])) for tokens in translator.translate_batch([tokenizer.convert_ids_to_tokens(tokenizer.encode(i)) for i in list_to_translate], target_prefix=[[tokenizer.lang_code_to_token[target_language_code]]]*len(list_to_translate))]

#output
for text in translated_list:
    print(text)
```
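
Both scripts above hard-code the input strings and the language pair. If you want to reuse the same steps elsewhere, they can be wrapped in a small function. The sketch below is built only from the calls already shown; the function name `translate_m2m_batch` is made up for illustration:

```
def translate_m2m_batch(texts,source_language_code,target_language_code,translator,tokenizer):
    #encode each input string into tokens with the source language set
    tokenizer.src_lang=source_language_code
    encoded=[tokenizer.convert_ids_to_tokens(tokenizer.encode(text)) for text in texts]
    #translate, prefixing every entry with the target language token
    target_prefix=[[tokenizer.lang_code_to_token[target_language_code]]]*len(encoded)
    results=translator.translate_batch(encoded,target_prefix=target_prefix)
    #decode each best hypothesis, dropping the leading language token
    return [tokenizer.decode(tokenizer.convert_tokens_to_ids(r.hypotheses[0][1:])) for r in results]

#example usage with the translator and tokenizer loaded above
#print(translate_m2m_batch(list_to_translate,'ja','en',translator,tokenizer))
```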