entai2965 committed on
Commit
c1ebd41
·
verified ·
1 Parent(s): 536bfe4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +89 -0
README.md CHANGED
@@ -175,3 +175,92 @@ target = results[0].hypotheses[0][1:]
175
 
176
  print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
177
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
177
  ```
178
+
179
+ ## How to run this model (batch syntax)
180
+
181
+ ```
182
+ import os
183
+ import ctranslate2
184
+ import transformers
185
+
186
+ #set defaults
187
+ home_path=os.path.expanduser('~')
188
+ model_path=home_path+'/Downloads/models/models--facebook--m2m100_418M_ctranslate2'
189
+ #model_path=home_path+'/Downloads/models/models--facebook--m2m100_1.2B_ctranslate2'
190
+
191
+ #available languages list -> https://huggingface.co/facebook/m2m100_1.2B <-
192
+ source_language_code='ja'
193
+ target_language_code='en'
194
+
195
+ device='cpu'
196
+ #device='cuda'
197
+
198
+ #load data
199
+ string1='イキリカメラマン'
200
+ string2='おかあさん'
201
+ string3='人生はチョコレートの箱のようなものです。彼らは皆毒殺されています。'
202
+ list_to_translate=[string1,string2,string3]
203
+
204
+ #load model and tokenizer
205
+ translator=ctranslate2.Translator(model_path,device=device)
206
+ tokenizer=transformers.AutoTokenizer.from_pretrained(model_path,clean_up_tokenization_spaces=True)
207
+
208
+ #configure languages
209
+ tokenizer.src_lang=source_language_code
210
+ target_language_token=[tokenizer.lang_code_to_token[target_language_code]]
211
+
212
+ #encode
213
+ encoded_list=[]
214
+ for text in list_to_translate:
215
+     encoded_list.append(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
216
+
217
+ #translate
218
+ #https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
219
+ translated_list=translator.translate_batch(encoded_list, target_prefix=[target_language_token]*len(encoded_list))
220
+
221
+ #decode
222
+ for counter,tokens in enumerate(translated_list):
223
+     translated_list[counter]=tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:]))
224
+
225
+ #output
226
+ for text in translated_list:
227
+     print(text)
228
+ ```
229
+
230
+ [Functional programming](https://docs.python.org/3/howto/functional.html) version
231
+
232
+ ```
233
+ import os
234
+ import ctranslate2
235
+ import transformers
236
+
237
+ #set defaults
238
+ home_path=os.path.expanduser('~')
239
+ model_path=home_path+'/Downloads/models/models--facebook--m2m100_418M_ctranslate2'
240
+ #model_path=home_path+'/Downloads/models/models--facebook--m2m100_1.2B_ctranslate2'
241
+
242
+ #available languages list -> https://huggingface.co/facebook/m2m100_1.2B <-
243
+ source_language_code='ja'
244
+ target_language_code='es'
245
+
246
+ device='cpu'
247
+ #device='cuda'
248
+
249
+ #load data
250
+ string1='イキリカメラマン'
251
+ string2='おかあさん'
252
+ string3='人生はチョコレートの箱のようなものです。彼らは皆毒殺されています。'
253
+ list_to_translate=[string1,string2,string3]
254
+
255
+ #load model and tokenizer
256
+ translator=ctranslate2.Translator(model_path,device=device)
257
+ tokenizer=transformers.AutoTokenizer.from_pretrained(model_path,clean_up_tokenization_spaces=True)
258
+ tokenizer.src_lang=source_language_code
259
+
260
+ #invoke witchcraft
261
+ translated_list=[tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:])) for tokens in translator.translate_batch([tokenizer.convert_ids_to_tokens(tokenizer.encode(i)) for i in list_to_translate], target_prefix=[[tokenizer.lang_code_to_token[target_language_code]]]*len(list_to_translate))]
262
+
263
+ #output
264
+ for text in translated_list:
265
+     print(text)
266
+ ```