Moleys committed on
Commit 9278a67 · verified · 1 Parent(s): 39829ff

Update app.py

Files changed (1)
  1. app.py +38 -23
app.py CHANGED
@@ -2,50 +2,65 @@ import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import gradio as gr

- # Load the model and tokenizer
model_name = "b3x0m/hirashiba-xomdich-tokenizer"
device = "cuda" if torch.cuda.is_available() else "cpu"

- tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+
+ # Make sure a pad_token exists so padding does not fail
+ if tokenizer.pad_token is None:
+     # Prefer the eos_token as pad if one is available
+     if tokenizer.eos_token is not None:
+         tokenizer.pad_token = tokenizer.eos_token
+     else:
+         tokenizer.add_special_tokens({"pad_token": "<pad>"})
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
+ # If a new pad_token was just added, update the model config to match
+ model.config.pad_token_id = tokenizer.pad_token_id

def translate_text(input_text):
-     lines = input_text.split('\n')  # Split into individual lines
+     lines = input_text.split('\n')
    translated_lines = []
-
+
    for line in lines:
        raw_text = line.strip()
        if not raw_text:
-             translated_lines.append('')  # Keep blank lines
+             translated_lines.append('')
            continue
-
-         # Tokenize input
-         inputs = tokenizer(raw_text, return_tensors="pt", padding=True, truncation=True).to(device)
-
-         # Translate with the model (no gradient computation needed)
+
+         # Do NOT return token_type_ids, to avoid errors
+         inputs = tokenizer(
+             raw_text,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=1024,  # avoid truncation warnings
+             return_token_type_ids=False
+         ).to(device)
+
        with torch.no_grad():
-             output_tokens = model.generate(**inputs, max_length=512)
-
-         # Decode the result
+             # Use max_new_tokens instead of max_length when generating
+             output_tokens = model.generate(
+                 **inputs,
+                 max_new_tokens=512
+             )
+
        translated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        translated_lines.append(translated_text)
-
+
    return '\n'.join(translated_lines)

if __name__ == '__main__':
    with gr.Blocks() as app:
        gr.Markdown('## Chinese to Vietnamese Translation')
-
+
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(label='Input Chinese Text', lines=5, placeholder='Enter Chinese text here...')
                translate_button = gr.Button('Translate')
                output_text = gr.Textbox(label='Output Vietnamese Text', lines=5, interactive=False)
-
-         translate_button.click(
-             fn=translate_text,
-             inputs=input_text,
-             outputs=output_text
-         )
-
-         app.launch()
+
+         translate_button.click(fn=translate_text, inputs=input_text, outputs=output_text)
+
+         app.launch()
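
For a quick check outside the Gradio UI, here is a minimal sketch of how the updated function could be exercised after this commit; it is not part of the diff, the sample input string is made up for illustration, and importing app assumes the checkpoint downloads successfully (app.launch() stays guarded by __main__, so no UI starts on import).

    # Minimal sketch, not part of the commit: call translate_text directly.
    # Importing app runs the module-level tokenizer/model setup from this diff.
    from app import translate_text, tokenizer

    # The fallback above guarantees a pad token is defined.
    assert tokenizer.pad_token_id is not None

    # Blank lines are preserved; each non-empty line is translated separately.
    sample = "你好，世界。\n\n这是第二行。"  # made-up example input
    print(translate_text(sample))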