techindia2025 committed on
Commit d7ab2f5 · verified · 1 Parent(s): 804953d

Update app.py
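This commit drops the hand-rolled `text </s> src_lang` input formatting and the regex-based output cleanup, and instead routes pre- and post-processing through the official IndicTransToolkit `IndicProcessor`. Below is a minimal, self-contained sketch of the flow the new `translate_text` follows (CPU-only and English→Hindi for illustration; the package, model IDs, and API calls are the ones used in the diff, while variable names and the sample sentence are illustrative only):

```python
# Minimal sketch of the IndicProcessor-based pipeline adopted in this commit.
# Assumes `pip install IndicTransToolkit transformers torch` and access to the HF Hub.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor

MODEL_ID = "ai4bharat/indictrans2-en-indic-1B"  # English -> Indic checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True)
ip = IndicProcessor(inference=True)

sentences = ["When I was young, I used to go to the park every day."]

# Preprocess: IndicProcessor adds the source/target language tags IndicTrans2 expects.
batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva")
inputs = tokenizer(batch, truncation=True, padding="longest", return_tensors="pt")

# Generate with beam search, mirroring the settings used in app.py.
with torch.no_grad():
    generated = model.generate(
        **inputs,
        max_length=128,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
    )

decoded = tokenizer.batch_decode(
    generated, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# Postprocess: strips the language tags and detokenizes into the target script.
translations = ip.postprocess_batch(decoded, lang="hin_Deva")
print(translations[0])
```

In the Space itself the same steps run inside the `@spaces.GPU`-decorated `translate_text`, which additionally selects the en→indic or indic→en model based on the chosen languages and moves it to the GPU for the call.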

Files changed (1)
  1. app.py +46 -82
app.py CHANGED
@@ -2,18 +2,18 @@ import spaces
 import gradio as gr
 import torch
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-import re

 # Model configurations
 INDIC_EN_MODEL = "ai4bharat/indictrans2-indic-en-1B"
 EN_INDIC_MODEL = "ai4bharat/indictrans2-en-indic-1B"

-# Load tokenizers and models on CPU
-print("Loading IndicTrans2 tokenizers...")
 indic_en_tokenizer = AutoTokenizer.from_pretrained(INDIC_EN_MODEL, trust_remote_code=True)
 en_indic_tokenizer = AutoTokenizer.from_pretrained(EN_INDIC_MODEL, trust_remote_code=True)

-print("Loading IndicTrans2 models on CPU...")
 indic_en_model = AutoModelForSeq2SeqLM.from_pretrained(
     INDIC_EN_MODEL,
     trust_remote_code=True,
@@ -28,7 +28,10 @@ en_indic_model = AutoModelForSeq2SeqLM.from_pretrained(
     device_map="cpu"
 )

-# Language mappings for IndicTrans2
 LANGUAGE_CODES = {
     "Assamese": "asm_Beng",
     "Bengali": "ben_Beng",
@@ -58,22 +61,9 @@ LANGUAGE_CODES = {
     "English": "eng_Latn"
 }

-def format_input_for_indictrans2(text, src_lang, tgt_lang, direction):
-    """Format input text according to IndicTrans2 requirements"""
-    text = text.strip()
-
-    if direction == "en_to_indic":
-        # For English to Indic: format as "text </s> src_lang"
-        formatted_input = f"{text} </s> {src_lang}"
-    else: # indic_to_en
-        # For Indic to English: format as "text </s> src_lang"
-        formatted_input = f"{text} </s> {src_lang}"
-
-    return formatted_input
-
 @spaces.GPU(duration=120)
 def translate_text(input_text, source_lang, target_lang, max_length):
-    """Translate text using IndicTrans2 models"""

     if not input_text.strip():
         return "Please enter text to translate."
@@ -85,101 +75,86 @@ def translate_text(input_text, source_lang, target_lang, max_length):
         src_code = LANGUAGE_CODES[source_lang]
         tgt_code = LANGUAGE_CODES[target_lang]

-        # Determine direction and model
         if source_lang == "English" and target_lang != "English":
-            # English to Indic translation
             model_gpu = en_indic_model.to(device)
             tokenizer = en_indic_tokenizer
             direction = "en_to_indic"
         elif source_lang != "English" and target_lang == "English":
-            # Indic to English translation
             model_gpu = indic_en_model.to(device)
             tokenizer = indic_en_tokenizer
             direction = "indic_to_en"
         else:
             return "Please select English as either source or target language (not both)."

-        # Format input properly for IndicTrans2
-        formatted_input = format_input_for_indictrans2(
-            input_text, src_code, tgt_code, direction
         )

-        # Tokenize with proper settings
         inputs = tokenizer(
-            formatted_input,
-            return_tensors="pt",
-            padding=True,
             truncation=True,
-            max_length=256,
-            return_token_type_ids=False
         ).to(device)
 
-        # Remove any unwanted keys
-        if 'token_type_ids' in inputs:
-            del inputs['token_type_ids']
-
-        # Set up generation parameters based on direction
-        if direction == "en_to_indic":
-            # Get target language token for decoder start
-            tgt_lang_token = tokenizer.convert_tokens_to_ids(tgt_code)
-        else:
-            # For Indic to English, use English token
-            tgt_lang_token = tokenizer.convert_tokens_to_ids("eng_Latn")
-
         # Generate translation
         with torch.no_grad():
             generated_tokens = model_gpu.generate(
-                input_ids=inputs['input_ids'],
-                attention_mask=inputs['attention_mask'],
-                decoder_start_token_id=tgt_lang_token if tgt_lang_token != tokenizer.unk_token_id else None,
                 max_length=max_length,
-                min_length=1,
                 num_beams=5,
                 num_return_sequences=1,
                 early_stopping=True,
-                do_sample=False,
                 pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                use_cache=True
             )

-        # Decode output
-        translated_text = tokenizer.decode(
-            generated_tokens[0],
             skip_special_tokens=True,
-            clean_up_tokenization_spaces=True
         )

-        # Clean up the output
-        # Remove language tags and unwanted tokens
-        cleaned_output = re.sub(r'<.*?>', '', translated_text)
-        cleaned_output = cleaned_output.strip()

         # Move model back to CPU
         model_gpu.cpu()
         torch.cuda.empty_cache()

-        return cleaned_output if cleaned_output else "Translation failed. Please try again."

     except Exception as e:
-        # Clean up GPU memory in case of error
         if 'model_gpu' in locals():
             model_gpu.cpu()
             torch.cuda.empty_cache()
         return f"Error during translation: {str(e)}"

 # Create Gradio interface
-with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🇮🇳 IndicTrans2 - Official AI4Bharat Translator

     High-quality neural machine translation between English and 22 Indian languages.

     **Supported Languages**: Assamese, Bengali, Bodo, Dogri, Gujarati, Hindi, Kannada, Kashmiri,
     Konkani, Maithili, Malayalam, Manipuri, Marathi, Nepali, Odia, Punjabi, Sanskrit, Santali,
     Sindhi, Tamil, Telugu, Urdu.
-
-    **Note**: Select English as either source OR target language (not both).
     """)

     with gr.Row():
@@ -197,8 +172,6 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
             label="Source Language"
         )

-        swap_btn = gr.Button("⇄", size="sm")
-
         target_lang = gr.Dropdown(
             choices=list(LANGUAGE_CODES.keys()),
             value="Hindi",
@@ -222,17 +195,17 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
             interactive=False
         )

-        clear_btn = gr.Button("Clear All", variant="secondary")

-    # Examples that work with the corrected format
-    gr.Markdown("### 💡 Example Translations:")

     examples = [
-        ["Hello, how are you?", "English", "Hindi", 64],
-        ["Good morning, everyone!", "English", "Bengali", 64],
-        ["आपका नाम क्या है?", "Hindi", "English", 64],
-        ["আপনি কেমন আছেন?", "Bengali", "English", 64],
-        ["Technology is amazing.", "English", "Tamil", 96]
     ]

     gr.Examples(
@@ -243,9 +216,6 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
     )

     # Event handlers
-    def swap_languages(src, tgt):
-        return tgt, src
-
     def clear_all():
         return "", ""

@@ -255,12 +225,6 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
         outputs=output_text
     )

-    swap_btn.click(
-        swap_languages,
-        inputs=[source_lang, target_lang],
-        outputs=[source_lang, target_lang]
-    )
-
     clear_btn.click(
         clear_all,
         outputs=[input_text, output_text]
 
@@ -2,18 +2,18 @@ import spaces
 import gradio as gr
 import torch
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from IndicTransToolkit.processor import IndicProcessor

 # Model configurations
 INDIC_EN_MODEL = "ai4bharat/indictrans2-indic-en-1B"
 EN_INDIC_MODEL = "ai4bharat/indictrans2-en-indic-1B"

+print("Loading IndicTrans2 models...")
+# Load tokenizers
 indic_en_tokenizer = AutoTokenizer.from_pretrained(INDIC_EN_MODEL, trust_remote_code=True)
 en_indic_tokenizer = AutoTokenizer.from_pretrained(EN_INDIC_MODEL, trust_remote_code=True)

+# Load models on CPU
 indic_en_model = AutoModelForSeq2SeqLM.from_pretrained(
     INDIC_EN_MODEL,
     trust_remote_code=True,
@@ -28,7 +28,10 @@ en_indic_model = AutoModelForSeq2SeqLM.from_pretrained(
     device_map="cpu"
 )

+# Initialize IndicProcessor (CRUCIAL for proper preprocessing)
+ip = IndicProcessor(inference=True)
+
+# Language mappings (exact codes from official documentation)
 LANGUAGE_CODES = {
     "Assamese": "asm_Beng",
     "Bengali": "ben_Beng",
@@ -58,22 +61,9 @@ LANGUAGE_CODES = {
     "English": "eng_Latn"
 }

 @spaces.GPU(duration=120)
 def translate_text(input_text, source_lang, target_lang, max_length):
+    """Translate using IndicTrans2 with proper preprocessing"""

     if not input_text.strip():
         return "Please enter text to translate."
@@ -85,101 +75,86 @@ def translate_text(input_text, source_lang, target_lang, max_length):
         src_code = LANGUAGE_CODES[source_lang]
         tgt_code = LANGUAGE_CODES[target_lang]
 
+        # Determine direction and select appropriate model/tokenizer
         if source_lang == "English" and target_lang != "English":
+            # English to Indic
             model_gpu = en_indic_model.to(device)
             tokenizer = en_indic_tokenizer
             direction = "en_to_indic"
         elif source_lang != "English" and target_lang == "English":
+            # Indic to English
             model_gpu = indic_en_model.to(device)
             tokenizer = indic_en_tokenizer
             direction = "indic_to_en"
         else:
             return "Please select English as either source or target language (not both)."

+        # CRUCIAL: Use IndicProcessor for proper preprocessing
+        input_sentences = [input_text.strip()]
+
+        # Preprocess using IndicProcessor (this handles the proper formatting)
+        batch = ip.preprocess_batch(
+            input_sentences,
+            src_lang=src_code,
+            tgt_lang=tgt_code,
         )

+        # Tokenize the preprocessed batch
         inputs = tokenizer(
+            batch,
             truncation=True,
+            padding="longest",
+            return_tensors="pt",
+            return_attention_mask=True,
         ).to(device)
 
         # Generate translation
         with torch.no_grad():
             generated_tokens = model_gpu.generate(
+                **inputs,
+                use_cache=True,
+                min_length=0,
                 max_length=max_length,
                 num_beams=5,
                 num_return_sequences=1,
                 early_stopping=True,
                 pad_token_id=tokenizer.pad_token_id,
+                eos_token_id=tokenizer.eos_token_id
             )

+        # Decode generated tokens
+        generated_tokens = tokenizer.batch_decode(
+            generated_tokens,
             skip_special_tokens=True,
+            clean_up_tokenization_spaces=True,
         )

+        # CRUCIAL: Postprocess using IndicProcessor
+        translations = ip.postprocess_batch(generated_tokens, lang=tgt_code)

         # Move model back to CPU
         model_gpu.cpu()
         torch.cuda.empty_cache()

+        return translations[0] if translations else "Translation failed."

     except Exception as e:
         if 'model_gpu' in locals():
             model_gpu.cpu()
             torch.cuda.empty_cache()
         return f"Error during translation: {str(e)}"
 
 # Create Gradio interface
+with gr.Blocks(title="IndicTrans2 Official Translator", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🇮🇳 IndicTrans2 - Official AI4Bharat Translator

     High-quality neural machine translation between English and 22 Indian languages.
+    Uses official IndicTransToolkit for proper preprocessing.

     **Supported Languages**: Assamese, Bengali, Bodo, Dogri, Gujarati, Hindi, Kannada, Kashmiri,
     Konkani, Maithili, Malayalam, Manipuri, Marathi, Nepali, Odia, Punjabi, Sanskrit, Santali,
     Sindhi, Tamil, Telugu, Urdu.
     """)

     with gr.Row():
@@ -197,8 +172,6 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
             label="Source Language"
         )

         target_lang = gr.Dropdown(
             choices=list(LANGUAGE_CODES.keys()),
             value="Hindi",
@@ -222,17 +195,17 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
             interactive=False
         )

+        clear_btn = gr.Button("Clear", variant="secondary")

+    # Examples from official documentation
+    gr.Markdown("### 💡 Official Examples:")

     examples = [
+        ["When I was young, I used to go to the park every day.", "English", "Hindi", 128],
+        ["We watched a new movie last week, which was very inspiring.", "English", "Bengali", 128],
+        ["जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।", "Hindi", "English", 128],
+        ["हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।", "Hindi", "English", 128],
+        ["Technology is changing our world rapidly.", "English", "Tamil", 128]
     ]

     gr.Examples(
@@ -243,9 +216,6 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
     )

     # Event handlers
     def clear_all():
         return "", ""

@@ -255,12 +225,6 @@ with gr.Blocks(title="IndicTrans2 Translator", theme=gr.themes.Soft()) as demo:
         outputs=output_text
     )

     clear_btn.click(
         clear_all,
         outputs=[input_text, output_text]