File size: 8,524 Bytes
2e10399 0ef21e6 7bbcfbb 0d1731f d3c9765 2e10399 ceb0d96 7bbcfbb 2e10399 7bbcfbb 3a73865 c5b2667 8de5a0d c5b2667 64b8f98 d59e3a5 c5b2667 7bbcfbb 8de5a0d c5b2667 d59e3a5 c5b2667 d59e3a5 c5b2667 d59e3a5 c5b2667 d59e3a5 c5b2667 d59e3a5 82558c5 8ba234c d59e3a5 c5b2667 7bbcfbb 8ba234c 7bbcfbb 8ba234c c5b2667 82558c5 7bbcfbb 8ba234c 82558c5 c5b2667 d59e3a5 8de5a0d 7bbcfbb 8ba234c 2e10399 8ba234c 2e10399 8ba234c d59e3a5 64b8f98 8ba234c 2e10399 8ba234c 2e10399 d59e3a5 64b8f98 8ba234c c5b2667 abf6391 77c2f5a abf6391 77c2f5a abf6391 c5b2667 8de5a0d 7bbcfbb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import gradio as gr
import torch
# Checkpoint identifier passed to both the tokenizer and the model loader.
model_name = "facebook/m2m100_418M"

try:
    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
    model = M2M100ForConditionalGeneration.from_pretrained(model_name)
except Exception as e:
    # Top-level boundary: without tokenizer and model the app cannot do
    # anything, so report the cause and stop with a non-zero status.
    print(f"Error loading model or tokenizer: {e}")
    # ``exit()`` is an interactive helper injected by the ``site`` module and
    # is not guaranteed to exist; raising SystemExit is the portable form.
    raise SystemExit(1) from e

# Run on the GPU when torch reports one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Predefined common English-to-Farsi phrase mappings.
# Keys are the English phrases exactly as written here (capitalisation and
# punctuation included); values are their Farsi renderings in native script.
# translate_to_cyrillic_farsi consults this table in its phrase-by-phrase
# fallback path.
common_phrases = {
    # Greetings and farewells
    "Hello": "سلام",
    "Hi!": "سلام!",
    "Good morning": "صبح بخیر",
    "Good afternoon": "عصر بخیر",
    "Good evening": "شب بخیر",
    "Goodbye": "خداحافظ",
    "Good night": "شب خوش",
    "How are you?": "حالت چطوره؟",
    "I am fine, thank you. And you?": "خوبم، متشکرم. و شما؟",
    # Courtesy phrases
    "Thank you (very much)": "متشکرم (خیلی ممنون)",
    "You're welcome": "خواهش میکنم",
    "Excuse me": "ببخشید",
    "Pardon me": "معذرت میخواهم",
    "I'm sorry": "متأسفم",
    "Congratulations": "تبریک میگویم",
    "Please sit down": "لطفاً بنشینید",
    "Good luck": "موفق باشید",
    "Have a good trip": "سفر خوبی داشته باشید",
    # Introductions and language
    "What is your name?": "اسم شما چیست؟",
    "My name is Sara": "اسم من سارا است",
    "Where are you from?": "اهل کجا هستید؟",
    "I am from Iran": "من اهل ایران هستم",
    "Do you speak English?": "آیا انگلیسی صحبت میکنید؟",
    "I don't understand": "من متوجه نمیشوم",
    "Please speak slowly": "لطفاً آهسته صحبت کنید",
    "Do you have a Persian-English dictionary?": "آیا دیکشنری فارسی-انگلیسی دارید؟",
    "How do you say this in English?": "این را در انگلیسی چگونه میگویند؟",
    # Travel, shopping and emergencies
    "How much is this?": "این چقدر قیمت دارد؟",
    "Where is the bathroom?": "دستشویی کجاست؟",
    "Help!": "کمک!",
    "I am lost": "من گم شدهام",
    "Can you help me?": "میتوانید به من کمک کنید؟",
    "What time is it?": "ساعت چند است؟",
    "Where is the hospital?": "بیمارستان کجاست؟",
    "I love you": "دوستت دارم",
    "How can I get to the airport?": "چطور میتوانم به فرودگاه بروم؟",
    "I need a doctor": "به یک پزشک نیاز دارم",
    "Where can I buy a ticket?": "از کجا میتوانم بلیط بخرم؟",
    "I am hungry": "گرسنهام",
    "Can I have some water?": "میتوانم کمی آب بگیرم؟",
    # NOTE: the apostrophe in the next key is the typographic U+2019,
    # not the ASCII "'" used by the other keys.
    "It’s very beautiful": "خیلی زیباست",
    "See you later": "بعداً میبینمت",
    "What is this?": "این چیست؟",
    "I am happy": "خوشحالم",
    # Weather
    "It is very chilly today": "امروز خیلی سرد است",
    "I hope we have better weather tomorrow": "امیدوارم فردا هوا بهتر شود",
}
# Break a text into its punctuation-delimited fragments
def split_into_phrases(text):
    """Split *text* on commas, periods, question marks and exclamation marks.

    Each fragment is stripped of surrounding whitespace; fragments that are
    empty after stripping are dropped. The delimiters themselves are not
    part of the returned fragments.
    """
    fragments = [text]
    for delimiter in (",", ".", "?", "!"):
        # Re-split every fragment gathered so far on the next delimiter.
        fragments = [
            piece
            for fragment in fragments
            for piece in fragment.split(delimiter)
        ]
    return list(filter(None, map(str.strip, fragments)))
# Improved transliteration function (Farsi to Cyrillic)

# Whole words and short phrases that have a fixed Cyrillic rendering.
# Built once at import time instead of on every call.
_WORD_MAP = {
    "سلام": "Салом",
    "خداحافظ": "Худоҳафиз",
    "شب بخیر": "Шаб хайр",
    "صبح بخیر": "Субҳ хайр",
    "ممنون": "Ташаккур",
    "خواهش میکنم": "Илтимос",
    "چطور هستی؟": "Чӣ тур ҳастӣ?",
    "چطور هستید؟": "Шумо чӣ туред?",
    "بله": "Ҳа",
    "نه": "Не",
    "ایران": "Эрон",
    "تشکر": "Ташаккур",
    "فارسی": "Форсӣ",
    "اسم من": "Номи ман",
    "لطفا": "Илтимос",
    "کمک": "Кумак",
    "هستی": "ҳастӣ",
    "هستید": "ҳастед",
    "است": "аст",
    "امروز": "Имрӯз",
    "خیلی": "Хеле",
    "سرد": "Сард",
    "امیدوارم": "Умидворам",
    "فردا": "Фардо",
    "هوا": "Ҳаво",
    "بهتر": "Беҳтар",
    "شود": "Шавад",
}

# Letter-by-letter fallback used for words that are not in _WORD_MAP.
_CHAR_MAP = {
    "ا": "а",
    "ب": "б",
    "پ": "п",
    "ت": "т",
    "ج": "ж",
    "چ": "ч",
    "ح": "ҳ",
    "خ": "х",
    "د": "д",
    "ر": "р",
    "ز": "з",
    "س": "с",
    "ش": "ш",
    "ص": "с",
    "ط": "т",
    "ع": "ъ",
    "غ": "ғ",
    "ف": "ф",
    "ق": "қ",
    "ک": "к",
    "گ": "г",
    "ل": "л",
    "م": "м",
    "ن": "н",
    "و": "в",
    "ه": "ҳ",
    "ی": "й",
    "؟": "?",
    "،": ",",
    " ": " ",
    # Letters the table previously lacked, which therefore leaked through
    # untransliterated. They follow the table's existing pattern of folding
    # same-sounding letters together (compare the entries for sad and tah).
    "\u062b": "с",  # theh
    "\u0630": "з",  # thal
    "\u0698": "ж",  # jeh
    "\u0636": "з",  # dad
    "\u0638": "з",  # zah
    # Arabic code points for kaf and yeh, which look like the Persian
    # letters already mapped above but are distinct characters.
    "\u0643": "к",
    "\u064a": "й",
}

# Longest _WORD_MAP key, counted in whitespace-separated words; bounds the
# phrase window tried at each position of the input.
_MAX_PHRASE_WORDS = max(len(key.split()) for key in _WORD_MAP)


def _transliterate_word(word):
    """Transliterate one word letter by letter using _CHAR_MAP.

    Words of one or two characters, and words without any Arabic-script
    letter, are returned unchanged. Characters missing from _CHAR_MAP are
    copied through as they are.
    """
    # NOTE(review): the length threshold is inherited from the previous
    # implementation; it leaves very short Farsi words in native script.
    has_arabic_letter = any(
        ch.isalpha() and "\u0600" <= ch <= "\u06ff" for ch in word
    )
    if len(word) > 2 and has_arabic_letter:
        return "".join(_CHAR_MAP.get(ch, ch) for ch in word)
    return word


def transliterate_farsi_to_cyrillic(farsi_text):
    """Render native-script Farsi text in Cyrillic script.

    The whole input is first looked up in the fixed word/phrase table. If it
    is not found, the text is split on whitespace and scanned left to right;
    at each position the longest run of words that forms a table entry is
    replaced by that entry (so multi-word entries are also recognised inside
    longer sentences), and any other word is transliterated letter by letter.

    Args:
        farsi_text: Farsi text in native script.

    Returns:
        The converted words joined by single spaces; an empty string for
        empty or whitespace-only input.
    """
    if farsi_text in _WORD_MAP:
        return _WORD_MAP[farsi_text]

    words = farsi_text.split()
    cyrillic_words = []
    position = 0
    while position < len(words):
        widest = min(_MAX_PHRASE_WORDS, len(words) - position)
        # Try the widest window first so phrases win over their first word.
        for width in range(widest, 0, -1):
            candidate = " ".join(words[position:position + width])
            if candidate in _WORD_MAP:
                cyrillic_words.append(_WORD_MAP[candidate])
                position += width
                break
        else:
            cyrillic_words.append(_transliterate_word(words[position]))
            position += 1
    return " ".join(cyrillic_words)
# Translation function with input validation and cleaning

# Punctuation and spaces trimmed from both ends of model output and of
# phrases before they are compared.
_TRIM_CHARS = ".!?, "


def _contains_farsi(text):
    """Return True if *text* has at least one character in U+0600..U+06FF.

    Testing the whole Arabic Unicode block (rather than a hand-written list
    of letters) also accepts letters such as pe, che and zhe, so a correct
    translation made up only of those letters is not rejected.
    """
    return any("\u0600" <= ch <= "\u06ff" for ch in text)


def _translate_en_to_fa(text):
    """Translate English *text* to Farsi with the module-level model.

    Returns the decoded output with leading/trailing characters from
    _TRIM_CHARS removed.
    """
    tokenizer.src_lang = "en"
    encoded = tokenizer(text, return_tensors="pt", padding=True).to(device)
    generated = model.generate(
        **encoded, forced_bos_token_id=tokenizer.get_lang_id("fa")
    )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip(_TRIM_CHARS)


def _normalize_phrase(phrase):
    """Return *phrase* in a form suitable for tolerant comparison.

    Typographic apostrophes become ASCII ones, characters from _TRIM_CHARS
    are trimmed from both ends, and the result is case-folded.
    """
    return phrase.replace("\u2019", "'").strip(_TRIM_CHARS).casefold()


def _find_common_phrase(phrase):
    """Return the predefined Farsi text for *phrase*, or None if there is none.

    An exact key match is tried first. Otherwise keys are compared after
    normalisation, because phrases produced by split_into_phrases have had
    their punctuation removed and validated input is ASCII-only, so keys
    such as "Help!" or ones written with a typographic apostrophe could
    never match exactly.
    """
    exact = common_phrases.get(phrase)
    if exact is not None:
        return exact
    wanted = _normalize_phrase(phrase)
    for english, farsi in common_phrases.items():
        if _normalize_phrase(english) == wanted:
            return farsi
    return None


def translate_to_cyrillic_farsi(text):
    """Translate English text to Farsi in native and Cyrillic script.

    The whole input is translated by the model first. If that output
    contains no Farsi characters, the predefined phrase table is consulted
    (for the whole input, then for each punctuation-delimited phrase), and
    phrases without a predefined entry are translated individually.

    Args:
        text: English input; must be non-blank and ASCII-only.

    Returns:
        A ``(farsi_text, cyrillic_text)`` tuple. On invalid input the first
        element is an error message and the second is an empty string.
        Phrases that could not be translated appear in the Farsi text as
        ``[UNTRANSLATED: <phrase>]``.
    """
    if not text or not text.strip():
        return "Error: Please enter a valid English text.", ""
    if not text.isascii():
        return "Error: Please enter text in English (ASCII characters only).", ""

    # Try full sentence translation first
    farsi_text = _translate_en_to_fa(text)

    if not _contains_farsi(farsi_text):
        # The model produced nothing usable for the whole input.
        predefined = _find_common_phrase(text.strip())
        if predefined is not None:
            farsi_text = predefined
        else:
            # Fall back to phrase-by-phrase translation
            farsi_translations = []
            for phrase in split_into_phrases(text):
                predefined = _find_common_phrase(phrase)
                if predefined is not None:
                    farsi_translations.append(predefined)
                    continue
                candidate = _translate_en_to_fa(phrase)
                if _contains_farsi(candidate):
                    farsi_translations.append(candidate)
                else:
                    farsi_translations.append(f"[UNTRANSLATED: {phrase}]")
            farsi_text = " ".join(farsi_translations)

    cyrillic_text = transliterate_farsi_to_cyrillic(farsi_text)
    return farsi_text, cyrillic_text
# Gradio Interface: one English text box in, two text boxes out. The two
# outputs correspond, in order, to the (farsi_text, cyrillic_text) pair
# returned by translate_to_cyrillic_farsi.
interface = gr.Interface(
    fn=translate_to_cyrillic_farsi,
    inputs=gr.Textbox(label="Enter Text in English"),
    outputs=[
        gr.Textbox(label="Farsi Translation (Native Script)"),
        gr.Textbox(label="Farsi Translation (Cyrillic Script)"),
    ],
    title="English to Cyrillic Farsi Translator",
    description=(
        "Enter an English word or sentence, and this tool will translate it "
        "to Farsi in both native and Cyrillic scripts."
    ),
)

# Launch the app only when this file is run as a script, so importing the
# module does not start a server.
if __name__ == "__main__":
    interface.launch()