Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

mintlee committed on Apr 4

Commit

804add3

1 Parent(s): 4d84219

update

Browse files

Files changed (5) hide show

excel/excel_translate.py +2 -1
home.py +4 -0
pages/upload.py +4 -3
translate/translator.py +2 -2
word/word_translate.py +8 -8

excel/excel_translate.py CHANGED Viewed

@@ -95,7 +95,7 @@ def read_csv_with_auto_encoding(csv_path):
     return df
-def translate_csv(file_id, target_lang="vi", chunk_size=50):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
     db = client["csv"]
@@ -143,6 +143,7 @@ def translate_csv(file_id, target_lang="vi", chunk_size=50):
         # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
             target_lang=target_lang
         )

     return df
+def translate_csv(file_id, source_lang, target_lang="vi", chunk_size=50):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
     db = client["csv"]
         # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
+            source_lang=source_lang,
             target_lang=target_lang
         )

home.py CHANGED Viewed

@@ -30,3 +30,7 @@ st.subheader("25/03/2025")
 st.write("1. Đã hoàn thành file Word, Excel")
 st.write("2. Đang tiến hành file PPTX (tiến độ 90% đã có thể dùng thử)")
 st.write("3. Sắp tới: file CSV, PDF, JPG")

 st.write("1. Đã hoàn thành file Word, Excel")
 st.write("2. Đang tiến hành file PPTX (tiến độ 90% đã có thể dùng thử)")
 st.write("3. Sắp tới: file CSV, PDF, JPG")
+st.subheader("04/04/2025")
+st.write("1. Đã hoàn thành file Word, Excel, PPTX, CSV")
+st.write("2. Sắp tới: file PDF, JPG")

pages/upload.py CHANGED Viewed

@@ -3,6 +3,7 @@ import google.generativeai as genai
 from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
 from powerpoint.pptx import translate_pptx
 from excel.xlsx import translate_xlsx
 from word.word_translate import translate_docx_from_mongodb
 import dotenv
 import os
@@ -29,10 +30,10 @@ def process_file(file, file_type):
         progress_bar.progress(60)
     elif file_type == "Excel":
         final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
-    # elif file_type == "CSV":
-    #     final_id = translate_csv(file_id = file_id, target_lang = target_lang)
     elif file_type == "Word":
-        final_id = translate_docx_from_mongodb(file_id, target_lang)
     else:
         st.error("❌ Loại file không hỗ trợ!")
         return

 from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
 from powerpoint.pptx import translate_pptx
 from excel.xlsx import translate_xlsx
+from excel.excel_translate import translate_csv
 from word.word_translate import translate_docx_from_mongodb
 import dotenv
 import os
         progress_bar.progress(60)
     elif file_type == "Excel":
         final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
+    elif file_type == "CSV":
+        final_id = translate_csv(file_id = file_id, source_lang = source_lang, target_lang = target_lang)
     elif file_type == "Word":
+        final_id = translate_docx_from_mongodb(file_id, source_lang = source_lang, target_lang = target_lang)
     else:
         st.error("❌ Loại file không hỗ trợ!")
         return

translate/translator.py CHANGED Viewed

@@ -7,12 +7,12 @@ import os
 dotenv.load_dotenv(".env")
-def translate_text_dict(text_dict: Dict[str, List[str]], target_lang: str = "vi") -> Dict[str, List[str]]:
     def translate_batch(batch_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
         """Translates a single batch of text."""
         prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}.
-        Read through the entire dictionary, then translate the texts into {target_lang} so that the meaning is as close to the intended context as possible.
         Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
         Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.

 dotenv.load_dotenv(".env")
+def translate_text_dict(text_dict: Dict[str, List[str]], source_lang:  str = "vi", target_lang: str = "vi") -> Dict[str, List[str]]:
     def translate_batch(batch_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
         """Translates a single batch of text."""
         prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}.
+        Read through the entire dictionary, then translate the texts from {source_lang} into {target_lang} so that the meaning is as close to the intended context as possible.
         Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
         Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.

word/word_translate.py CHANGED Viewed

@@ -17,7 +17,7 @@ api_key = os.getenv("GEMINI_API_KEY")
 genai.configure(api_key=api_key)
 model = genai.GenerativeModel("gemini-2.0-flash")
-def batch_translate(texts, target_lang="Vietnamese"):
     """ Translates multiple text segments in a single API call. """
     if not texts:
         return texts  # Skip if empty
@@ -48,7 +48,7 @@ def batch_translate(texts, target_lang="Vietnamese"):
             - Return the result of translation according to the format. Do NOT return code for translating.
             """
     json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
-    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"
     model = genai.GenerativeModel('gemini-2.0-flash')
     response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
@@ -63,7 +63,7 @@ def batch_translate(texts, target_lang="Vietnamese"):
             translated_texts = [i['text'] for i in sorted(response_dict, key = lambda x: x['index'])]
     return translated_texts
-def full_translate(texts, target_lang="Vietnamese"):
     full_translated_texts = []
     batch = []
     word_count = 0
@@ -71,13 +71,13 @@ def full_translate(texts, target_lang="Vietnamese"):
     for string in texts:
         if len(string.split()) + word_count >= 1000:
             print('Translating a batch.')
-            full_translated_texts += batch_translate(batch, target_lang)
             batch = []
             word_count = 0
         batch.append(string)
         word_count += len(string.split())
-    full_translated_texts += batch_translate(batch, target_lang)
     return full_translated_texts
 def merge_runs(runs):
@@ -169,7 +169,7 @@ def translate_tables(doc, translated_texts):
                 cell, i = translate_paragraphs(cell, translated_texts, i)
     return doc
-def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
         # Kết nối MongoDB
     client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
     db = client["word"]
@@ -183,10 +183,10 @@ def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
     # Lấy nội dung và dịch
     para_texts = get_text_elements_para(doc)
-    translated_para = full_translate(para_texts, target_lang)
     table_texts = get_text_elements_table(doc)
-    translated_tables = full_translate(table_texts, target_lang)
     # Cập nhật nội dung dịch vào document
     doc, _ = translate_paragraphs(doc, translated_para)

 genai.configure(api_key=api_key)
 model = genai.GenerativeModel("gemini-2.0-flash")
+def batch_translate(texts, source_lang = "English", target_lang="Vietnamese"):
     """ Translates multiple text segments in a single API call. """
     if not texts:
         return texts  # Skip if empty
             - Return the result of translation according to the format. Do NOT return code for translating.
             """
     json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
+    user_prompt = f"Source languag: {source_lang}. Target language: {target_lang}. JSON file: {json_data}"
     model = genai.GenerativeModel('gemini-2.0-flash')
     response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
             translated_texts = [i['text'] for i in sorted(response_dict, key = lambda x: x['index'])]
     return translated_texts
+def full_translate(texts, source_lang, target_lang="Vietnamese"):
     full_translated_texts = []
     batch = []
     word_count = 0
     for string in texts:
         if len(string.split()) + word_count >= 1000:
             print('Translating a batch.')
+            full_translated_texts += batch_translate(batch, source_lang, target_lang)
             batch = []
             word_count = 0
         batch.append(string)
         word_count += len(string.split())
+    full_translated_texts += batch_translate(batch, source_lang, target_lang)
     return full_translated_texts
 def merge_runs(runs):
                 cell, i = translate_paragraphs(cell, translated_texts, i)
     return doc
+def translate_docx_from_mongodb(file_id, source_lang, target_lang="Vietnamese"):
         # Kết nối MongoDB
     client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
     db = client["word"]
     # Lấy nội dung và dịch
     para_texts = get_text_elements_para(doc)
+    translated_para = full_translate(para_texts, source_lang, target_lang)
     table_texts = get_text_elements_table(doc)
+    translated_tables = full_translate(table_texts, source_lang, target_lang)
     # Cập nhật nội dung dịch vào document
     doc, _ = translate_paragraphs(doc, translated_para)