Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

mintlee committed on Mar 25

Commit

7da22a4

1 Parent(s): f543be8

a

Browse files

Files changed (10) hide show

.env +1 -1
excel/__pycache__/excel_translate.cpython-310.pyc +0 -0
excel/excel_translate.py +14 -15
pages/upload.py +3 -3
powerpoint/__pycache__/pptx_object.cpython-310.pyc +0 -0
powerpoint/__pycache__/xml_handling.cpython-310.pyc +0 -0
powerpoint/pptx_object.py +12 -15
powerpoint/xml_handling.py +3 -2
translate/__pycache__/translator.cpython-310.pyc +0 -0
translate/translator.py +1 -1

.env CHANGED Viewed

	@@ -1 +1 @@
1	- GEMINI_API_KEY = ~~AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg~~


1	+ GEMINI_API_KEY = AIzaSyAk1LTwWMZyTfPAKmsn6JzFtI1MpnI7FH8

excel/__pycache__/excel_translate.cpython-310.pyc CHANGED Viewed

Binary files a/excel/__pycache__/excel_translate.cpython-310.pyc and b/excel/__pycache__/excel_translate.cpython-310.pyc differ

excel/excel_translate.py CHANGED Viewed

@@ -10,26 +10,27 @@ import gridfs
 import tempfile
 import os
-def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en', target_lang: str = "fr", gemini_api: str = "", db_name: str = "excel"):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client[db_name]
     fs_input = gridfs.GridFS(db, collection="root_file")
     fs_output = gridfs.GridFS(db, collection="final_file")
     # Tải file từ MongoDB
-    file_data = fs_input.get(file_id).read()
     # Lưu file tạm thời
     with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
-        temp_file.write(file_data)
         temp_file_path = temp_file.name
     # Đọc file Excel bằng openpyxl
     wb = openpyxl.load_workbook(temp_file_path)
-    # Chọn sheet được chỉ định hoặc tất cả các sheet
-    sheets = [wb[sheet_name]] if sheet_name else wb.worksheets
     for ws in sheets:
         max_row = ws.max_row
@@ -48,7 +49,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
                     cell_map[key] = cell
         # Gọi hàm dịch theo dạng bulk
-        translated_dict = translate_text_dict(text_dict, target_lang=target_lang, gemini_api=gemini_api)
         # Cập nhật lại các cell với nội dung đã dịch
         for key, cell in cell_map.items():
@@ -61,7 +62,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
     with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
         wb.save(output_file.name)
         output_file.seek(0)
-        translated_file_id = fs_output.put(output_file.read(), filename=f"translated_{file_id}.xlsx")
     # Đóng workbook và xóa file tạm
     wb.close()
@@ -94,10 +95,10 @@ def read_csv_with_auto_encoding(csv_path):
     return df
-def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client[db_name]
     fs_input = gridfs.GridFS(db, collection="root_file")
     fs_output = gridfs.GridFS(db, collection="final_file")
@@ -113,9 +114,8 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
     # If text_columns is not specified, we assume we want to translate everything that looks like text.
     # Otherwise, only translate the given columns.
-    if text_columns is None:
-        # Example heuristic: choose all object/string columns
-        text_columns = df.select_dtypes(include=["object"]).columns.tolist()
     num_rows = len(df)
     num_chunks = math.ceil(num_rows / chunk_size)
@@ -143,8 +143,7 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
         # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
-            target_lang=target_lang,
-            gemini_api=gemini_api
         )
         # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame

 import tempfile
 import os
+def translate_xlsx(file_id: str, target_lang: str = ""):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
+    db = client["excel"]
     fs_input = gridfs.GridFS(db, collection="root_file")
     fs_output = gridfs.GridFS(db, collection="final_file")
     # Tải file từ MongoDB
+    file_data = fs_input.get(file_id)
     # Lưu file tạm thời
     with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
+        temp_file.write(file_data.read())
         temp_file_path = temp_file.name
     # Đọc file Excel bằng openpyxl
     wb = openpyxl.load_workbook(temp_file_path)
+    sheets = wb.worksheets  # Always process every worksheet (the sheet_name parameter was removed)
     for ws in sheets:
         max_row = ws.max_row
                     cell_map[key] = cell
         # Gọi hàm dịch theo dạng bulk
+        translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
         # Cập nhật lại các cell với nội dung đã dịch
         for key, cell in cell_map.items():
     with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
         wb.save(output_file.name)
         output_file.seek(0)
+        translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
     # Đóng workbook và xóa file tạm
     wb.close()
     return df
+def translate_csv(file_id, target_lang="vi", chunk_size=50):
     # Kết nối MongoDB
     client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
+    db = client["csv"]
     fs_input = gridfs.GridFS(db, collection="root_file")
     fs_output = gridfs.GridFS(db, collection="final_file")
     # If text_columns is not specified, we assume we want to translate everything that looks like text.
     # Otherwise, only translate the given columns.
+    text_columns = df.select_dtypes(include=["object"]).columns.tolist()
     num_rows = len(df)
     num_chunks = math.ceil(num_rows / chunk_size)
         # Now call your LLM translator on this dictionary
         translated_chunk = translate_text_dict(
             text_dict=chunk_dict,
+            target_lang=target_lang
         )
         # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame

pages/upload.py CHANGED Viewed

@@ -33,11 +33,11 @@ def process_file(file, file_type):
         translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
         progress_bar.progress(60)
         final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
-        final_id = create_translated_ppt("ppt", file_id, final_xml_id, "final_pptx")
     elif file_type == "Excel":
-        final_id = translate_xlsx(file_id, "en", target_lang, os.getenv("GEMINI_API_KEY"))
     elif file_type == "CSV":
-        final_id = translate_csv(file_id, "en", target_lang, os.getenv("GEMINI_API_KEY"))
     elif file_type == "Word":
         final_id = translate_docx_from_mongodb(file_id, target_lang)
     else:

         translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
         progress_bar.progress(60)
         final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
+        final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
     elif file_type == "Excel":
+        final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
     elif file_type == "CSV":
+        final_id = translate_csv(file_id = file_id, target_lang = target_lang)
     elif file_type == "Word":
         final_id = translate_docx_from_mongodb(file_id, target_lang)
     else:

powerpoint/__pycache__/pptx_object.cpython-310.pyc CHANGED Viewed

Binary files a/powerpoint/__pycache__/pptx_object.cpython-310.pyc and b/powerpoint/__pycache__/pptx_object.cpython-310.pyc differ

powerpoint/__pycache__/xml_handling.cpython-310.pyc CHANGED Viewed

Binary files a/powerpoint/__pycache__/xml_handling.cpython-310.pyc and b/powerpoint/__pycache__/xml_handling.cpython-310.pyc differ

powerpoint/pptx_object.py CHANGED Viewed

@@ -283,7 +283,8 @@ def get_file_from_mongodb(db_name, collection_name, file_id):
     db = client[db_name]
     fs = GridFS(db, collection_name)
     file_data = fs.get(file_id)
-    return BytesIO(file_data.read())
 def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
@@ -292,18 +293,19 @@ def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
     db = client[db_name]
     fs = GridFS(db, collection_name)
     file_id = fs.put(file_data, filename=file_name)
     return file_id
 def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
     """Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
     try:
         # Kết nối MongoDB và tải file
-        original_ppt_io = get_file_from_mongodb(db_name, "root_file", original_ppt_id)
-        translated_xml_io = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
         # Load PowerPoint gốc và XML dịch
-        prs = Presentation(original_ppt_io)
-        tree = ET.parse(translated_xml_io)
         root = tree.getroot()
         # Áp dụng bản dịch
@@ -335,23 +337,18 @@ def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_co
                             except Exception as e:
                                 print(f"Error applying shape properties: {str(e)}")
-        # Lưu PowerPoint vào MongoDB
         output_io = BytesIO()
         prs.save(output_io)
         output_io.seek(0)  # Reset vị trí đọc
-        file_id = save_file_to_mongodb(db_name, output_collection, "translated_presentation.pptx", output_io)
         print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
         return file_id
     except Exception as e:
         print(f"Error creating translated PowerPoint: {str(e)}")
         return None
-def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
-    """Lưu tệp vào MongoDB GridFS"""
-    client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client[db_name]
-    fs = GridFS(db, collection_name)
-    file_id = fs.put(file_data, filename=file_name)
-    return file_id

     db = client[db_name]
     fs = GridFS(db, collection_name)
     file_data = fs.get(file_id)
+    return file_data
+    # return BytesIO(file_data.read())
 def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
     db = client[db_name]
     fs = GridFS(db, collection_name)
     file_id = fs.put(file_data, filename=file_name)
+    client.close()
     return file_id
 def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
     """Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
     try:
         # Kết nối MongoDB và tải file
+        original_ppt= get_file_from_mongodb(db_name, "root_file", original_ppt_id)
+        translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
         # Load PowerPoint gốc và XML dịch
+        prs = Presentation(BytesIO(original_ppt.read()))
+        tree = ET.parse(BytesIO(translated_xml.read()))
         root = tree.getroot()
         # Áp dụng bản dịch
                             except Exception as e:
                                 print(f"Error applying shape properties: {str(e)}")
+        # Lưu PowerPoint vào MongoDB với tên gốc
         output_io = BytesIO()
         prs.save(output_io)
         output_io.seek(0)  # Reset vị trí đọc
+        # Reuse the original file's name, replacing ".xml" with ".pptx" (no "_translated" suffix is added)
+        translated_filename = original_ppt.filename.replace(".xml", ".pptx")
+        file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
         print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
         return file_id
     except Exception as e:
         print(f"Error creating translated PowerPoint: {str(e)}")
         return None

powerpoint/xml_handling.py CHANGED Viewed

@@ -124,7 +124,8 @@ def ppt_to_xml_mongodb(ppt_file_id: str, db_name="pptx"):
         # Lưu XML vào MongoDB
         xml_output = BytesIO(xml_str.encode("utf-8"))
-        xml_file_id = fs_xml.put(xml_output, filename=f"{ppt_file.filename}.xml")
         print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
         client.close()
@@ -363,7 +364,7 @@ def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[
         updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
         # Lưu file cập nhật vào MongoDB (final_xml)
-        new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}_translated.xml")
         print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
         return new_file_id

         # Lưu XML vào MongoDB
         xml_output = BytesIO(xml_str.encode("utf-8"))
+        file_name = ppt_file.filename.replace(".pptx", ".xml")
+        xml_file_id = fs_xml.put(xml_output, filename=file_name)
         print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
         client.close()
         updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
         # Lưu file cập nhật vào MongoDB (final_xml)
+        new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}")
         print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
         return new_file_id

translate/__pycache__/translator.cpython-310.pyc CHANGED Viewed

Binary files a/translate/__pycache__/translator.cpython-310.pyc and b/translate/__pycache__/translator.cpython-310.pyc differ

translate/translator.py CHANGED Viewed

@@ -21,7 +21,7 @@ def translate_text_dict(text_dict: Dict[str, List[str]], target_lang: str = "vi"
         Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
         genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
-        model = genai.GenerativeModel("gemini-1.5-flash")
         response = model.generate_content(prompt) # Use a model appropriate for your needs and API key.  gemini-2.0-flash doesn't exist.  1.5-pro is a good general-purpose model.

         Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
         genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+        model = genai.GenerativeModel("gemini-2.0-flash")
         response = model.generate_content(prompt) # Use a model appropriate for your needs and API key.  gemini-2.0-flash doesn't exist.  1.5-pro is a good general-purpose model.