Spaces:

mintlee
/

MT_deploy

Running

App Files Community

mintlee committed on Jun 22

Commit

d300944

1 Parent(s): be5d384

add japanese

Browse files

Files changed (9) hide show

pages/upload.py +2 -2
powerpoint/__pycache__/xml_handling.cpython-310.pyc +0 -0
powerpoint/xml_handling.py +0 -68
translate/__pycache__/translator.cpython-310.pyc +0 -0
translate/translator.py +1 -1
utils/__pycache__/utils.cpython-310.pyc +0 -0
utils/utils.py +3 -3
word/__pycache__/word_helper.cpython-310.pyc +0 -0
word/word_helper.py +1 -1

pages/upload.py CHANGED Viewed

@@ -75,11 +75,11 @@ with st.container():
     with col1:
         st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ của tài liệu</p>', unsafe_allow_html=True)
-        source_lang = st.selectbox(" ", ["chinese", "english", "vietnamese"], key="source_lang")
     with col2:
         st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ muốn dịch sang</p>', unsafe_allow_html=True)
-        target_lang = st.selectbox("  ", ["chinese", "english", "vietnamese"], key="target_lang")
 # Xử lý file trực tiếp
 def process_file(file, file_type):

     with col1:
         st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ của tài liệu</p>', unsafe_allow_html=True)
+        source_lang = st.selectbox(" ", ["Để máy tự xác định", "chinese", "english", "vietnamese", "japanese"], key="source_lang")
     with col2:
         st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ muốn dịch sang</p>', unsafe_allow_html=True)
+        target_lang = st.selectbox("  ", ["chinese", "english", "vietnamese", "japanese"], key="target_lang")
 # Xử lý file trực tiếp
 def process_file(file, file_type):

powerpoint/__pycache__/xml_handling.cpython-310.pyc CHANGED Viewed

Binary files a/powerpoint/__pycache__/xml_handling.cpython-310.pyc and b/powerpoint/__pycache__/xml_handling.cpython-310.pyc differ

powerpoint/xml_handling.py CHANGED Viewed

@@ -19,16 +19,6 @@ for prefix, uri in ns.items():
 def _get_paragraph_details(p_element):
-    """
-    Helper function to extract merged text and the first rPr associated with text
-    from a given <a:p> element. Handles text within <a:r> and <a:fld>.
-    Args:
-        p_element (ET.Element): The <a:p> element.
-    Returns:
-        tuple | None: (merged_text, first_rPr_with_text) if text exists, else None.
-    """
     paragraph_text_parts = []
     first_rPr_with_text = None
     found_first_rpr = False # Cờ để chỉ tìm rPr đầu tiên một lần
@@ -75,23 +65,6 @@ def _get_paragraph_details(p_element):
 # --- Hàm trích xuất chính (Trả về list các tuple chi tiết paragraph) ---
 def extract_text_from_slide(slide_file):
-    """
-    Trích xuất chi tiết từ từng thẻ <a:p> trong file slide XML.
-    Args:
-        slide_file (str): Đường dẫn đến file slide XML.
-    Returns:
-        list: Một list các tuple, mỗi tuple có dạng:
-              (paragraph_text, first_rPr_in_paragraph)
-              - paragraph_text (str): Toàn bộ text trong các <a:t> con cháu
-                của <a:p>, đã được ghép và strip().
-              - first_rPr_in_paragraph (ET.Element | None): Phần tử <a:rPr> của
-                <a:r> đầu tiên có chứa text trong <a:p> đó. Là None nếu run
-                đầu tiên có text không có thẻ <a:rPr>, hoặc nếu không có text
-                nào trong paragraph.
-              Trả về list rỗng nếu có lỗi hoặc không tìm thấy paragraph nào có text.
-    """
     # print(f"--- Bắt đầu trích xuất chi tiết từng <a:p> từ file: {slide_file} ---")
     extracted_data = [] # Danh sách kết quả cuối cùng
@@ -148,20 +121,6 @@ def extract_text_from_slide(slide_file):
 def replace_text_in_slide(xml_file_path, list_of_translated_paragraph_data):
-    """
-    Thay thế văn bản trong file XML slide, ghi đè file gốc.
-    *** Logic mới: ***
-    - Giảm cỡ chữ đi 0.85 lần.
-    - Nếu text > 20 chars: Loại bỏ định dạng bold (giữ nguyên case).
-    - Nếu text <= 20 chars: Giữ nguyên định dạng bold gốc (và case).
-    Args:
-        xml_file_path (str): Đường dẫn file XML slide gốc (sẽ bị ghi đè).
-        list_of_translated_paragraph_data (list): List các tuple
-            (translated_paragraph_text, original_first_rPr_in_paragraph).
-    Returns:
-        bool: True nếu thành công (ghi file), False nếu có lỗi.
-    """
     # print(f"\n--- Bắt đầu thay thế PARAGRAPH (ghi đè, logic length/bold) trong file: {os.path.basename(xml_file_path)} ---")
     processed_p_count = 0
@@ -331,19 +290,6 @@ def get_smartart_data_file(rels_file, base_path):
 def extract_text_from_smartart(xml_file_path):
-    """
-    Trích xuất văn bản tổng hợp từ mỗi đoạn <a:p> có chứa text
-    trong file XML SmartArt.
-    Args:
-        xml_file_path (str): Đường dẫn đến file XML SmartArt.
-    Returns:
-        list: Một list các tuple (paragraph_text, first_rPr_in_paragraph).
-              paragraph_text là toàn bộ text trong các <a:t> con cháu của <a:p>.
-              first_rPr_in_paragraph là element <a:rPr> của <a:r> đầu tiên
-              có chứa text trong <a:p> đó. Trả về list rỗng nếu lỗi.
-    """
     paragraph_data = []
     try:
         tree = ET.parse(xml_file_path)
@@ -390,20 +336,6 @@ def extract_text_from_smartart(xml_file_path):
 # --- Hàm thay thế theo từng đoạn <a:p> ---
 def replace_text_in_smartart(xml_file_path, list_of_translated_paragraph_data, output_xml_file_path):
-    """
-    Thay thế văn bản trong file XML SmartArt dựa trên dữ liệu đoạn <a:p> đã dịch.
-    Mỗi mục dịch sẽ thay thế nội dung text của một <a:p> tương ứng,
-    đặt toàn bộ text dịch vào một run <a:r> duy nhất với định dạng rPr được cung cấp.
-    Args:
-        xml_file_path (str): Đường dẫn file XML gốc.
-        list_of_translated_paragraph_data (list): List các tuple
-            (translated_paragraph_text, original_first_rPr_in_paragraph).
-        output_xml_file_path (str): Đường dẫn file XML đầu ra.
-    Returns:
-        bool: True nếu thành công, False nếu lỗi.
-    """
     p_index_for_data = 0 # Index để lấy dữ liệu dịch
     processed_p_count = 0 # Đếm số đoạn <a:p> đã được xử lý (thay thế)
     if not output_xml_file_path:

 def _get_paragraph_details(p_element):
     paragraph_text_parts = []
     first_rPr_with_text = None
     found_first_rpr = False # Cờ để chỉ tìm rPr đầu tiên một lần
 # --- Hàm trích xuất chính (Trả về list các tuple chi tiết paragraph) ---
 def extract_text_from_slide(slide_file):
     # print(f"--- Bắt đầu trích xuất chi tiết từng <a:p> từ file: {slide_file} ---")
     extracted_data = [] # Danh sách kết quả cuối cùng
 def replace_text_in_slide(xml_file_path, list_of_translated_paragraph_data):
     # print(f"\n--- Bắt đầu thay thế PARAGRAPH (ghi đè, logic length/bold) trong file: {os.path.basename(xml_file_path)} ---")
     processed_p_count = 0
 def extract_text_from_smartart(xml_file_path):
     paragraph_data = []
     try:
         tree = ET.parse(xml_file_path)
 # --- Hàm thay thế theo từng đoạn <a:p> ---
 def replace_text_in_smartart(xml_file_path, list_of_translated_paragraph_data, output_xml_file_path):
     p_index_for_data = 0 # Index để lấy dữ liệu dịch
     processed_p_count = 0 # Đếm số đoạn <a:p> đã được xử lý (thay thế)
     if not output_xml_file_path:

translate/__pycache__/translator.cpython-310.pyc CHANGED Viewed

Binary files a/translate/__pycache__/translator.cpython-310.pyc and b/translate/__pycache__/translator.cpython-310.pyc differ

translate/translator.py CHANGED Viewed

@@ -12,7 +12,7 @@ def translate_text_dict(text_dict: Dict[str, List[str]], source_lang:  str = "vi
         """Translates a single batch of text."""
         prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}.
-        Read through the entire dictionary, then translate the texts from {source_lang} into {target_lang} so that the meaning is as close to the intended context as possible.
         Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
         Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.

         """Translates a single batch of text."""
         prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}.
+        Read through the entire dictionary, then translate the texts to {target_lang} so that the meaning is as close to the intended context as possible.
         Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
         Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.

utils/__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/utils/__pycache__/utils.cpython-310.pyc and b/utils/__pycache__/utils.cpython-310.pyc differ

utils/utils.py CHANGED Viewed

@@ -37,11 +37,11 @@ def translate_single_text(text: str, source_lang: str = 'English', target_lang:
             model = genai.GenerativeModel('gemini-2.0-flash')  # hoặc 'gemini-1.5-flash'
             system_prompt = f"""You are a translation engine.
-Translate the following text accurately from {source_lang} to {target_lang}.
 Provide *only* the translated text as a single string.
 Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."""
-            user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
             full_prompt = system_prompt.strip() + "\n\n" + user_prompt.strip()
             response = model.generate_content(
@@ -135,7 +135,7 @@ def translate_text(text_dict, source_lang='English', target_lang="Vietnamese", m
         6.  Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
     """
-    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string}\n\nTranslated JSON Output:"
     raw_translated_json_string = "{}"
     retry_count = 0

             model = genai.GenerativeModel('gemini-2.0-flash')  # hoặc 'gemini-1.5-flash'
             system_prompt = f"""You are a translation engine.
+Translate the following text accurately to {target_lang}.
 Provide *only* the translated text as a single string.
 Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."""
+            user_prompt = f"Target language: {target_lang}. Text to translate: {text}"
             full_prompt = system_prompt.strip() + "\n\n" + user_prompt.strip()
             response = model.generate_content(
         6.  Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
     """
+    user_prompt = f"Target language: {target_lang}. JSON String: {json_input_string}\n\nTranslated JSON Output:"
     raw_translated_json_string = "{}"
     retry_count = 0

word/__pycache__/word_helper.cpython-310.pyc CHANGED Viewed

Binary files a/word/__pycache__/word_helper.cpython-310.pyc and b/word/__pycache__/word_helper.cpython-310.pyc differ

word/word_helper.py CHANGED Viewed

@@ -34,7 +34,7 @@ def batch_translate(texts, source_lang = 'English', target_lang="Vietnamese"):
         6.  Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
     """
     json_data = json.dumps({i: t for i, t in enumerate(texts)})
-    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON file: {json_data}"
     model = genai.GenerativeModel('gemini-2.0-flash')
     response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={

         6.  Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
     """
     json_data = json.dumps({i: t for i, t in enumerate(texts)})
+    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"
     model = genai.GenerativeModel('gemini-2.0-flash')
     response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={