Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

mintlee committed on Jun 27

Commit

4c846d3

1 Parent(s): d300944

change api

Browse files

Files changed (2) hide show

.env +1 -1
test.ipynb +377 -4

.env CHANGED Viewed

	@@ -1,2 +1,2 @@
1	- GEMINI_API_KEY = ~~AIzaSyAk1LTwWMZyTfPAKmsn6JzFtI1MpnI7FH8~~
2	MONGODB_URI = mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0


1	+ GEMINI_API_KEY = AIzaSyDIPbH7zKoeTS5aKMQuMjzBBiVlWadcmr8
2	MONGODB_URI = mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0

test.ipynb CHANGED Viewed

@@ -252,12 +252,12 @@
      "output_type": "stream",
      "text": [
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'word'\n",
-      "✅ Đã xóa 0 file trong collection 'root_file' của db 'excel'\n",
-      "✅ Đã xóa 3 file trong collection 'root_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'csv'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'word'\n",
-      "✅ Đã xóa 0 file trong collection 'final_file' của db 'excel'\n",
-      "✅ Đã xóa 3 file trong collection 'final_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'csv'\n"
      ]
     }
@@ -864,6 +864,379 @@
     "str = 'Kiểm tra ngoại quan giá đỡ'\n",
     "len(str)"
    ]
   }
  ],
  "metadata": {

      "output_type": "stream",
      "text": [
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'word'\n",
+      "✅ Đã xóa 8 file trong collection 'root_file' của db 'excel'\n",
+      "✅ Đã xóa 0 file trong collection 'root_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'root_file' của db 'csv'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'word'\n",
+      "✅ Đã xóa 7 file trong collection 'final_file' của db 'excel'\n",
+      "✅ Đã xóa 0 file trong collection 'final_file' của db 'pptx'\n",
       "✅ Đã xóa 0 file trong collection 'final_file' của db 'csv'\n"
      ]
     }
     "str = 'Kiểm tra ngoại quan giá đỡ'\n",
     "len(str)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import zipfile\n",
+    "import copy\n",
+    "import time\n",
+    "import xml.etree.ElementTree as ET\n",
+    "from typing import List, Dict, Any, Optional, Tuple\n",
+    "from utils.utils import translate_text, unzip_office_file, preprocess_text, postprocess_text, translate_single_text\n",
+    "from pymongo import MongoClient\n",
+    "import gridfs\n",
+    "from io import BytesIO\n",
+    "import shutil\n",
+    "import io\n",
+    "import re\n",
+    "from typing import Dict\n",
+    "\n",
+    "\n",
+    "NS_MAIN = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}\n",
+    "NS_DRAWING = {'xdr': \"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing\"}\n",
+    "NS_A = {'a': \"http://schemas.openxmlformats.org/drawingml/2006/main\"}\n",
+    "\n",
+    "# --- Hàm đăng ký namespace (quan trọng khi ghi file) ---\n",
+    "def register_namespaces(xml_file):\n",
+    "    \"\"\"Register the namespaces declared in an XML file with ElementTree.\n",
+    "\n",
+    "    Every prefix/URI pair reported by the file's start-ns events is passed\n",
+    "    to ET.register_namespace. The spreadsheet, spreadsheet-drawing and\n",
+    "    DrawingML namespaces are then registered as a fallback when the file\n",
+    "    declares neither their usual prefix nor their URI.\n",
+    "\n",
+    "    Args:\n",
+    "        xml_file: Source handed to ET.iterparse; callers in this file pass a path.\n",
+    "    \"\"\"\n",
+    "    declared = {}\n",
+    "    for _event, (prefix, uri) in ET.iterparse(xml_file, events=['start-ns']):\n",
+    "        declared[prefix] = uri\n",
+    "    for prefix, uri in declared.items():\n",
+    "        ET.register_namespace(prefix, uri)\n",
+    "\n",
+    "    declared_uris = set(declared.values())\n",
+    "\n",
+    "    # Spreadsheet namespace: prefer the default (empty) prefix, and fall back\n",
+    "    # to 'main' when the file already binds the empty prefix to something else.\n",
+    "    main_uri = NS_MAIN['main']\n",
+    "    if 'main' not in declared and main_uri not in declared_uris:\n",
+    "        fallback_prefix = 'main' if '' in declared else ''\n",
+    "        ET.register_namespace(fallback_prefix, main_uri)\n",
+    "\n",
+    "    # Drawing namespaces, only when the file did not declare them itself.\n",
+    "    for prefix, uri in (('xdr', NS_DRAWING['xdr']), ('a', NS_A['a'])):\n",
+    "        if prefix not in declared and uri not in declared_uris:\n",
+    "            ET.register_namespace(prefix, uri)\n",
+    "\n",
+    "\n",
+    "def extract_text_from_sheet(unzipped_folder_path: str) -> Optional[Tuple[List[Dict[str, Any]], Dict[str, Any]]]:\n",
+    "    \"\"\"\n",
+    "    Collect the translatable text nodes of an unzipped .xlsx workbook.\n",
+    "\n",
+    "    Scans xl/sharedStrings.xml, the inline strings of every\n",
+    "    xl/worksheets/*.xml and the text paragraphs of every xl/drawings/*.xml.\n",
+    "    For rich text, a deep copy of the first run's <rPr> is stored under\n",
+    "    'first_format' (None when that run has no <rPr>).\n",
+    "\n",
+    "    Args:\n",
+    "        unzipped_folder_path: Root folder of the extracted workbook.\n",
+    "\n",
+    "    Returns:\n",
+    "        (modifiable_nodes, global_data). Each node is a dict with the keys\n",
+    "        'type' ('shared_rich', 'shared_simple', 'inline' or 'drawing_text'),\n",
+    "        'original_text', 'element', 'first_format', 'source_file' and\n",
+    "        'sheet_name'. global_data holds the parsed trees and the paths they\n",
+    "        were read from. An error while processing one file is printed with\n",
+    "        its traceback and extraction continues with the next file.\n",
+    "    \"\"\"\n",
+    "    import traceback  # only needed for error reporting in the handlers below\n",
+    "\n",
+    "    modifiable_nodes = []\n",
+    "    shared_strings_path = os.path.join(unzipped_folder_path, \"xl\", \"sharedStrings.xml\")\n",
+    "    worksheets_folder = os.path.join(unzipped_folder_path, \"xl\", \"worksheets\")\n",
+    "    drawings_folder = os.path.join(unzipped_folder_path, \"xl\", \"drawings\")\n",
+    "\n",
+    "    shared_tree = None\n",
+    "    sheet_trees = {}\n",
+    "    drawing_trees = {}\n",
+    "\n",
+    "    # --- sharedStrings.xml ---\n",
+    "    if os.path.exists(shared_strings_path):\n",
+    "        try:\n",
+    "            register_namespaces(shared_strings_path)\n",
+    "            shared_tree = ET.parse(shared_strings_path)\n",
+    "            root_shared = shared_tree.getroot()\n",
+    "\n",
+    "            for si_element in root_shared.findall('main:si', NS_MAIN):\n",
+    "                # Every <t> below this <si>, whether or not it sits inside an <r>.\n",
+    "                t_elements = si_element.findall('.//main:t', NS_MAIN)\n",
+    "\n",
+    "                # The entry counts as rich text as soon as it has one direct <r> child.\n",
+    "                first_r = si_element.find('./main:r', NS_MAIN)\n",
+    "                is_rich_text = first_r is not None\n",
+    "\n",
+    "                first_rpr_clone = None\n",
+    "                if is_rich_text:\n",
+    "                    # Look inside the FIRST run only, so a later run's <rPr> is\n",
+    "                    # never mistaken for the first run's formatting (this matches\n",
+    "                    # how drawing paragraphs are handled further down).\n",
+    "                    first_rpr = first_r.find('./main:rPr', NS_MAIN)\n",
+    "                    if first_rpr is not None:\n",
+    "                        first_rpr_clone = copy.deepcopy(first_rpr)\n",
+    "\n",
+    "                full_text = \"\".join(t_node.text for t_node in t_elements if t_node.text)\n",
+    "                if not full_text or full_text.isspace():\n",
+    "                    continue\n",
+    "\n",
+    "                if is_rich_text:\n",
+    "                    modifiable_nodes.append({\n",
+    "                        'type': 'shared_rich',\n",
+    "                        'original_text': full_text,\n",
+    "                        'element': si_element,\n",
+    "                        'first_format': first_rpr_clone,  # None when the first <r> has no <rPr>\n",
+    "                        'source_file': os.path.join(\"xl\", \"sharedStrings.xml\"),\n",
+    "                        'sheet_name': None\n",
+    "                    })\n",
+    "                elif t_elements:\n",
+    "                    direct_t = si_element.find('./main:t', NS_MAIN)\n",
+    "                    if direct_t is not None:\n",
+    "                        modifiable_nodes.append({\n",
+    "                            'type': 'shared_simple',\n",
+    "                            'original_text': full_text,\n",
+    "                            'element': direct_t,  # reference to the <t> itself\n",
+    "                            'first_format': None,\n",
+    "                            'source_file': os.path.join(\"xl\", \"sharedStrings.xml\"),\n",
+    "                            'sheet_name': None\n",
+    "                        })\n",
+    "\n",
+    "        except Exception as e:\n",
+    "            print(f\"Lỗi xử lý sharedStrings: {e}\")\n",
+    "            traceback.print_exc()\n",
+    "\n",
+    "    # --- sheetX.xml files (inline strings) ---\n",
+    "    if os.path.isdir(worksheets_folder):\n",
+    "        for sheet_filename in sorted(os.listdir(worksheets_folder)):\n",
+    "            if not sheet_filename.lower().endswith(\".xml\"):\n",
+    "                continue\n",
+    "            sheet_file_path = os.path.join(worksheets_folder, sheet_filename)\n",
+    "            try:\n",
+    "                register_namespaces(sheet_file_path)\n",
+    "                sheet_tree = ET.parse(sheet_file_path)\n",
+    "                sheet_trees[sheet_filename] = sheet_tree\n",
+    "                root_sheet = sheet_tree.getroot()\n",
+    "                for cell in root_sheet.findall('.//main:c[@t=\"inlineStr\"]', NS_MAIN):\n",
+    "                    t_element = cell.find('.//main:is/main:t', NS_MAIN)\n",
+    "                    if t_element is not None and t_element.text is not None and t_element.text.strip():\n",
+    "                        modifiable_nodes.append({\n",
+    "                            'type': 'inline',\n",
+    "                            'original_text': t_element.text,\n",
+    "                            'element': t_element,\n",
+    "                            'first_format': None,\n",
+    "                            'source_file': os.path.join(\"xl\", \"worksheets\", sheet_filename),\n",
+    "                            'sheet_name': sheet_filename\n",
+    "                        })\n",
+    "            except Exception as e:\n",
+    "                print(f\"Lỗi xử lý sheet {sheet_filename}: {e}\")\n",
+    "                traceback.print_exc()\n",
+    "    else:\n",
+    "        print(f\"Cảnh báo: Không tìm thấy thư mục worksheets: {worksheets_folder}\")\n",
+    "\n",
+    "    # --- drawingX.xml files (text boxes, shapes with text) ---\n",
+    "    if os.path.isdir(drawings_folder):\n",
+    "        drawing_ns = {**NS_DRAWING, **NS_A}\n",
+    "        for drawing_filename in sorted(os.listdir(drawings_folder)):\n",
+    "            if not drawing_filename.lower().endswith(\".xml\"):\n",
+    "                continue\n",
+    "            drawing_file_path = os.path.join(drawings_folder, drawing_filename)\n",
+    "            try:\n",
+    "                register_namespaces(drawing_file_path)\n",
+    "                drawing_tree = ET.parse(drawing_file_path)\n",
+    "                drawing_trees[drawing_filename] = drawing_tree\n",
+    "                root_drawing = drawing_tree.getroot()\n",
+    "\n",
+    "                # Text lives in <xdr:txBody>, which holds <a:p> paragraphs.\n",
+    "                for p_element in root_drawing.findall('.//xdr:txBody/a:p', drawing_ns):\n",
+    "                    t_elements = p_element.findall('.//a:t', NS_A)\n",
+    "\n",
+    "                    # Formatting of the first direct <a:r> child, if it has one.\n",
+    "                    first_r = p_element.find('./a:r', NS_A)\n",
+    "                    first_rpr_clone = None\n",
+    "                    if first_r is not None:\n",
+    "                        first_rpr = first_r.find('./a:rPr', NS_A)\n",
+    "                        if first_rpr is not None:\n",
+    "                            first_rpr_clone = copy.deepcopy(first_rpr)\n",
+    "\n",
+    "                    full_text = \"\".join(t_node.text for t_node in t_elements if t_node.text)\n",
+    "                    if not full_text or full_text.isspace():\n",
+    "                        continue\n",
+    "\n",
+    "                    # The node references the whole <a:p>, not an individual <a:t>.\n",
+    "                    modifiable_nodes.append({\n",
+    "                        'type': 'drawing_text',\n",
+    "                        'original_text': full_text,\n",
+    "                        'element': p_element,\n",
+    "                        'first_format': first_rpr_clone,  # <a:rPr> of the first <a:r>, or None\n",
+    "                        'source_file': os.path.join(\"xl\", \"drawings\", drawing_filename),\n",
+    "                        'sheet_name': None  # drawings are not linked back to a sheet here\n",
+    "                    })\n",
+    "            except Exception as e:\n",
+    "                print(f\"Lỗi xử lý drawing {drawing_filename}: {e}\")\n",
+    "                traceback.print_exc()\n",
+    "    else:\n",
+    "        print(f\"Thông tin: Không tìm thấy thư mục drawings: {drawings_folder}\")\n",
+    "\n",
+    "    global_data = {\n",
+    "        \"shared_tree\": shared_tree,\n",
+    "        \"sheet_trees\": sheet_trees,\n",
+    "        \"drawing_trees\": drawing_trees,\n",
+    "        \"shared_strings_path\": shared_strings_path,\n",
+    "        \"worksheets_folder\": worksheets_folder,\n",
+    "        \"drawings_folder\": drawings_folder\n",
+    "    }\n",
+    "    return modifiable_nodes, global_data\n",
+    "\n",
+    "\n",
+    "\n",
+    "def zip_folder_to_excel_file(folder_path, file_name):\n",
+    "    \"\"\"Zip a folder into an in-memory .xlsx and store it in MongoDB GridFS.\n",
+    "\n",
+    "    Args:\n",
+    "        folder_path: Folder whose files become the archive members; member\n",
+    "            names are the file paths relative to this folder.\n",
+    "        file_name: File name recorded for the stored GridFS file.\n",
+    "\n",
+    "    Returns:\n",
+    "        The id returned by GridFS.put for the 'final_file' collection of the\n",
+    "        'excel' database, or None if zipping or storing raised an exception\n",
+    "        (the error is printed).\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Build the .xlsx archive entirely in memory.\n",
+    "        xlsx_buffer = io.BytesIO()\n",
+    "        with zipfile.ZipFile(xlsx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
+    "            for root, _, files in os.walk(folder_path):\n",
+    "                for file in files:\n",
+    "                    file_path = os.path.join(root, file)\n",
+    "                    zipf.write(file_path, os.path.relpath(file_path, folder_path))\n",
+    "\n",
+    "        # NOTE(review): the connection string (with credentials) is hard-coded;\n",
+    "        # it should come from configuration such as a MONGODB_URI setting.\n",
+    "        client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
+    "        try:\n",
+    "            fs = gridfs.GridFS(client['excel'], collection='final_file')\n",
+    "            file_id = fs.put(xlsx_buffer.getvalue(), filename=file_name)\n",
+    "        finally:\n",
+    "            # Always release the connection, even when the upload fails.\n",
+    "            client.close()\n",
+    "\n",
+    "        print(f\"✅ Đã lưu file Excel vào MongoDB với ID: {file_id}\")\n",
+    "        return file_id\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        print(f\"❌ Lỗi khi nén và lưu Excel vào MongoDB: {e}\")\n",
+    "        return None\n",
+    "    \n",
+    "\n",
+    "\n",
+    "def translate_xlsx(file_id, file_name, source_lang='en', target_lang='vi', batch_size_segments=50, max_words_per_segment=100, delay_between_requests=1):\n",
+    "    \"\"\"\n",
+    "    Translate an Excel workbook stored in GridFS and store the result.\n",
+    "\n",
+    "    The workbook is read from the 'root_file' collection of the 'excel'\n",
+    "    database and unzipped; its sheet names and text nodes are translated and\n",
+    "    the rebuilt .xlsx is saved through zip_folder_to_excel_file.\n",
+    "\n",
+    "    Args:\n",
+    "        file_id: GridFS id of the source workbook.\n",
+    "        file_name: File name given to the translated workbook.\n",
+    "        source_lang: Source language code passed to the translation helpers.\n",
+    "        target_lang: Target language code passed to the translation helpers.\n",
+    "        batch_size_segments: Maximum number of segments examined per batch.\n",
+    "        max_words_per_segment: Segments with more words than this are\n",
+    "            translated one at a time instead of in a batch.\n",
+    "        delay_between_requests: Seconds to sleep before a batch call once at\n",
+    "            least one API call has already been made.\n",
+    "\n",
+    "    Returns:\n",
+    "        The id returned by zip_folder_to_excel_file, or None when the XML\n",
+    "        changes could not be saved or the final workbook could not be stored.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If batch_size_segments is smaller than 1.\n",
+    "    \"\"\"\n",
+    "    if batch_size_segments < 1:\n",
+    "        # A non-positive batch size would never advance the loop below.\n",
+    "        raise ValueError(\"batch_size_segments must be at least 1\")\n",
+    "\n",
+    "    # NOTE(review): the connection string (with credentials) is hard-coded;\n",
+    "    # it should come from configuration such as a MONGODB_URI setting.\n",
+    "    client = MongoClient(\"mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0\")\n",
+    "    try:\n",
+    "        fs = gridfs.GridFS(client['excel'], collection='root_file')\n",
+    "        excel_file = BytesIO(fs.get(file_id).read())\n",
+    "    finally:\n",
+    "        # The source bytes are fully in memory; the connection is no longer needed.\n",
+    "        client.close()\n",
+    "\n",
+    "    xml_folder = unzip_office_file(excel_file)\n",
+    "    path_to_workbook_xml = os.path.join(xml_folder, \"xl\", \"workbook.xml\")\n",
+    "    translate_sheet_names_via_regex(path_to_workbook_xml, source_lang, target_lang)\n",
+    "\n",
+    "    modifiable_nodes, global_data = extract_text_from_sheet(xml_folder)\n",
+    "    original_texts = get_text_list_from_nodes(modifiable_nodes)\n",
+    "\n",
+    "    all_results = [None] * len(original_texts)\n",
+    "    current_index = 0\n",
+    "    api_call_counter = 0  # number of API calls so far, drives the delay logic\n",
+    "\n",
+    "    while current_index < len(original_texts):\n",
+    "        batch_texts_to_translate = []\n",
+    "        batch_original_indices = []  # 0-based positions in original_texts\n",
+    "        batch_end_index = min(current_index + batch_size_segments, len(original_texts))\n",
+    "        found_long_segment_at = -1\n",
+    "\n",
+    "        # 1. Collect the next batch, stopping at the first over-long segment.\n",
+    "        for i in range(current_index, batch_end_index):\n",
+    "            segment = original_texts[i]\n",
+    "            if count_words(segment) > max_words_per_segment:\n",
+    "                found_long_segment_at = i\n",
+    "                break\n",
+    "            batch_texts_to_translate.append(segment)\n",
+    "            batch_original_indices.append(i)\n",
+    "\n",
+    "        # 2. Translate whatever was collected before the long segment (if any).\n",
+    "        if batch_texts_to_translate:\n",
+    "            if api_call_counter > 0 and delay_between_requests > 0:\n",
+    "                time.sleep(delay_between_requests)\n",
+    "\n",
+    "            translated_batch = _translate_batch_helper(\n",
+    "                batch_texts_to_translate,\n",
+    "                [idx + 1 for idx in batch_original_indices],  # 1-based, for logging\n",
+    "                source_lang,\n",
+    "                target_lang\n",
+    "            )\n",
+    "            api_call_counter += 1\n",
+    "            for batch_idx, original_idx in enumerate(batch_original_indices):\n",
+    "                all_results[original_idx] = translated_batch[batch_idx]\n",
+    "\n",
+    "        # 3. Translate the long segment on its own, then resume right after it.\n",
+    "        if found_long_segment_at != -1:\n",
+    "            long_segment_index = found_long_segment_at\n",
+    "            long_segment_text = str(original_texts[long_segment_index])\n",
+    "            try:\n",
+    "                all_results[long_segment_index] = translate_single_text(long_segment_text, source_lang, target_lang)\n",
+    "                # Counted only when the call returned without raising.\n",
+    "                api_call_counter += 1\n",
+    "            except Exception as e:\n",
+    "                print(f\"    *** ERROR during translation of long segment {long_segment_index + 1}: {e}. Marking as failed.\")\n",
+    "                all_results[long_segment_index] = \"<translation_api_error>\"\n",
+    "            current_index = long_segment_index + 1\n",
+    "        else:\n",
+    "            current_index = batch_end_index\n",
+    "\n",
+    "    # Any segment left without a result keeps its original text.\n",
+    "    missing_count = 0\n",
+    "    final_texts_for_nodes = []\n",
+    "    for i, res in enumerate(all_results):\n",
+    "        if res is None:\n",
+    "            print(f\"LỖI LOGIC: Segment {i+1} không được xử lý! Giữ lại text gốc: '{original_texts[i]}'\")\n",
+    "            final_texts_for_nodes.append(original_texts[i])\n",
+    "            missing_count += 1\n",
+    "        else:\n",
+    "            final_texts_for_nodes.append(res)\n",
+    "\n",
+    "    if missing_count > 0:\n",
+    "        print(f\"CẢNH BÁO NGHIÊM TRỌNG: {missing_count} segments bị bỏ lỡ trong quá trình xử lý.\")\n",
+    "\n",
+    "    # Initialised up front so every failure path below returns None instead of\n",
+    "    # raising UnboundLocalError at the final return.\n",
+    "    final_id = None\n",
+    "\n",
+    "    if len(final_texts_for_nodes) != len(original_texts):\n",
+    "        print(f\"LỖI NGHIÊM TRỌNG: Số lượng text cuối cùng ({len(final_texts_for_nodes)}) không khớp với gốc ({len(original_texts)}). Hủy bỏ cập nhật.\")\n",
+    "        return final_id\n",
+    "\n",
+    "    # Attach the translated text to each node.\n",
+    "    for i, node_info in enumerate(modifiable_nodes):\n",
+    "        node_info['modified_text'] = final_texts_for_nodes[i]\n",
+    "\n",
+    "    if not apply_and_save_changes(modifiable_nodes, global_data):\n",
+    "        print(\"LỖI NGHIÊM TRỌNG: Không thể lưu thay đổi vào file XML.\")\n",
+    "        return final_id\n",
+    "\n",
+    "    # Only zip when the XML was saved successfully.\n",
+    "    final_id = zip_folder_to_excel_file(xml_folder, file_name)\n",
+    "    if final_id:\n",
+    "        # The extracted folder is removed only once the workbook is stored.\n",
+    "        shutil.rmtree(xml_folder)\n",
+    "    else:\n",
+    "        print(\"LỖI NGHIÊM TRỌNG: Không thể tạo file XLSX đã dịch cuối cùng.\")\n",
+    "    return final_id"
+   ]
   }
  ],
  "metadata": {