mintlee committed on
Commit 0e9ff78 · 1 Parent(s): 95bd308

Add application file
.env ADDED
@@ -0,0 +1 @@
GEMINI_API_KEY = AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg
README.md CHANGED
@@ -1,13 +1,3 @@
- ---
- title: MT Deploy
- emoji: 🐠
- colorFrom: green
- colorTo: green
- sdk: streamlit
- sdk_version: 1.43.2
- app_file: app.py
- pinned: false
- short_description: deploy Machine Translation
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Machine-Translation
+
+ - Link drive: https://drive.google.com/drive/folders/19htOXYBz88eNIWU0-_3xEn1JRU-JaIvW?usp=drive_link
db/mongodb.py ADDED
@@ -0,0 +1,194 @@
from pymongo import MongoClient
import gridfs
from bson import ObjectId
import os
from io import BytesIO
import magic

def connect_mongodb(db_name, collection_name):
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)
    return fs


def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_file", file_name=None, file_tail=".pptx"):
    """
    Save a PowerPoint (pptx) file to MongoDB via GridFS,
    but skip saving if a file with the same name already exists.

    :param uploaded_file: UploadedFile object from Streamlit
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    :param file_name: Name to store the file under (without .pptx). If None, the original name is used.
    :return: file_id if saved successfully, None if the file already exists
    """
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    # Determine the file name
    if not file_name:
        # Take the name from uploaded_file (e.g. "slide.pptx")
        file_name = uploaded_file.name
    else:
        # If only a bare name was given, append the extension when missing
        if not file_name.endswith(file_tail):
            file_name = file_name + file_tail

    # Check whether the file already exists in MongoDB
    existing_file = fs.find_one({"filename": file_name})
    if existing_file:
        print(f"⚠️ File '{file_name}' already exists in MongoDB. Not saving it again. Please choose another name.")
        client.close()
        return None

    # Make sure the file pointer is at the beginning
    uploaded_file.seek(0)
    file_bytes = uploaded_file.read()

    # Store the file contents (bytes) in MongoDB
    file_id = fs.put(file_bytes, filename=file_name)
    print(f"✅ File '{file_name}' was saved to '{collection_name}' with ID: {file_id}")
    client.close()
    return file_id

def delete_pptx_from_mongodb(file_id, db_name="ppt", collection_name="root_file"):
    """
    Delete a PowerPoint file from MongoDB by ID.

    :param file_id: ID of the file to delete (string or ObjectId)
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Convert the ID if necessary
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)

        # Check whether the file exists
        if fs.exists(file_id):
            fs.delete(file_id)
            print(f"✅ Deleted file with ID: {file_id}")
        else:
            print(f"⚠️ No file found with ID: {file_id}")
    except Exception as e:
        print(f"❌ Error deleting file: {e}")

    client.close()

def download_pptx_from_mongodb(file_id, save_path, save_name, db_name="ppt", collection_name="root_file"):
    """
    Download a PowerPoint file from MongoDB GridFS and save it locally.

    :param file_id: ID of the file to download (string or ObjectId)
    :param save_path: Directory to save the file to (e.g. 'D:/output')
    :param save_name: File name to save as (e.g. 'my_presentation.pptx')
    :param db_name: MongoDB database name (default: 'ppt')
    :param collection_name: GridFS collection name (default: 'root_file')
    """
    # Make sure the target directory exists
    os.makedirs(save_path, exist_ok=True)

    # Build the full file path
    full_file_path = os.path.join(save_path, save_name)

    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Convert the ID if necessary
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)

        # Fetch the file data from GridFS
        file_data = fs.get(file_id)

        # Write the data out to disk
        with open(full_file_path, "wb") as f:
            f.write(file_data.read())

        print(f"✅ File downloaded to: {full_file_path}")
    except Exception as e:
        print(f"❌ Error downloading file: {e}")
    finally:
        client.close()

def save_xml_to_gridfs(xml_content, file_name, db_name="ppt", collection_name="original_xml"):
    """
    Save XML to MongoDB GridFS.

    :param xml_content: XML string to save
    :param file_name: XML file name
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    """
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    # Check whether the file already exists
    existing_file = fs.find_one({"filename": file_name})
    if existing_file:
        print(f"⚠️ File '{file_name}' already exists in GridFS. Not saving it again.")
        return

    # Encode the XML string as bytes and store it in GridFS
    file_id = fs.put(xml_content.encode("utf-8"), filename=file_name)
    print(f"✅ XML '{file_name}' was saved to GridFS with ID: {file_id}")

def fetch_file_from_mongodb(db_name, collection_name, file_id):
    client = MongoClient("mongodb://localhost:27017/")  # Update if needed
    db = client[db_name]
    fs = gridfs.GridFS(db, collection_name)

    try:
        file_data = fs.get(file_id)
        pptx_io = BytesIO(file_data.read())
        pptx_io.seek(0)  # Reset to the start of the file
        return pptx_io, file_data.filename
    except Exception as e:
        print(f"Error fetching file from MongoDB: {e}")
        return None, None

def detect_file_type(uploaded_file):
    if uploaded_file is not None:
        try:
            file_bytes = uploaded_file.read(4096)  # Read more bytes for more reliable MIME detection
            mime = magic.Magic(mime=True)
            file_type = mime.from_buffer(file_bytes)
        except Exception as e:
            print(f"Error detecting file type: {e}")
            file_type = "Unknown"

        # Common MIME types
        mime_types = {
            "application/pdf": "PDF",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
            "application/vnd.ms-powerpoint": "PPTX",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel",
            "application/vnd.ms-excel": "Excel",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word",
            "application/msword": "Word",
            "text/csv": "CSV",
            "text/plain": "CSV"  # Some CSV files are detected as text/plain
        }

        detected_type = mime_types.get(file_type, "Unknown")

        # If still unsure, fall back to the file extension
        if detected_type == "Unknown":
            ext = os.path.splitext(uploaded_file.name)[1].lower()
            ext_mapping = {".csv": "CSV", ".docx": "Word", ".doc": "Word", ".xlsx": "Excel", ".pptx": "PPTX", ".pdf": "PDF"}
            detected_type = ext_mapping.get(ext, "Unknown")

        return detected_type
    return None
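A minimal sketch of how these helpers might be driven outside Streamlit (assumptions: a MongoDB instance on mongodb://localhost:27017 and a local `example.pptx`; a plain file handle stands in for the Streamlit UploadedFile, since both expose `.name`, `.seek()` and `.read()`):

```python
# Hypothetical driver for db/mongodb.py; not part of the app itself.
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, delete_pptx_from_mongodb

with open("example.pptx", "rb") as f:          # f.name == "example.pptx"
    file_id = save_file_to_mongodb(f, db_name="ppt", collection_name="root_file")

if file_id:
    # Read the stored file back as an in-memory BytesIO plus its stored filename.
    pptx_io, filename = fetch_file_from_mongodb("ppt", "root_file", file_id)
    print(filename, len(pptx_io.getvalue()), "bytes")

    # Remove the test document again.
    delete_pptx_from_mongodb(file_id, db_name="ppt", collection_name="root_file")
```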
excel/excel_translate.py ADDED
@@ -0,0 +1,174 @@
import xlwings as xw
from typing import Dict, List
from translate.translator import translate_text_dict
import math
import chardet
import io
import pandas as pd
import pymongo
import gridfs
from io import BytesIO
import tempfile
import os

def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en', target_lang: str = "fr", gemini_api: str = "", db_name: str = "excel"):
    # Connect to MongoDB
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs_input = gridfs.GridFS(db, collection="root_file")
    fs_output = gridfs.GridFS(db, collection="final_file")

    # Download the file from MongoDB
    file_data = fs_input.get(file_id).read()

    # Write it to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        temp_file.write(file_data)
        temp_file_path = temp_file.name

    # Start xlwings (run Excel hidden)
    app = xw.App(visible=False)
    wb = xw.Book(temp_file_path)  # Open the workbook from the temp file

    # Use the requested sheet, or all sheets
    sheets = [wb.sheets[sheet_name]] if sheet_name else wb.sheets

    for sheet in sheets:
        last_row = sheet.used_range.rows.count
        last_col = sheet.used_range.columns.count

        # Dictionaries holding the text to translate and the key-to-cell mapping
        text_dict: Dict[str, List[str]] = {}
        cell_map: Dict[str, any] = {}  # maps key -> cell object

        for row in range(1, last_row + 1):
            for col in range(1, last_col + 1):
                cell = sheet.cells[row, col]
                if isinstance(cell.value, str):
                    key = f"R{row}C{col}"  # key of the form R{row}C{col}
                    text_dict[key] = [cell.value]  # store the value as a single-element list
                    cell_map[key] = cell

        # Translate the whole dictionary in bulk
        translated_dict = translate_text_dict(text_dict, source_lang=from_lang, target_lang=target_lang, gemini_api=gemini_api)

        # Write the translated content back into the cells
        for key, cell in cell_map.items():
            if key in translated_dict:
                translated_text_list = translated_dict[key]
                if translated_text_list and len(translated_text_list) > 0:
                    cell.value = translated_text_list[0]

    # Save the workbook back to the temp file
    wb.save(temp_file_path)
    wb.close()
    app.quit()

    # Re-read the temp file and store it in MongoDB
    with open(temp_file_path, "rb") as f:
        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.xlsx")

    # Remove the temp file
    os.remove(temp_file_path)

    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
    return translated_file_id


def read_csv_with_auto_encoding(csv_path):
    # Read the file as raw bytes
    with open(csv_path, "rb") as f:
        raw_data = f.read()
    # Detect the encoding
    detect_result = chardet.detect(raw_data)
    encoding = detect_result["encoding"]
    confidence = detect_result["confidence"]

    print(f"Chardet guessed that '{csv_path}' has encoding = {encoding} (confidence = {confidence})")

    # If chardet could not detect anything, fall back to 'utf-8'
    if encoding is None:
        encoding = "utf-8"

    decoded_data = raw_data.decode(encoding, errors='replace')

    # Use io.StringIO to turn the string into a file-like object
    csv_data = io.StringIO(decoded_data)
    df = pd.read_csv(csv_data)
    return df


def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
    # Connect to MongoDB
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    fs_input = gridfs.GridFS(db, collection="root_file")
    fs_output = gridfs.GridFS(db, collection="final_file")

    # Download the file from MongoDB
    file_data = fs_input.get(file_id).read()

    # Write it to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
        temp_file.write(file_data)
        temp_file_path = temp_file.name

    df = read_csv_with_auto_encoding(temp_file_path)

    # If text_columns is not specified, we assume we want to translate everything that looks like text.
    # Otherwise, only translate the given columns.
    if text_columns is None:
        # Example heuristic: choose all object/string columns
        text_columns = df.select_dtypes(include=["object"]).columns.tolist()

    num_rows = len(df)
    num_chunks = math.ceil(num_rows / chunk_size)

    translated_df = df.copy()  # copy to store the final translations

    for chunk_index in range(num_chunks):
        start_idx = chunk_index * chunk_size
        end_idx = min((chunk_index + 1) * chunk_size, num_rows)
        chunk_df = df.iloc[start_idx:end_idx]

        # Build a dictionary structure. For example, row-based:
        # {
        #   "0": {"colA": "some text", "colB": "some text"},
        #   "1": {"colA": "some text", "colB": "some text"},
        #   ...
        # }
        chunk_dict = {}
        for i, row in chunk_df.iterrows():
            row_dict = {}
            for col in text_columns:
                row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
            chunk_dict[str(i)] = row_dict

        # Now call your LLM translator on this dictionary
        translated_chunk = translate_text_dict(
            text_dict=chunk_dict,
            source_lang=source_lang,
            target_lang=target_lang,
            gemini_api=gemini_api
        )

        # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
        for i_str, row_data in translated_chunk.items():
            i = int(i_str)
            for col, translated_val in row_data.items():
                translated_df.at[i, col] = translated_val

    # Save the translated data to a temporary file
    translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
    translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')

    # Re-read the temp file and store it in MongoDB
    with open(translated_file_path, "rb") as f:
        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")

    # Remove the temp files
    os.remove(temp_file_path)
    os.remove(translated_file_path)

    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
    return translated_file_id
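A sketch of how `translate_csv` might be driven end to end, assuming a local MongoDB with the default `csv` database, a `GEMINI_API_KEY` in the environment, and an illustrative `sample.csv`:

```python
# Hypothetical driver for excel/excel_translate.py; file names and DB layout follow the defaults above.
import os
import gridfs
import pymongo
from excel.excel_translate import translate_csv

client = pymongo.MongoClient("mongodb://localhost:27017")
fs_in = gridfs.GridFS(client["csv"], collection="root_file")

# Upload a source CSV into the input bucket, then translate it.
with open("sample.csv", "rb") as f:
    src_id = fs_in.put(f, filename="sample.csv")

out_id = translate_csv(
    file_id=src_id,
    source_lang="en",
    target_lang="vi",
    gemini_api=os.getenv("GEMINI_API_KEY", ""),
)

# The translated CSV is stored in csv/final_file under out_id.
fs_out = gridfs.GridFS(client["csv"], collection="final_file")
print(fs_out.get(out_id).filename)
```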
home.py ADDED
@@ -0,0 +1,22 @@
import streamlit as st
import pandas as pd
import numpy as np

st.title("Some Streamlit Demo, maybe")

st.sidebar.header("Input")
num_rows = st.sidebar.slider("Number of rows", min_value=10, max_value=100, value=20)
num_cols = st.sidebar.slider("Number of columns", min_value=2, max_value=10, value=3)

data = np.random.randn(num_rows, num_cols)
columns = [f"Column {i+1}" for i in range(num_cols)]
df = pd.DataFrame(data, columns=columns)

st.subheader("Generated Data Table")
st.dataframe(df)

st.subheader("Line Chart of the Data")
st.line_chart(df)

st.subheader("Statistics")
st.write(df.describe())
pages/upload.py ADDED
@@ -0,0 +1,134 @@
import streamlit as st
import google.generativeai as genai
from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
from powerpoint.xml_handling import (
    extract_text_from_xml, update_xml_with_translated_text_mongodb, ppt_to_xml_mongodb
)
from translate.translator import translate_text_dict
from powerpoint.pptx_object import create_translated_ppt
from excel.excel_translate import translate_xlsx, translate_csv
from word.word_translate import translate_docx

import dotenv
import os

dotenv.load_dotenv(".env")

# Configure the API key
api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

# Streamlit UI
st.title("Upload PPTX to MongoDB")

uploaded_file = st.file_uploader("Choose a file (PPTX, XLSX, CSV, DOCX)", type=["pptx", "xlsx", "csv", "docx"])
file_name_input = st.text_input("File name to save as (no need to include .pptx)", value="")

final_pptx_id = None  # Holds the file ID after processing

if uploaded_file is not None:
    if st.button("Upload"):
        file_type = detect_file_type(uploaded_file)
        st.write(f"Detected file type: {file_type}")
        if file_type == "PPTX":

            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, file_name=file_name_input)
            st.write(f"File ID: {file_id}")

            xml_file_id = ppt_to_xml_mongodb(file_id)
            text_dict = extract_text_from_xml(file_id=xml_file_id)
            translated_dict = translate_text_dict(text_dict, source_lang="Vietnamese", target_lang="English", gemini_api=api_key)

            final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
            st.write(f"Final XML ID: {final_xml_id}")

            # Keep the ID of the final PPTX file
            final_pptx_id = create_translated_ppt(
                db_name="ppt", original_ppt_id=file_id,
                translated_xml_id=final_xml_id, output_collection="final_pptx"
            )
            st.write(f"Final PPTX ID: {final_pptx_id}")

            # Confirm the result before offering the download
            if final_pptx_id:
                st.write("✅ The file is ready to download!")

                pptx_io, pptx_filename = fetch_file_from_mongodb("ppt", "final_pptx", final_pptx_id)

                if pptx_io:
                    # Download button for the translated file
                    st.download_button(
                        label="Click to Download",
                        data=pptx_io.getvalue(),  # convert to bytes for download
                        file_name=pptx_filename,
                        mime="application/vnd.openxmlformats-officedocument.presentationml.presentation"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")



        elif file_type == "Excel":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="excel", collection_name="root_file", file_name=file_name_input, file_tail=".xlsx")
            st.write(f"File ID: {file_id}")

            final_id = translate_xlsx(file_id=file_id, from_lang="en", target_lang="vi", gemini_api=api_key)
            st.write(f"Final Excel ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                excel_io, excel_filename = fetch_file_from_mongodb("excel", "final_file", final_id)

                if excel_io:
                    st.download_button(
                        label="Click to Download",
                        data=excel_io.getvalue(),
                        file_name=excel_filename,
                        mime="application/vnd.ms-excel"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")
        elif file_type == "CSV":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="csv", collection_name="root_file", file_name=file_name_input, file_tail=".csv")
            st.write(f"File ID: {file_id}")

            final_id = translate_csv(file_id=file_id, source_lang="en", target_lang="vi", gemini_api=api_key)
            st.write(f"Final CSV ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                csv_io, csv_filename = fetch_file_from_mongodb("csv", "final_file", final_id)

                if csv_io:
                    st.download_button(
                        label="Click to Download",
                        data=csv_io.getvalue(),
                        file_name=csv_filename,
                        mime="text/csv"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")

        elif file_type == "Word":
            file_id = save_file_to_mongodb(uploaded_file=uploaded_file, db_name="word", collection_name="root_file", file_name=file_name_input, file_tail=".docx")
            st.write(f"File ID: {file_id}")

            final_id = translate_docx(file_id=file_id, source_lang="en", target_lang="vi")
            st.write(f"Final Word ID: {final_id}")
            if final_id:
                st.write("✅ The file is ready to download!")

                docx_io, docx_filename = fetch_file_from_mongodb("word", "final_file", final_id)

                if docx_io:
                    st.download_button(
                        label="Click to Download",
                        data=docx_io.getvalue(),
                        file_name=docx_filename,
                        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                    )
                else:
                    st.error("❌ Could not download the file. Check the ID or try again later.")
powerpoint/__init__.py ADDED
File without changes
powerpoint/pptx_object.py ADDED
@@ -0,0 +1,357 @@
# ppt_objects.py
from pptx import Presentation
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
from pptx.enum.shapes import MSO_SHAPE_TYPE
import xml.etree.ElementTree as ET
from pptx.util import Pt
from pptx.dml.color import RGBColor
import re
import json

from pymongo import MongoClient
from gridfs import GridFS
from io import BytesIO


def apply_group_properties_recursive(shape, shape_index, parent_element):
    """Recursively applies properties to shapes within groups."""
    if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        group_element = parent_element.find(f".//group_element[@shape_index='{shape_index}']")
        if group_element is not None:
            for i, sub_shape in enumerate(shape.shapes):
                apply_group_properties_recursive(sub_shape, i, group_element)

                # Apply properties for sub-shapes WITHIN the group, based on their type.
                if sub_shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = group_element.find(f".//table_element[@shape_index='{i}']")
                    if table_element is not None:
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                table_data = json.loads(props_element.text)
                                apply_table_properties(sub_shape.table, table_data)
                            except (json.JSONDecodeError, KeyError) as e:
                                print(f"Error applying table properties (in group): {str(e)}")

                elif hasattr(sub_shape, "text_frame") and sub_shape.text_frame:
                    text_element = group_element.find(f".//text_element[@shape_index='{i}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(sub_shape, shape_data)
                            except (json.JSONDecodeError, KeyError) as e:
                                print(f"Error applying shape properties (in group): {str(e)}")

def get_alignment_value(alignment_str):
    """Convert alignment string (with extra characters) to PP_ALIGN enum value."""
    alignment_map = {
        'center': PP_ALIGN.CENTER,
        'left': PP_ALIGN.LEFT,
        'right': PP_ALIGN.RIGHT,
        'justify': PP_ALIGN.JUSTIFY
    }
    match = re.match(r"([A-Za-z]+)", alignment_str)
    return alignment_map.get(match.group(1).lower()) if match else None

def get_vertical_anchor(value):
    """Converts vertical_anchor string to MSO_ANCHOR enum."""
    mapping = {
        "TOP": MSO_ANCHOR.TOP,
        "MIDDLE": MSO_ANCHOR.MIDDLE,
        "BOTTOM": MSO_ANCHOR.BOTTOM
    }
    return mapping.get(value.upper().split()[0], MSO_ANCHOR.TOP)

def get_table_properties(table):
    """Extract complete table properties."""
    table_data = {
        'rows': len(table.rows),
        'cols': len(table.columns),
        'cells': []
    }
    for row in table.rows:
        row_data = []
        for cell in row.cells:
            cell_data = {
                'text': cell.text.strip(),
                'font_size': None,
                'font_name': None,
                'alignment': None,
                'margin_left': cell.margin_left,
                'margin_right': cell.margin_right,
                'margin_top': cell.margin_top,
                'margin_bottom': cell.margin_bottom,
                'vertical_anchor': str(cell.vertical_anchor) if cell.vertical_anchor else None,
                'font_color': None
            }
            if cell.text_frame.paragraphs:
                paragraph = cell.text_frame.paragraphs[0]
                if paragraph.runs:
                    run = paragraph.runs[0]
                    if hasattr(run.font, 'size') and run.font.size is not None:
                        cell_data['font_size'] = run.font.size.pt
                    if hasattr(run.font, 'name'):
                        cell_data['font_name'] = run.font.name
                    if hasattr(run.font, 'bold'):
                        cell_data['bold'] = run.font.bold
                    if hasattr(run.font, 'italic'):
                        cell_data['italic'] = run.font.italic
                    if (hasattr(run.font, 'color') and
                            run.font.color is not None and
                            hasattr(run.font.color, 'rgb') and
                            run.font.color.rgb is not None):
                        cell_data['font_color'] = str(run.font.color.rgb)
                if hasattr(paragraph, 'alignment'):
                    cell_data['alignment'] = f"{paragraph.alignment}" if paragraph.alignment else None
            row_data.append(cell_data)
        table_data['cells'].append(row_data)
    return table_data

def get_shape_properties(shape):
    """Extract all properties from a shape, with detailed debug prints."""
    shape_data = {
        'text': '',
        'font_size': None,
        'font_name': None,
        'alignment': None,
        'width': shape.width,
        'height': shape.height,
        'left': shape.left,
        'top': shape.top,
        'bold': None,
        'italic': None,
        'line_spacing_info': {
            'rule': None,
            'value': None
        },
        'space_before': None,
        'space_after': None,
        'font_color': None
    }

    if hasattr(shape, "text"):
        shape_data['text'] = shape.text.strip()
    if hasattr(shape, 'text_frame'):
        for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs):
            if paragraph.runs:
                run = paragraph.runs[0]  # Assuming properties are mostly consistent in the first run
                if hasattr(run.font, 'size') and run.font.size is not None:
                    shape_data['font_size'] = run.font.size.pt
                if hasattr(run.font, 'name'):
                    shape_data['font_name'] = run.font.name
                if hasattr(run.font, 'bold'):
                    shape_data['bold'] = run.font.bold
                if hasattr(run.font, 'italic'):
                    shape_data['italic'] = run.font.italic
                if (hasattr(run.font, 'color') and
                        run.font.color is not None and
                        hasattr(run.font.color, 'rgb') and
                        run.font.color.rgb is not None):
                    shape_data['font_color'] = str(run.font.color.rgb)

            if hasattr(paragraph, 'alignment') and paragraph.alignment is not None:
                shape_data['alignment'] = str(paragraph.alignment).split('.')[-1]
            if hasattr(paragraph, 'space_before'):
                shape_data['space_before'] = paragraph.space_before.pt if paragraph.space_before else None
            if hasattr(paragraph, 'space_after'):
                shape_data['space_after'] = paragraph.space_after.pt if paragraph.space_after else None

            if hasattr(paragraph, 'line_spacing') and paragraph.line_spacing:
                line_spacing = paragraph.line_spacing

                # If line_spacing is a large number (e.g. 84.99 pt), it is probably EXACTLY
                if isinstance(line_spacing, Pt) or line_spacing > 10:
                    line_spacing_rule = "EXACTLY"
                elif isinstance(line_spacing, float):
                    line_spacing_rule = "MULTIPLE"
                else:
                    line_spacing_rule = "UNKNOWN"

                shape_data['line_spacing_info'] = {
                    'rule': line_spacing_rule,
                    'value': line_spacing if isinstance(line_spacing, float) else None
                }

    return shape_data

def apply_shape_properties(shape, shape_data):
    """Apply saved properties to a shape."""
    try:
        shape.width = shape_data['width']
        shape.height = shape_data['height']
        shape.left = shape_data['left']
        shape.top = shape_data['top']
        shape.text = ""
        paragraph = shape.text_frame.paragraphs[0]
        run = paragraph.add_run()
        run.text = shape_data['text']
        if shape_data['font_size']:
            adjusted_size = shape_data['font_size'] * 0.9
            run.font.size = Pt(adjusted_size)

        if shape_data.get('font_name'):
            run.font.name = shape_data['font_name']
        else:
            run.font.name = "Arial"
        if shape_data.get('font_color'):
            run.font.color.rgb = RGBColor.from_string(shape_data['font_color'])
        if shape_data['bold'] is not None:
            run.font.bold = shape_data['bold']
        if shape_data['italic'] is not None:
            run.font.italic = shape_data['italic']
        if shape_data['alignment']:
            paragraph.alignment = get_alignment_value(shape_data['alignment'])

        line_spacing_info = shape_data.get('line_spacing_info', {})
        line_spacing_rule = line_spacing_info.get('rule')
        line_spacing_value = line_spacing_info.get('value')

        if line_spacing_rule and line_spacing_value is not None:
            if line_spacing_rule == "EXACTLY":
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "AT_LEAST":
                paragraph.line_spacing = Pt(line_spacing_value)
            elif line_spacing_rule == "MULTIPLE":
                paragraph.line_spacing = line_spacing_value
            else:
                print(f"⚠️ Unknown line spacing rule: {line_spacing_rule}")

        if shape_data['space_before']:
            paragraph.space_before = shape_data['space_before']
        if shape_data['space_after']:
            paragraph.space_after = shape_data['space_after']


    except Exception as e:
        print(f"Error applying shape properties: {str(e)}")


def apply_table_properties(table, table_data):
    """Apply saved properties to a PowerPoint table."""
    for row_idx, row in enumerate(table.rows):
        for col_idx, cell in enumerate(row.cells):
            try:
                cell_data = table_data['cells'][row_idx][col_idx]

                # Apply margins
                cell.margin_left = cell_data.get('margin_left', 0)
                cell.margin_right = cell_data.get('margin_right', 0)
                cell.margin_top = cell_data.get('margin_top', 0)
                cell.margin_bottom = cell_data.get('margin_bottom', 0)

                # Apply vertical_anchor (avoid using eval)
                if 'vertical_anchor' in cell_data:
                    cell.vertical_anchor = get_vertical_anchor(cell_data['vertical_anchor'])

                # Clear the old content and set the new text
                cell.text = ""
                paragraph = cell.text_frame.paragraphs[0]
                run = paragraph.add_run()
                run.text = cell_data.get('text', "")

                # Set the font size
                if 'font_size' in cell_data:
                    adjusted_size = cell_data['font_size'] * 0.9  # Keep the font proportion
                    run.font.size = Pt(adjusted_size)

                # Set the font family
                run.font.name = cell_data.get('font_name', 'Arial')

                # Font color
                if 'font_color' in cell_data:
                    run.font.color.rgb = RGBColor.from_string(cell_data['font_color'])

                # Bold and italic
                run.font.bold = cell_data.get('bold', False)
                run.font.italic = cell_data.get('italic', False)

                # Text alignment
                if 'alignment' in cell_data:
                    paragraph.alignment = get_alignment_value(cell_data['alignment'])

            except Exception as e:
                print(f"Error setting properties of cell [{row_idx}, {col_idx}]: {str(e)}")


def get_file_from_mongodb(db_name, collection_name, file_id):
    """Download a file from MongoDB GridFS."""
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = GridFS(db, collection_name)
    file_data = fs.get(file_id)
    return BytesIO(file_data.read())


def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
    """Save a file to MongoDB GridFS."""
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = GridFS(db, collection_name)
    file_id = fs.put(file_data, filename=file_name)
    return file_id

def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
    """Build the translated PowerPoint from MongoDB data and save it back to MongoDB."""
    try:
        # Connect to MongoDB and download the files
        original_ppt_io = get_file_from_mongodb(db_name, "root_file", original_ppt_id)
        translated_xml_io = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)

        # Load the original PowerPoint and the translated XML
        prs = Presentation(original_ppt_io)
        tree = ET.parse(translated_xml_io)
        root = tree.getroot()

        # Apply the translations
        for slide_number, slide in enumerate(prs.slides, 1):
            xml_slide = root.find(f".//slide[@number='{slide_number}']")
            if xml_slide is None:
                continue
            for shape_index, shape in enumerate(slide.shapes):
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    apply_group_properties_recursive(shape, shape_index, xml_slide)
                elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    table_element = xml_slide.find(f".//table_element[@shape_index='{shape_index}']")
                    if table_element is not None:
                        props_element = table_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                table_data = json.loads(props_element.text)
                                apply_table_properties(shape.table, table_data)
                            except Exception as e:
                                print(f"Error applying table properties: {str(e)}")
                elif hasattr(shape, "text"):
                    text_element = xml_slide.find(f".//text_element[@shape_index='{shape_index}']")
                    if text_element is not None:
                        props_element = text_element.find("properties")
                        if props_element is not None and props_element.text:
                            try:
                                shape_data = json.loads(props_element.text)
                                apply_shape_properties(shape, shape_data)
                            except Exception as e:
                                print(f"Error applying shape properties: {str(e)}")

        # Save the PowerPoint to MongoDB
        output_io = BytesIO()
        prs.save(output_io)
        output_io.seek(0)  # Reset the read position

        file_id = save_file_to_mongodb(db_name, output_collection, "translated_presentation.pptx", output_io)
        print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")

        return file_id
    except Exception as e:
        print(f"Error creating translated PowerPoint: {str(e)}")
        return None
powerpoint/pptx_processor.py ADDED
@@ -0,0 +1,50 @@
# ppt_processor.py
from pathlib import Path
from xml_handling import ppt_to_xml, translate_xml_file
from pptx_object import create_translated_ppt
import os

def process_ppt_file(ppt_path: Path, source_lang: str, target_lang: str):
    """Process a single PPT/PPTX file from XML extraction to final translation."""
    ppt_path = ppt_path.strip("'\"")
    ppt_path = ppt_path.replace("\\ ", " ")
    ppt_path = ppt_path.replace("\\'", "'")
    ppt_path = os.path.expanduser(ppt_path)
    ppt_path = Path(ppt_path).resolve()
    # convert this into a DB link on the server
    try:
        if not ppt_path.is_file():
            print(f"Error: '{ppt_path}' is not a valid file.")
            return
        if ppt_path.suffix.lower() not in ['.ppt', '.pptx']:
            print(f"Error: '{ppt_path}' is not a PowerPoint file.")
            return

        base_dir = ppt_path.parent

        # Original XML
        print(f"Generating original XML for {ppt_path.name}...")
        original_xml = ppt_to_xml(str(ppt_path))
        if original_xml:
            original_output_path = base_dir / f"{ppt_path.stem}_original.xml"
            with open(original_output_path, 'w', encoding='utf-8') as f:
                f.write(original_xml)
            print(f"Original XML saved: {original_output_path}")

            # Save original XML to MongoDB
            # save_xml_to_mongodb(original_xml, ppt_path.stem + "_original.xml")

        # Translated XML
        print(f"Generating translated XML (from {source_lang} to {target_lang}) for {ppt_path.name}...")
        translated_output_path = base_dir / f"{ppt_path.stem}_translated.xml"
        original_xml_path = base_dir / f"{ppt_path.stem}_original.xml"
        translate_xml_file(str(original_xml_path), str(translated_output_path), source_lang, target_lang)

        # Create Translated PPT
        print(f"Creating translated PPT for {ppt_path.name}...")
        output_filename = f"{ppt_path.stem}_translated{ppt_path.suffix}"
        output_ppt_path = base_dir / output_filename
        create_translated_ppt(str(ppt_path), str(translated_output_path), str(output_ppt_path))

    except Exception as e:
        print(f"Error in process_ppt_file for {ppt_path}: {str(e)}")
powerpoint/xml_handling.py ADDED
@@ -0,0 +1,368 @@
import xml.etree.ElementTree as ET
from xml.dom import minidom
import json
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from powerpoint.pptx_object import get_table_properties, get_shape_properties
from pymongo import MongoClient
import gridfs
from bson import ObjectId
from io import BytesIO


gemini_api = "AIzaSyDtBIjTSfbvuEsobNwjtdyi9gVpDrCaWPM"

def extract_text_from_group(group_shape, slide_number, shape_index, slide_element):
    """Extracts text from shapes within a group, only adding the group if it contains text."""
    group_element = ET.SubElement(slide_element, "group_element")
    group_element.set("shape_index", str(shape_index))
    group_element.set("group_name", group_shape.name)  # Add group name

    group_has_text = False  # Flag to track if the group contains any text

    for i, shape in enumerate(group_shape.shapes):
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            # Recursively check nested groups, and update group_has_text
            if extract_text_from_group(shape, slide_number, i, group_element):
                group_has_text = True
        elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
            table_element = ET.SubElement(group_element, "table_element")
            table_element.set("shape_index", str(i))
            table_data = get_table_properties(shape.table)
            props_element = ET.SubElement(table_element, "properties")
            props_element.text = json.dumps(table_data, indent=2)
            group_has_text = True
        elif hasattr(shape, "text_frame") and shape.text_frame:
            text_element = ET.SubElement(group_element, "text_element")
            text_element.set("shape_index", str(i))
            shape_data = get_shape_properties(shape)
            props_element = ET.SubElement(text_element, "properties")
            props_element.text = json.dumps(shape_data, indent=2)
            if shape_data.get("text") or (
                "paragraphs" in shape_data
                and any(p.get("text") for p in shape_data["paragraphs"])
            ):
                group_has_text = True

    # Only keep the group element if it contains text
    if not group_has_text:
        slide_element.remove(group_element)
        return False
    return True

def extract_text_from_slide(slide, slide_number, translate=False):
    """Extract all text elements from a slide."""
    slide_element = ET.Element("slide")
    slide_element.set("number", str(slide_number))

    for shape_index, shape in enumerate(slide.shapes):
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            extract_text_from_group(shape, slide_number, shape_index, slide_element)
        elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
            table_element = ET.SubElement(slide_element, "table_element")
            table_element.set("shape_index", str(shape_index))
            table_data = get_table_properties(shape.table)
            props_element = ET.SubElement(table_element, "properties")
            props_element.text = json.dumps(table_data, indent=2)
        elif hasattr(shape, "text"):
            text_element = ET.SubElement(slide_element, "text_element")
            text_element.set("shape_index", str(shape_index))
            shape_data = get_shape_properties(shape)
            props_element = ET.SubElement(text_element, "properties")
            props_element.text = json.dumps(shape_data, indent=2)
    return slide_element

def ppt_to_xml_mongodb(ppt_file_id: str, db_name="ppt"):
    """
    Convert a PowerPoint stored in MongoDB to XML and save the XML back to MongoDB.

    :param ppt_file_id: ID of the original PPT file in MongoDB (original_pptx)
    :param db_name: MongoDB database name
    :return: ID of the XML file in MongoDB (original_xml)
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]

    fs_ppt = gridfs.GridFS(db, collection="root_file")     # original PPT
    fs_xml = gridfs.GridFS(db, collection="original_xml")  # stored XML

    try:
        # Fetch the PPT file from MongoDB
        if not isinstance(ppt_file_id, ObjectId):
            ppt_file_id = ObjectId(ppt_file_id)
        ppt_file = fs_ppt.get(ppt_file_id)
        prs = Presentation(BytesIO(ppt_file.read()))

        # Build the XML
        root = ET.Element("presentation")
        root.set("file_name", ppt_file.filename)

        with ThreadPoolExecutor(max_workers=4) as executor:
            future_to_slide = {
                executor.submit(extract_text_from_slide, slide, slide_number): slide_number
                for slide_number, slide in enumerate(prs.slides, 1)
            }
            for future in future_to_slide:
                slide_number = future_to_slide[future]
                try:
                    slide_element = future.result()
                    root.append(slide_element)
                except Exception as e:
                    print(f"Error processing slide {slide_number}: {str(e)}")

        xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")

        # Save the XML to MongoDB
        xml_output = BytesIO(xml_str.encode("utf-8"))
        xml_file_id = fs_xml.put(xml_output, filename=f"{ppt_file.filename}.xml")

        print(f"✅ XML was saved to MongoDB (original_xml) with file_id: {xml_file_id}")

        return xml_file_id

    except Exception as e:
        print(f"❌ Error converting PPT to XML: {str(e)}")
        return None
    finally:
        client.close()




def extract_text_from_xml(file_id=None, filename=None, db_name="ppt", collection_name="original_xml") -> Dict[str, List[str]]:
    """
    Load XML from MongoDB and extract the text from each slide.

    :param file_id: ID of the file in MongoDB (ObjectId or string)
    :param filename: Name of the file to look up in MongoDB (e.g. "file.xml")
    :param db_name: MongoDB database name
    :param collection_name: GridFS collection name
    :return: Dictionary {slide_number: [text1, text2, ...]}
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    fs = gridfs.GridFS(db, collection=collection_name)

    try:
        # Look up the file by file_id or filename
        if file_id:
            if not isinstance(file_id, ObjectId):
                file_id = ObjectId(file_id)
            file_data = fs.get(file_id)
        elif filename:
            file_data = fs.find_one({"filename": filename})
            if not file_data:
                print(f"❌ File '{filename}' not found in MongoDB!")
                return {}
        else:
            print("❌ Either 'file_id' or 'filename' must be provided.")
            return {}

        # Read the XML content from MongoDB
        xml_content = file_data.read().decode("utf-8")
        # print(f"✅ xml_content: {xml_content}")
        # Parse it into an XML tree
        root = ET.fromstring(xml_content)
        slide_texts = {}

        # Walk through each slide
        for slide in root.findall("slide"):
            slide_number = slide.get("number")
            texts = []

            # Helper function to extract text recursively
            def extract_text_recursive(element):
                if element.tag == "text_element":
                    props = element.find("properties")
                    if props is not None and props.text:
                        try:
                            shape_data = json.loads(props.text)
                            # Handle both direct 'text' and paragraph-based text
                            if 'text' in shape_data:
                                texts.append(shape_data['text'])
                            elif 'paragraphs' in shape_data:
                                for paragraph in shape_data['paragraphs']:
                                    if 'text' in paragraph:
                                        texts.append(paragraph['text'])
                                    # Also extract run-level text
                                    elif 'runs' in paragraph:
                                        for run in paragraph['runs']:
                                            if 'text' in run:
                                                texts.append(run['text'])

                        except json.JSONDecodeError:
                            pass  # Ignore if JSON is invalid

                elif element.tag == "table_element":
                    props = element.find("properties")
                    if props is not None and props.text:
                        try:
                            table_data = json.loads(props.text)
                            for row in table_data.get("cells", []):
                                for cell in row:
                                    texts.append(cell.get("text", ""))
                        except json.JSONDecodeError:
                            pass  # Ignore if JSON is invalid

                # Recursively process children of group_element
                elif element.tag == "group_element":
                    for child in element:
                        extract_text_recursive(child)

            # Iterate through all direct children of the slide
            for child in slide:
                extract_text_recursive(child)

            slide_texts[str(slide_number)] = texts  # Ensure slide number is a string
        print(slide_texts)
        return slide_texts

    except Exception as e:
        print(f"❌ Error processing the XML: {e}")
        return {}
    finally:
        client.close()




def adjust_size(original_text, translated_text, data_container):
    """Adjust font size if translated text is significantly longer."""

    if not original_text or not translated_text:
        return

    original_len = len(original_text)
    translated_len = len(translated_text)
    length_ratio = translated_len / original_len if original_len > 0 else 1  # Avoid division by 0

    if length_ratio > 1.5:  # Adjust threshold as needed
        if 'paragraphs' in data_container:
            for paragraph in data_container['paragraphs']:
                if 'runs' in paragraph:
                    for run in paragraph['runs']:
                        if run.get('font') and run['font'].get('size'):
                            run['font']['size'] = max(6, int(run['font']['size'] * 0.8))

        elif 'font' in data_container and data_container['font'].get('size'):
            data_container['font']['size'] = max(6, int(data_container['font']['size'] * 0.8))




def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[str, List[str]], db_name="ppt"):
    """
    Load XML from MongoDB (collection original_xml), apply the translated content, and save the result to the final_xml collection.

    :param file_id: ID of the file in MongoDB (original_xml)
    :param translated_dict: Dictionary {slide_number: [translated_text1, translated_text2, ...]}
    :param db_name: MongoDB database name
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]

    fs_original = gridfs.GridFS(db, collection="original_xml")  # read from original_xml
    fs_final = gridfs.GridFS(db, collection="final_xml")        # write to final_xml

    try:
        # Load the file from MongoDB (original_xml)
        if not isinstance(file_id, ObjectId):
            file_id = ObjectId(file_id)
        file_data = fs_original.get(file_id)
        xml_content = file_data.read().decode("utf-8")

        # Parse the XML string into a tree
        root = ET.fromstring(xml_content)

        # Apply the translated content
        for slide in root.findall("slide"):
            slide_num = slide.get("number")
            if slide_num in translated_dict:
                translated_texts = translated_dict[slide_num]
                text_index = 0  # Keep track of the current translated text

                def update_element_recursive(element):
                    nonlocal text_index  # Access and modify the outer scope's index

                    if element.tag == "text_element":
                        props = element.find("properties")
                        if props is not None and props.text:
                            try:
                                shape_data = json.loads(props.text)
                                original_text = ""

                                # Handle direct text and paragraph-based text
                                if 'text' in shape_data:
                                    original_text = shape_data['text']
                                    if text_index < len(translated_texts):
                                        shape_data['text'] = translated_texts[text_index]
                                        adjust_size(original_text, translated_texts[text_index], shape_data)
                                        text_index += 1
                                elif 'paragraphs' in shape_data:
                                    for paragraph in shape_data['paragraphs']:
                                        if 'text' in paragraph:
                                            original_text = paragraph['text']
                                            if text_index < len(translated_texts):
                                                paragraph['text'] = translated_texts[text_index]
                                                adjust_size(original_text, translated_texts[text_index], paragraph)
                                                text_index += 1
                                        elif 'runs' in paragraph:
                                            for run in paragraph['runs']:
                                                if 'text' in run:
                                                    original_text = run['text']
                                                    if text_index < len(translated_texts):
                                                        run['text'] = translated_texts[text_index]
                                                        adjust_size(original_text, translated_texts[text_index], run)
                                                        text_index += 1
                                props.text = json.dumps(shape_data, indent=2)
                            except json.JSONDecodeError:
                                print(f"JSONDecodeError in text_element on slide {slide_num}")

                    elif element.tag == "table_element":
                        props = element.find("properties")
                        if props is not None and props.text:
                            try:
                                table_data = json.loads(props.text)
                                for row in table_data.get("cells", []):
                                    for cell in row:
                                        original_text = cell.get('text', '')
                                        if text_index < len(translated_texts):
                                            cell['text'] = translated_texts[text_index]
                                            adjust_size(original_text, translated_texts[text_index], cell)
                                            text_index += 1
                                props.text = json.dumps(table_data, indent=2)
                            except json.JSONDecodeError:
                                print(f"JSONDecodeError in table_element on slide {slide_num}")

                    elif element.tag == "group_element":
                        print("Group element found")
                        for child in element:
                            update_element_recursive(child)  # Recursively process children

                # Start the recursive update from the slide's direct children
                for child in slide:
                    update_element_recursive(child)

        # Convert the XML back into a pretty-printed string
        updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")

        # Save the updated file to MongoDB (final_xml)
        new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}_translated.xml")
        print(f"✅ The updated XML was saved to MongoDB (final_xml) with file_id: {new_file_id}")

        return new_file_id

    except Exception as e:
        print(f"❌ Error updating the XML: {e}")
        return None
    finally:
        client.close()
test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
translate/translator.py ADDED
@@ -0,0 +1,64 @@
import json
from typing import Dict, List
from google import genai

def translate_text_dict(text_dict: Dict[str, List[str]], source_lang: str, target_lang: str = "vi", gemini_api: str = "") -> Dict[str, List[str]]:
    def translate_batch(batch_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """Translates a single batch of text."""
        prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}
The text is in {source_lang}, with a chance of there being phrases in other languages as well.

Read through the entire dictionary, then translate the texts into {target_lang} so that the meaning is as close to the intended context as possible.

Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.

Aim for brevity if possible so that the length of the translations matches the length of the original texts, but prioritize accuracy above all.
Return the translated texts formatted like the original dictionary. Do NOT say anything else. Return it as a JSON block."""

        client = genai.Client(api_key=gemini_api)
        response = client.models.generate_content(
            model="gemini-2.0-flash", contents=prompt)  # Use a model appropriate for your needs and API key.

        # Handle potential errors in the response, including rate limits and invalid JSON.
        try:
            # More robust JSON parsing: Handle code blocks, markdown, and other variations.
            response_text = response.text
            start = response_text.find('{')
            end = response_text.rfind('}') + 1

            if start == -1 or end == 0:
                raise ValueError("Invalid JSON response from Gemini API: No object found.")

            json_string = response_text[start:end]
            trans_dict = json.loads(json_string)
            return trans_dict
        except (ValueError, json.JSONDecodeError) as e:
            print(f"Error processing Gemini API response: {e}")
            print(f"Raw response text: {response.text}")  # Print the raw response for debugging
            return {}  # Return an empty dict on error (or raise, depending on your needs)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}


    batch_size = 30  # Adjust as needed, based on testing and Gemini's context window limits
    translated_dict = {}
    keys = list(text_dict.keys())

    # Process in batches
    for i in range(0, len(keys), batch_size):
        batch_keys = keys[i:i + batch_size]
        batch_dict = {key: text_dict[key] for key in batch_keys}
        translated_batch = translate_batch(batch_dict)

        # Merge results
        if translated_batch:  # Only merge if the translation was successful
            translated_dict.update(translated_batch)
        else:
            print(f"Skipping batch {i // batch_size} due to translation error.")

    return translated_dict
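A small usage sketch of `translate_text_dict`, assuming the `google-genai` package is installed and a valid key is available in `GEMINI_API_KEY`; the sample dictionary is illustrative, not part of this commit:

```python
# Hypothetical call into translate/translator.py; keys mirror the slide/cell identifiers the pipelines build.
import os
from translate.translator import translate_text_dict

sample = {
    "1": ["Hello everyone", "Quarterly results"],
    "2": ["Thank you"],
}

translated = translate_text_dict(
    sample,
    source_lang="English",
    target_lang="Vietnamese",
    gemini_api=os.getenv("GEMINI_API_KEY", ""),
)
print(translated)  # same keys with translated strings; batches that fail are skipped
```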
word/word_translate.py ADDED
@@ -0,0 +1,246 @@
+ import os
+ import docx
+ from docx import Document
+ from google import genai  # Gemini client used for LLM translation
+ import json
+ from docx.oxml import OxmlElement
+ from copy import deepcopy
+ import io
+ from pymongo import MongoClient
+ from gridfs import GridFS
+ from deep_translator import GoogleTranslator
+
+ gemini_api = "AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg"
+ target_language = 'vi'
+ source_language = 'en'
+
+ def batch_translate(texts, source_lang='en', target_lang="fr"):
+     """Translates multiple text segments in a single API call."""
+     if not texts:
+         return texts  # Skip if empty
+
+     prompt = f"""
+ Translate the following JSON from {source_lang} into {target_lang} while preserving names, links, symbols, and formatting:
+ {json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])}
+
+ - The JSON above is an array of objects, each with "index" and "text" keys.
+ - Ensure **one-to-one correspondence**: the output must have exactly as many items as the input.
+ - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
+ - Return only valid JSON: an array of translated objects.
+ - If the original array is empty, return an empty array.
+ """
+
+     client = genai.Client(api_key=gemini_api)
+     response = client.models.generate_content(
+         model="gemini-2.0-flash", contents=prompt)
+
+     # Extract the JSON array from the response, tolerating markdown code fences.
+     raw = response.text.strip()
+     start = raw.find('[')
+     end = raw.rfind(']') + 1
+     translated_output = json.loads(raw[start:end])
+
+     return [item["text"] for item in translated_output]
+
+ def merge_runs(runs):
+     """Merges adjacent runs that share the same style and formatting."""
+     merged_runs = []
+     for run in runs:
+         if merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run):
+             if (
+                 run.style == merged_runs[-1].style and
+                 merged_runs[-1].bold == run.bold and
+                 merged_runs[-1].italic == run.italic and
+                 merged_runs[-1].underline == run.underline and
+                 merged_runs[-1].font.size == run.font.size and
+                 merged_runs[-1].font.color.rgb == run.font.color.rgb and
+                 merged_runs[-1].font.name == run.font.name
+             ):
+                 # Same formatting as the previous run: fold the text into it.
+                 merged_runs[-1].text += run.text
+             else:
+                 merged_runs.append(run)
+         else:
+             # First item, or not a pair of plain runs (e.g. a hyperlink): keep as-is.
+             merged_runs.append(run)
+     return merged_runs
+
+ NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+
+ def translate_paragraphs(doc, source_lang, target_lang):
+     paragraphs = []
+     for para in doc.paragraphs:
+         for run in merge_runs(para.iter_inner_content()):
+             if isinstance(run, docx.text.run.Run):
+                 paragraphs.append(run.text)
+
+     # Batch by character count so each request stays under ~5000 characters.
+     translated_paragraphs = []
+     temp_batch = []
+     chars = 0
+     for para in paragraphs:
+         if len(para) + chars > 5000:
+             translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
+             temp_batch = []
+             chars = 0
+         chars += len(para)
+         temp_batch.append(para)
+     translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
+
+     if len(translated_paragraphs) > 0:
+         # Write the translated text back into the document.
+         para_index = 0
+         for para in doc.paragraphs:
+             original_para = deepcopy(para)
+             para.clear()  # Remove text while keeping paragraph properties
+             for run in merge_runs(original_para.iter_inner_content()):
+                 if isinstance(run, docx.text.run.Run):
+                     translated_text = translated_paragraphs[para_index]
+                     try:
+                         translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # Drop invalid characters
+                     except UnicodeEncodeError:
+                         translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # Replace invalid characters
+                     drawing = run._element.find(f".//{NS_W}drawing")
+                     pict = run._element.find(f".//{NS_W}pict")
+
+                     # Create a new run with the translated text and copy the formatting
+                     new_run = para.add_run(translated_text)
+                     new_run.style = run.style
+
+                     if drawing is not None:
+                         new_run._element.append(drawing)
+                     elif pict is not None:
+                         new_run._element.append(pict)
+
+                     # Copy formatting from the original run
+                     new_run.bold = run.bold
+                     new_run.italic = run.italic
+                     new_run.underline = run.underline
+                     new_run.font.size = run.font.size
+                     new_run.font.color.rgb = run.font.color.rgb
+                     new_run.font.name = run.font.name
+                     para_index += 1
+                 elif isinstance(run, docx.text.hyperlink.Hyperlink):
+                     parent = run._element
+                     tag = parent.tag.split("}")[-1]
+
+                     # Re-attach the hyperlink element unchanged, preserving its namespace
+                     new_hyperlink = OxmlElement(f"w:{tag}")
+                     for attr in parent.attrib:
+                         new_hyperlink.set(attr, parent.get(attr))
+                     for child in parent:
+                         new_hyperlink.append(child)
+                     para._element.append(new_hyperlink)
+
+
+ def translate_tables(doc, source_lang, target_lang):
+     table_texts = []
+
+     for table in doc.tables:
+         for row in table.rows:
+             for cell in row.cells:
+                 for para in cell.paragraphs:
+                     for run in merge_runs(para.iter_inner_content()):
+                         if isinstance(run, docx.text.run.Run):
+                             table_texts.append(run.text)
+
+     # Batch by character count, same as translate_paragraphs.
+     translated_tables = []
+     temp_batch = []
+     chars = 0
+     for para in table_texts:
+         if len(para) + chars > 5000:
+             translated_tables += batch_translate(temp_batch, source_lang, target_lang)
+             temp_batch = []
+             chars = 0
+         chars += len(para)
+         temp_batch.append(para)
+     translated_tables += batch_translate(temp_batch, source_lang, target_lang)
+
+     if len(translated_tables) > 0:
+         table_index = 0
+         for table in doc.tables:
+             for row in table.rows:
+                 for cell in row.cells:
+                     for para in cell.paragraphs:
+                         original_para = deepcopy(para)
+                         para.clear()  # Remove text while keeping paragraph properties
+                         for run in merge_runs(original_para.iter_inner_content()):
+                             if isinstance(run, docx.text.run.Run):
+                                 translated_text = translated_tables[table_index]
+                                 try:
+                                     translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')  # Drop invalid characters
+                                 except UnicodeEncodeError:
+                                     translated_text = translated_text.encode('utf-8', 'replace').decode('utf-8')  # Replace invalid characters
+                                 drawing = run._element.find(f".//{NS_W}drawing")
+                                 pict = run._element.find(f".//{NS_W}pict")
+
+                                 # Create a new run with the translated text and copy the formatting
+                                 new_run = para.add_run(translated_text)
+                                 new_run.style = run.style
+
+                                 if drawing is not None:
+                                     new_run._element.append(drawing)
+                                 elif pict is not None:
+                                     new_run._element.append(pict)
+
+                                 # Copy formatting from the original run
+                                 new_run.bold = run.bold
+                                 new_run.italic = run.italic
+                                 new_run.underline = run.underline
+                                 new_run.font.size = run.font.size
+                                 new_run.font.color.rgb = run.font.color.rgb
+                                 new_run.font.name = run.font.name
+                                 table_index += 1
+                             elif isinstance(run, docx.text.hyperlink.Hyperlink):
+                                 parent = run._element
+                                 tag = parent.tag.split("}")[-1]
+
+                                 # Re-attach the hyperlink element unchanged, preserving its namespace
+                                 new_hyperlink = OxmlElement(f"w:{tag}")
+                                 for attr in parent.attrib:
+                                     new_hyperlink.set(attr, parent.get(attr))
+                                 for child in parent:
+                                     new_hyperlink.append(child)
+                                 para._element.append(new_hyperlink)
+
+ def translate_header_footer(doc, source_lang, target_lang):
+     head_foot = []
+     for section in doc.sections:
+         for header in section.header.paragraphs:
+             for run in header.runs:
+                 head_foot.append(run.text)
+         for footer in section.footer.paragraphs:
+             for run in footer.runs:
+                 head_foot.append(run.text)
+     translated_head_foot = batch_translate(head_foot, source_lang, target_lang)
+
+     i = 0
+     for section in doc.sections:
+         for header in section.header.paragraphs:
+             for run in header.runs:
+                 run.text = translated_head_foot[i]
+                 i += 1
+         for footer in section.footer.paragraphs:
+             for run in footer.runs:
+                 run.text = translated_head_foot[i]
+                 i += 1
+
+ def translate_docx(file_id, source_lang='en', target_lang='fr', db_name='word'):
+     client = MongoClient('mongodb://localhost:27017/')
+     db = client[db_name]
+     fs_input = GridFS(db, collection="root_file")
+     fs_output = GridFS(db, collection="final_file")
+
+     file_data = fs_input.get(file_id).read()
+     input_doc = Document(io.BytesIO(file_data))
+
+     translate_paragraphs(input_doc, source_lang, target_lang)
+     translate_tables(input_doc, source_lang, target_lang)
+     translate_header_footer(input_doc, source_lang, target_lang)
+
+     output_stream = io.BytesIO()
+     input_doc.save(output_stream)
+     output_stream.seek(0)
+
+     translated_file_id = fs_output.put(output_stream, filename=f"{target_lang}_translated.docx")
+     print(f"Translation complete! Saved with file ID: {translated_file_id}")
+
+     return translated_file_id
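`translate_docx` ties the module together: it loads the source .docx from the `root_file` GridFS collection, translates paragraphs, tables, headers, and footers in place, and stores the result in `final_file`. A minimal end-to-end sketch, assuming MongoDB is running locally and using placeholder file names:

```python
# Illustrative sketch: file names are placeholders; assumes MongoDB on localhost:27017.
from pymongo import MongoClient
from gridfs import GridFS
from word.word_translate import translate_docx

client = MongoClient("mongodb://localhost:27017/")
db = client["word"]

# Upload the source document into the collection translate_docx reads from.
with open("report.docx", "rb") as f:
    file_id = GridFS(db, collection="root_file").put(f.read(), filename="report.docx")

# Translate and store the result in the "final_file" collection.
translated_id = translate_docx(file_id, source_lang="en", target_lang="vi", db_name="word")

# Download the translated document.
data = GridFS(db, collection="final_file").get(translated_id).read()
with open("report_vi.docx", "wb") as f:
    f.write(data)
```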