mintlee committed on
Commit
bdcb5e5
·
1 Parent(s): ce94f1c
Files changed (4) hide show
  1. pages/upload.py +92 -25
  2. test.ipynb +0 -0
  3. word/word_helper.py +29 -26
  4. word_helper.py +398 -0
pages/upload.py CHANGED
@@ -12,43 +12,110 @@ dotenv.load_dotenv(".env")
12
 
13
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
14
 
15
- st.title("Translate Your File Easily! 🌍")
 
16
 
17
- uploaded_file = st.file_uploader("📂 Chọn file để dịch")
18
- source_lang = st.selectbox("🌐 Chọn ngôn ngữ của tài liệu", ["english", "vietnamese"])
19
- target_lang = st.selectbox("🌐 Chọn ngôn ngữ muốn dịch sang", ["english", "vietnamese"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def process_file(file, file_type):
23
  progress_bar = st.progress(0)
24
- file_id, file_name = save_file_to_mongodb(uploaded_file=file, db_name=file_type.lower(), collection_name="root_file")
25
- progress_bar.progress(20)
26
- st.write(f"📂 File ID: {file_id}")
27
-
28
- if file_type == "PPTX":
29
- final_id = translate_pptx(file_id, file_name, source_lang = source_lang, target_lang = target_lang, slides_per_batch=5)
30
- progress_bar.progress(60)
31
- elif file_type == "Excel":
32
- final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
33
- elif file_type == "CSV":
34
- final_id = translate_csv(file_id = file_id, source_lang = source_lang, target_lang = target_lang)
35
- elif file_type == "Word":
36
- final_id = translate_docx(file_id = file_id, file_name = file_name , source_lang = source_lang, target_lang = target_lang)
37
- else:
38
- st.error("❌ Loại file không hỗ trợ!")
39
- return
 
 
40
 
41
  progress_bar.progress(80)
42
- st.write("✅ File đã được dịch xong!")
43
- file_io, file_name = fetch_file_from_mongodb(file_type.lower(), "final_file", final_id)
44
- progress_bar.progress(100)
 
45
 
46
  if file_io:
 
47
  st.download_button("⬇️ Tải file về", data=file_io.getvalue(), file_name=file_name)
48
  else:
49
  st.error("❌ Không thể tải xuống file. Vui lòng thử lại!")
50
 
51
  if uploaded_file and st.button("🚀 Upload và dịch ngay!"):
52
- file_type = detect_file_type(uploaded_file)
53
- st.write(f"🔍 Loại file phát hiện: {file_type}")
 
54
  process_file(uploaded_file, file_type)
 
12
 
13
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
14
 
15
+ # Cấu hình trang
16
+ st.set_page_config(page_title="Translate Your File", page_icon="🌍", layout="centered")
17
 
18
+ # CSS custom
19
+ st.markdown("""
20
+ <style>
21
+ body {
22
+ background-color: #ffffff;
23
+ color: #333333;
24
+ }
25
+ .main {
26
+ background-color: #ffffff;
27
+ color: #333333;
28
+ }
29
+ h1, h2, h3 {
30
+ color: #007acc;
31
+ text-align: center;
32
+ }
33
+ .stButton>button {
34
+ background-color: #007acc;
35
+ color: white;
36
+ border-radius: 10px;
37
+ padding: 0.75em 2em;
38
+ font-size: 1.1em;
39
+ border: none;
40
+ transition: 0.3s;
41
+ }
42
+ .stButton>button:hover {
43
+ background-color: #005f99;
44
+ color: white;
45
+ }
46
+ .stFileUploader {
47
+ border: 2px dashed #007acc;
48
+ padding: 20px;
49
+ border-radius: 10px;
50
+ text-align: center;
51
+ background-color: #f9f9f9;
52
+ }
53
+ div[data-baseweb="select"] > div {
54
+ background-color: white !important;
55
+ color: black !important;
56
+ border-radius: 8px;
57
+ }
58
+ /* Thu hẹp khoảng cách giữa label và selectbox */
59
+ .stSelectbox label {
60
+ margin-bottom: 0.2rem;
61
+ font-weight: bold;
62
+ color: #333333;
63
+ }
64
+ footer {visibility: hidden;}
65
+ </style>
66
+ """, unsafe_allow_html=True)
67
 
68
+ # Upload file section
69
+ with st.container():
70
+ st.markdown("### 📂 Chọn file để dịch")
71
+ uploaded_file = st.file_uploader("Kéo thả hoặc chọn file", type=['pptx', 'xlsx', 'csv', 'docx'])
72
+
73
+ with st.container():
74
+ col1, col2 = st.columns(2)
75
+
76
+ with col1:
77
+ st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ của tài liệu</p>', unsafe_allow_html=True)
78
+ source_lang = st.selectbox(" ", ["english", "vietnamese"], key="source_lang")
79
+
80
+ with col2:
81
+ st.markdown('<p style="font-size:16px; font-weight:bold; margin-bottom:4px;">🌐 Ngôn ngữ muốn dịch sang</p>', unsafe_allow_html=True)
82
+ target_lang = st.selectbox(" ", ["english", "vietnamese"], key="target_lang")
83
 
84
  def process_file(file, file_type):
85
  progress_bar = st.progress(0)
86
+ with st.spinner("🔄 Đang lưu file lên hệ thống..."):
87
+ file_id, file_name = save_file_to_mongodb(uploaded_file=file, db_name=file_type.lower(), collection_name="root_file")
88
+ progress_bar.progress(20)
89
+ st.write(f"📂 **File ID:** `{file_id}`")
90
+
91
+ with st.spinner("🔍 Đang xử dịch tài liệu..."):
92
+ if file_type == "PPTX":
93
+ final_id = translate_pptx(file_id, file_name, source_lang=source_lang, target_lang=target_lang, slides_per_batch=5)
94
+ progress_bar.progress(60)
95
+ elif file_type == "Excel":
96
+ final_id = translate_xlsx(file_id=file_id, file_name=file_name, source_lang=source_lang, target_lang=target_lang)
97
+ elif file_type == "CSV":
98
+ final_id = translate_csv(file_id=file_id, source_lang=source_lang, target_lang=target_lang)
99
+ elif file_type == "Word":
100
+ final_id = translate_docx(file_id=file_id, file_name=file_name, source_lang=source_lang, target_lang=target_lang)
101
+ else:
102
+ st.error("❌ Loại file không hỗ trợ!")
103
+ return
104
 
105
  progress_bar.progress(80)
106
+
107
+ with st.spinner("📦 Đang tải file đã dịch..."):
108
+ file_io, file_name = fetch_file_from_mongodb(file_type.lower(), "final_file", final_id)
109
+ progress_bar.progress(100)
110
 
111
  if file_io:
112
+ st.success("🎉 File đã được dịch thành công!")
113
  st.download_button("⬇️ Tải file về", data=file_io.getvalue(), file_name=file_name)
114
  else:
115
  st.error("❌ Không thể tải xuống file. Vui lòng thử lại!")
116
 
117
  if uploaded_file and st.button("🚀 Upload và dịch ngay!"):
118
+ with st.spinner("🔎 Đang phát hiện loại file..."):
119
+ file_type = detect_file_type(uploaded_file)
120
+ st.write(f"🔍 **Loại file phát hiện:** `{file_type}`")
121
  process_file(uploaded_file, file_type)
test.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
word/word_helper.py CHANGED
@@ -200,6 +200,8 @@ def brute_force_fix(batch, translated_batch):
200
  return translated_batch
201
 
202
  def batch_translate_loop(batch, source_lang, target_lang):
 
 
203
  translated_batch_response = batch_translate(batch, source_lang, target_lang)
204
  try:
205
  translated_batch = response_to_dict(translated_batch_response)
@@ -214,13 +216,12 @@ def batch_translate_loop(batch, source_lang, target_lang):
214
  break
215
  except:
216
  pass
217
- try:
218
- translated_batch = fix_translate(batch, translated_batch_response.text.strip().strip("json```").strip("```").strip().strip("\""))
 
219
  except:
220
- try:
221
- translated_batch = response_to_dict(translated_batch_response)
222
- except:
223
- raise ValueError("The translated batch is not a list.")
224
  if len(translated_batch) != len(batch):
225
  print("Length mismatch after translation. Brute Force Fixing...")
226
  translated_batch = brute_force_fix(batch, translated_batch)
@@ -229,33 +230,35 @@ def batch_translate_loop(batch, source_lang, target_lang):
229
  print(len(batch), len(translated_batch))
230
  return translated_batch
231
 
232
- def full_translate(texts, source_lang = 'English', target_lang="Vietnamese"):
233
- full_translated_texts = []
234
  batch = []
235
  word_count = 0
236
- global time_spent_sleeping
237
 
238
  for string in texts:
239
- if len(string.split()) + word_count >= 2000:
240
- print('Translating a batch.')
241
-
242
- translated_batch = batch_translate_loop(batch, source_lang, target_lang)
243
- full_translated_texts += translated_batch
244
-
245
- time.sleep(3)
246
- time_spent_sleeping += 3
247
  batch = []
248
  word_count = 0
249
  batch.append(string)
250
  word_count += len(string)
251
-
252
- print('Translating a batch.')
253
- if len(batch) == 0:
254
- return full_translated_texts
255
-
256
- translated_batch = batch_translate_loop(batch, source_lang, target_lang)
257
- full_translated_texts += translated_batch
258
-
 
 
 
 
 
 
 
 
 
259
  return full_translated_texts
260
 
261
  def merge_runs(runs):
@@ -285,7 +288,7 @@ def translate_header_footer(doc, source_lang, target_lang):
285
  for footer in section.footer.paragraphs:
286
  for run in footer.runs:
287
  head_foot.append(run.text)
288
- translated_head_foot = full_translate(head_foot, source_lang, target_lang)
289
 
290
  i = 0
291
  for section in doc.sections:
 
200
  return translated_batch
201
 
202
  def batch_translate_loop(batch, source_lang, target_lang):
203
+ if not batch:
204
+ return batch
205
  translated_batch_response = batch_translate(batch, source_lang, target_lang)
206
  try:
207
  translated_batch = response_to_dict(translated_batch_response)
 
216
  break
217
  except:
218
  pass
219
+
220
+ try:
221
+ translated_batch = response_to_dict(translated_batch_response)
222
  except:
223
+ raise ValueError("The translated batch is not a list.")
224
+
 
 
225
  if len(translated_batch) != len(batch):
226
  print("Length mismatch after translation. Brute Force Fixing...")
227
  translated_batch = brute_force_fix(batch, translated_batch)
 
230
  print(len(batch), len(translated_batch))
231
  return translated_batch
232
 
233
+ def get_batches(texts, limit = 1000):
234
+ batches = []
235
  batch = []
236
  word_count = 0
 
237
 
238
  for string in texts:
239
+ if len(string.split()) + word_count >= limit:
240
+ batches.append(batch)
 
 
 
 
 
 
241
  batch = []
242
  word_count = 0
243
  batch.append(string)
244
  word_count += len(string)
245
+
246
+ batches.append(batch)
247
+ return batches
248
+
249
+ def full_translate(texts, source_lang = 'English', target_lang="Vietnamese"):
250
+ full_translated_texts = []
251
+ batches = get_batches(texts, limit = 1000)
252
+ word_count = 0
253
+ global time_spent_sleeping
254
+
255
+ for batch in batches:
256
+ translated_batch = batch_translate_loop(batch, source_lang, target_lang)
257
+ full_translated_texts += translated_batch
258
+
259
+ time.sleep(3)
260
+ time_spent_sleeping += 3
261
+
262
  return full_translated_texts
263
 
264
  def merge_runs(runs):
 
288
  for footer in section.footer.paragraphs:
289
  for run in footer.runs:
290
  head_foot.append(run.text)
291
+ translated_head_foot = batch_translate_loop(head_foot, source_lang, target_lang)
292
 
293
  i = 0
294
  for section in doc.sections:
word_helper.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import docx
3
+ from docx import Document
4
+ import google.generativeai as genai
5
+ import ast
6
+ import json
7
+ import re
8
+ import time
9
+
10
# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead (the same GEMINI_API_KEY variable that
# pages/upload.py uses). Any key that was previously committed in this file
# should be considered leaked and be revoked/rotated.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Module-level counters updated by the translation helpers below.
time_spent_sleeping = 0  # total seconds spent in rate-limit sleeps
mismatches = 0  # number of batches that needed a brute-force length fix
15
+
16
def batch_translate(texts, source_lang = 'English', target_lang="Vietnamese"):
    """Translate several text segments with a single Gemini API call.

    Args:
        texts: List of strings to translate. Sent to the model as a JSON
            object keyed by each string's position in the list.
        source_lang: Name of the language the texts are written in.
        target_lang: Name of the language to translate into.

    Returns:
        ``texts`` itself when it is empty; otherwise the raw response object
        returned by ``generate_content``. The response is not parsed here;
        callers pass it to ``response_to_dict``.
    """
    if not texts:
        return texts  # Skip if empty

    system_prompt = """
    Translate the contents of a JSON file from the specified source language to the specified target language while preserving the structure, spaces, and context of the original text.

    Instructions:
    1. You will be given three inputs: source language, target language, and a JSON file.
    2. The JSON file contains a Python dictionary where each key is an integer, and each value is a string.
    3. Ensure one-to-one correspondence—each input item must correspond to exactly one output item with the same number of items.
    4. The names of people, places, and organizations should be preserved in the translation.
    5. Preserve spaces before or after strings. Do not remove, merge, split, or omit any strings.
    6. Translate paragraphs and ensure the translation makes sense when text is put together.
    7. Translate split words so that the word is not split in the translation.
    8. Return a JSON object that is a Python dictionary containing as many items as the original JSON file, with keys and order preserved.
    9. The output must be a syntactically correct Python dictionary.

    Additional Examples:
    **Input 1**:
    - Source language: English
    - Target language: Vietnamese
    - JSON file:
    ```json
    {"0": "My name is ", "1": "Huy", "2": ".", "3": " Today is ", "4": "a ", "5": "good day", "6": ".", "7": ""}
    ```
    **Output 1**:
    ```json
    {"0": "Tên tôi là ", "1": "Huy", "2": ".", "3": " Hôm nay là ", "4": "một ", "5": "ngày đẹp", "6": ".", "7": ""}
    ```

    **Input 2**:
    - Source language: English
    - Target language: Spanish
    - JSON file:
    ```json
    {"0": "The sky is ", "1": "blue", "2": ".", "3": " Water is ", "4": "essential", "5": " for ", "6": "life", "7": "."}
    ```
    **Output 2**:
    ```json
    {"0": "El cielo es ", "1": "azul", "2": ".", "3": " El agua es ", "4": "esencial", "5": " para ", "6": "la vida", "7": "."}
    ```

    **Input 3**:
    - Source language: English
    - Target language: French
    - JSON file:
    ```json
    {"0": "The quick brown ", "1": "fox ", "2": "jumps ", "3": "over ", "4": "the ", "5": "lazy ", "6": "dog", "7": "."}
    ```
    **Output 3**:
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux", "7": "."}
    ```

    Perform the translation and return the result as specified above. Do not include any additional text other than the translated JSON object.
    """
    # ensure_ascii=False keeps non-ASCII source text (e.g. Vietnamese) readable
    # in the prompt instead of turning every such character into \uXXXX.
    json_data = json.dumps({i: t for i, t in enumerate(texts)}, ensure_ascii=False)
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON file: {json_data}"

    model = genai.GenerativeModel('gemini-2.0-flash')
    # NOTE(review): with top_k=1 only the single most likely token can be
    # sampled, so the temperature setting presumably has no effect -- confirm
    # whether greedy decoding is what is intended here.
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 1,
            'top_p': 1,
            'top_k': 1,
        },
    )
    return response
87
+
88
def response_to_dict(response):
    """Parse a model response into the list of its translated strings.

    The response text is expected to be a JSON / Python dictionary, optionally
    wrapped in a Markdown code fence (```json ... ```). The dictionary's
    values are returned in order.

    Args:
        response: Object with a ``text`` attribute holding the model output.

    Returns:
        list: The dictionary's values, in the order they appear.

    Raises:
        ValueError, SyntaxError: If the text is neither valid JSON nor a valid
            Python literal.
        AttributeError: If the parsed value has no ``values()`` (not a dict).
    """
    text = response.text.strip()
    # Remove a leading/trailing code fence explicitly. str.strip("json```")
    # would strip any run of the characters j, s, o, n and ` instead.
    text = re.sub(r"^```[A-Za-z]*\s*", "", text)
    text = re.sub(r"\s*```$", "", text)
    payload = text.strip().strip("\"")

    try:
        # Proper JSON decoding handles JSON-only escapes (e.g. surrogate pairs).
        parsed = json.loads(payload)
    except ValueError:
        # Fall back to Python literal syntax (e.g. unquoted integer keys).
        parsed = ast.literal_eval(payload)
    return list(parsed.values())
90
+
91
def fix_translate(texts, translated_text):
    """Ask the model to repair a malformed or mis-sized translation response.

    Args:
        texts: The original list of strings that was sent for translation.
        translated_text: The raw text of the model's translation response.

    Returns:
        ``texts`` itself when it is empty; otherwise the list of strings
        parsed from the model's corrected dictionary via ``response_to_dict``.

    Raises:
        Whatever ``response_to_dict`` raises when the corrected response still
        cannot be parsed.
    """
    if not texts:
        return texts  # Skip if empty

    system_prompt = """
    You are given the original JSON dictionary and the translated response text. Your task is to ensure that the translated text is in the correct format and has the same number of items as the original JSON dictionary.

    Steps to follow:
    1. Parse the original and translated JSON dictionaries.
    2. Ensure that the keys in both dictionaries are strings (i.e., "1" instead of 1).
    3. Compare the number of items in both dictionaries.
    4. If the number of items in the translated dictionary is not equal to the number of items in the original dictionary, adjust the translated dictionary by:
        a. Adding missing items with empty strings if there are fewer items.
        b. Merging or splitting items to ensure correspondence with the original items if there are more items.
    5. Ensure that each item in the translated dictionary is in the correct order, with the same key as the original item.
    6. Preserve any leading or trailing spaces in the original strings.
    7. Ensure the output is a syntactically correct Python dictionary, with proper opening and closing braces.
    8. If the translated dictionary is already correct, return it as is.
    9. Return the corrected JSON dictionary in proper Python dictionary format.

    Example Inputs and Outputs:

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "My name is ", "1": "Huy", "2": ".", "3": " Today is ", "4": "a ", "5": "good day", "6": ".", "7": ""}
    ```
    - Translated response text with fewer items:
    ```json
    {"0": "Tên tôi là ", "1": "Huy", "2": ".", "3": "Hôm nay ", "4": "là một ", "5": "ngày đẹp", "6": "."}
    ```

    **Output:**
    ```json
    {"0": "Tên tôi là ", "1": "Huy", "2": ".", "3": "Hôm nay ", "4": "là một ", "5": "ngày đẹp", "6": ".", "7": ""}
    ```

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "The sky is ", "1": "blue", "2": ".", "3": " Water is ", "4": "essential", "5": " for ", "6": "life", "7": "."}
    ```
    - Translated response text with more items:
    ```json
    {"0": "El cielo es ", "1": "azul", "2": ".", "3": " El agua es ", "4": "esencial", "5": " para ", "6": "la", "7": " vida", "8": "."}
    ```

    **Output:**
    ```json
    {"0": "El cielo es ", "1": "azul", "2": ".", "3": " El agua es ", "4": "esencial", "5": " para ", "6": "la vida", "7": "."}
    ```

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "The quick brown ", "1": "fox ", "2": "jumps ", "3": "over ", "4": "the ", "5": "lazy ", "6": "dog", "7": "."}
    ```
    - Translated response text with issues:
    ```json
    {"0": "Le renard ", "1": "brun ", 2: "rapide ", 3: "saute ", 4: "par-dessus ", "5": "le ", "6": "chien ", "7": "paresseux", 8: "."}
    ```

    **Output:**
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux", "7": "."}
    ```

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "The quick brown ", "1": "fox ", "2": "jumps ", "3": "over ", "4": "the ", "5": "lazy ", "6": "dog."}
    ```
    - Translated response text with wrong formatting:
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux".}
    ```

    **Output:**
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux."}
    ```

    Perform the corrections and return the result as a properly formatted Python dictionary.
    """
    # ensure_ascii=False keeps non-ASCII source text readable in the prompt
    # instead of escaping every such character as \uXXXX.
    json_data = json.dumps({i: t for i, t in enumerate(texts)}, ensure_ascii=False)
    user_prompt = f"Original JSON dictionary: {json_data}. Translated response text: {translated_text}"

    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 1,
            'top_p': 1,
            'top_k': 1,
        },
    )
    return response_to_dict(response)
186
+
187
def brute_force_fix(batch, translated_batch):
    """Force a translated batch to have exactly as many items as the source.

    Extra translated items are dropped from the end; missing items are filled
    with empty strings. A new list is returned and ``translated_batch`` is
    left untouched, so callers holding a reference to it see no surprise
    in-place growth.

    Args:
        batch: The original list of source strings.
        translated_batch: The list of translated strings to resize.

    Returns:
        list: A list with ``len(batch)`` items.
    """
    expected = len(batch)
    fixed = list(translated_batch[:expected])
    fixed.extend([""] * (expected - len(fixed)))
    return fixed
193
+
194
def batch_translate_loop(batch, source_lang, target_lang):
    """Translate one batch and make sure the result lines up with the input.

    The batch is translated once. If the response cannot be parsed, or does
    not contain exactly one item per input string, the model is asked up to
    ten times to repair it. If every repair attempt fails, the first parse
    result (when there is one) is padded/truncated to the right length and
    the module-level ``mismatches`` counter is incremented.

    Args:
        batch: List of strings to translate.
        source_lang: Name of the source language.
        target_lang: Name of the target language.

    Returns:
        list: ``batch`` itself when it is empty; otherwise a list of
        translated strings with ``len(batch)`` items.

    Raises:
        ValueError: If the response could neither be parsed nor repaired.
    """
    global mismatches

    if not batch:
        return batch

    translated_batch_response = batch_translate(batch, source_lang, target_lang)

    try:
        translated_batch = response_to_dict(translated_batch_response)
    except Exception:
        # Unparseable model output is an expected failure mode here; it is
        # handled by the repair loop below rather than aborting.
        translated_batch = None

    # Explicit length check instead of `assert`, which is stripped under -O.
    if translated_batch is None or len(translated_batch) != len(batch):
        try:
            # NOTE: str.strip(chars) removes a *set* of characters; kept as in
            # the original because this text is only shown to the model.
            raw_text = translated_batch_response.text.strip().strip("json```").strip("```").strip().strip("\"")
        except Exception as err:
            raise ValueError("The translated batch is not a list.") from err

        for attempt in range(10):
            print(f'Translation response is malformed, requesting a repair (attempt {attempt + 1} of 10):')
            try:
                candidate = fix_translate(batch, raw_text)
            except Exception:
                # A failed repair attempt is simply retried.
                continue
            if len(candidate) == len(batch):
                # Keep the repaired result; do not overwrite it by re-parsing
                # the original (broken) response.
                translated_batch = candidate
                break
        else:
            # Every repair attempt failed: fall back to the first parse, if any.
            if translated_batch is None:
                raise ValueError("The translated batch is not a list.")

    if len(translated_batch) != len(batch):
        print("Length mismatch after translation. Brute Force Fixing...")
        translated_batch = brute_force_fix(batch, translated_batch)
        mismatches += 1
    print(len(batch), len(translated_batch))
    return translated_batch
224
+
225
def get_batches(texts, limit = 1000):
    """Group strings into consecutive batches under a word budget.

    Strings are kept in order and never split. A new batch is started when
    adding the next string would bring the running word count of the current
    batch to ``limit`` or beyond. A single string that is itself at or over
    the limit gets a batch of its own. No empty batches are produced.

    Args:
        texts: Iterable of strings to group.
        limit: Word budget per batch (words are whitespace-separated tokens).

    Returns:
        list[list[str]]: The batches; concatenating them yields ``texts``.
    """
    batches = []
    batch = []
    word_count = 0

    for string in texts:
        n_words = len(string.split())
        # `batch and ...` prevents flushing an empty batch when the very first
        # string already exceeds the budget.
        if batch and word_count + n_words >= limit:
            batches.append(batch)
            batch = []
            word_count = 0
        batch.append(string)
        # Accumulate words (the unit the check above uses), not characters.
        word_count += n_words

    if batch:
        batches.append(batch)
    return batches
240
+
241
def full_translate(texts, source_lang = 'English', target_lang="Vietnamese"):
    """Translate a list of strings batch by batch.

    The texts are split with ``get_batches`` and each non-empty batch is
    translated with ``batch_translate_loop``. After every batch that was
    actually sent for translation the function sleeps for three seconds and
    adds that time to the module-level ``time_spent_sleeping`` counter.

    Args:
        texts: Strings to translate, in document order.
        source_lang: Name of the source language.
        target_lang: Name of the target language.

    Returns:
        list: The translated strings, in the same order as ``texts``.
    """
    global time_spent_sleeping

    full_translated_texts = []
    if not texts:
        # Nothing to translate: no API call, so no pause is needed either.
        return full_translated_texts

    for batch in get_batches(texts, limit = 1000):
        if not batch:
            # An empty batch triggers no API call; do not sleep for it.
            continue
        full_translated_texts += batch_translate_loop(batch, source_lang, target_lang)

        # Pause between API calls. NOTE(review): presumably for rate limiting.
        time.sleep(3)
        time_spent_sleeping += 3

    return full_translated_texts
255
+
256
def merge_runs(runs):
    """Collapse neighbouring runs that share style and character formatting.

    Walks ``runs`` in order. When an item and the most recently kept item are
    both ``docx.text.run.Run`` objects with equal style, bold, italic,
    underline, font size, font colour and font name, the item's text is
    appended to the kept run; otherwise the item is kept as a new entry.

    Args:
        runs: Sequence of run-like objects.

    Returns:
        list: The kept items, in order.
    """
    def can_join(kept, incoming):
        # Same checks, in the same order, as a single chained condition.
        return (
            isinstance(incoming, docx.text.run.Run)
            and isinstance(kept, docx.text.run.Run)
            and incoming.style == kept.style
            and kept.bold == incoming.bold
            and kept.italic == incoming.italic
            and kept.underline == incoming.underline
            and kept.font.size == incoming.font.size
            and kept.font.color.rgb == incoming.font.color.rgb
            and kept.font.name == incoming.font.name
        )

    collected = []
    for current in runs:
        if collected and can_join(collected[-1], current):
            collected[-1].text += current.text
            continue
        collected.append(current)
    return collected
272
+
273
# Clark-notation prefix of the WordprocessingML main namespace.
NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
def translate_header_footer(doc, source_lang, target_lang):
    """Translate the text of every header and footer run in place.

    For each section, the runs of the header paragraphs are visited first and
    then the runs of the footer paragraphs. All run texts are translated in a
    single ``batch_translate_loop`` call and written back to the same runs.

    Args:
        doc: Document whose ``sections`` expose ``header`` / ``footer``
            objects with ``paragraphs`` made of ``runs``.
        source_lang: Name of the source language.
        target_lang: Name of the target language.

    Returns:
        None. The document is modified in place.
    """
    # Collect the run objects once so that reading and writing are guaranteed
    # to walk the same runs in the same order (no manual index bookkeeping).
    runs = [
        run
        for section in doc.sections
        for part in (section.header, section.footer)
        for paragraph in part.paragraphs
        for run in paragraph.runs
    ]
    if not runs:
        return

    translated_head_foot = batch_translate_loop(
        [run.text for run in runs], source_lang, target_lang
    )
    for run, translated_text in zip(runs, translated_head_foot):
        run.text = translated_text
295
+
296
# A single character outside the Basic Multilingual Plane; such characters
# (emoji among them) are kept out of the translation and left in place.
_EMOJI_CHAR = re.compile(r'[\U00010000-\U0010FFFF]')
# Same character class with a capturing group, so re.split keeps the matches.
_EMOJI_SPLIT = re.compile(r'([\U00010000-\U0010FFFF])')


def _is_text_element(element):
    """Return True when the element's local tag name is exactly ``t``.

    A plain ``tag.endswith('t')`` also matches elements such as ``instrText``
    (field codes) or ``delText`` whose content must not be translated.
    """
    tag = element.tag
    return isinstance(tag, str) and tag.rpartition('}')[2] == 't'


def _iter_text_elements(container):
    """Yield every non-empty text element under the container's paragraphs."""
    for para in container.paragraphs:
        for element in para._element.iter():
            if _is_text_element(element) and element.text:
                yield element


def _is_translatable(part):
    """Return True for a non-blank piece of text that is not an emoji."""
    return not _EMOJI_CHAR.match(part) and len(part.strip()) != 0


def _iter_unique_cells(doc):
    """Yield each table cell once, in row order.

    NOTE(review): python-docx is understood to list a merged cell once per
    grid column/row it spans; visiting it repeatedly would extract and
    re-insert the same text several times -- confirm against the library.
    """
    seen = {}
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                identity = getattr(cell, "_tc", cell)
                if id(identity) in seen:
                    continue
                # Keep a reference so the id cannot be reused while iterating.
                seen[id(identity)] = identity
                yield cell


def get_text_elements_para(doc):
    """Collect the translatable text pieces of a container's paragraphs.

    Each text element is split around emoji characters; emoji and
    whitespace-only pieces are skipped.

    Args:
        doc: Object exposing ``paragraphs`` (a document or a table cell).

    Returns:
        list[str]: The pieces, in document order. ``translate_paragraphs``
        consumes translations in exactly this order.
    """
    return [
        part
        for element in _iter_text_elements(doc)
        for part in _EMOJI_SPLIT.split(element.text)
        if _is_translatable(part)
    ]


def get_text_elements_table(doc):
    """Collect the translatable text pieces of every table cell.

    Args:
        doc: Object exposing ``tables``.

    Returns:
        list[str]: The pieces of all cells, each cell visited once.
    """
    table_texts = []
    for cell in _iter_unique_cells(doc):
        table_texts += get_text_elements_para(cell)
    return table_texts


def translate_paragraphs(doc, translated_texts, i = 0):
    """Write translated pieces back into a container's paragraphs.

    Walks the text elements exactly like ``get_text_elements_para`` and
    replaces every translatable piece with the next entry of
    ``translated_texts``; emoji and whitespace-only pieces stay as they are.

    Args:
        doc: Object exposing ``paragraphs`` (a document or a table cell).
        translated_texts: Translations in the order the pieces were collected.
        i: Index of the first translation to use.

    Returns:
        tuple: ``(doc, i)`` where ``i`` is the index of the next unused
        translation.

    Raises:
        IndexError: If ``translated_texts`` has fewer entries than needed.
    """
    for element in _iter_text_elements(doc):
        parts = _EMOJI_SPLIT.split(element.text)
        for j, part in enumerate(parts):
            if _is_translatable(part):
                parts[j] = translated_texts[i]
                i += 1
        element.text = "".join(parts)
    return doc, i


def translate_tables(doc, translated_texts):
    """Write translated pieces back into every table cell.

    Args:
        doc: Object exposing ``tables``.
        translated_texts: Translations in the order produced by
            ``get_text_elements_table``.

    Returns:
        The same ``doc`` object.
    """
    i = 0
    for cell in _iter_unique_cells(doc):
        _, i = translate_paragraphs(cell, translated_texts, i)
    return doc
346
+
347
def is_same_formatting(text1, text2):
    """Tell whether two runs carry the same character formatting.

    Compares, in this order and stopping at the first difference: bold,
    italic, underline, font size, font colour (RGB) and font name.

    Args:
        text1: First run-like object.
        text2: Second run-like object.

    Returns:
        bool: True when all six properties are equal.
    """
    probes = (
        lambda run: run.bold,
        lambda run: run.italic,
        lambda run: run.underline,
        lambda run: run.font.size,
        lambda run: run.font.color.rgb,
        lambda run: run.font.name,
    )
    # all() over a generator short-circuits just like a chain of `and`.
    return all(probe(text1) == probe(text2) for probe in probes)
357
+
358
def _is_plain_text_run(run):
    """Return True when a run holds nothing but text, tabs and properties.

    NOTE(review): assigning ``Run.text`` is understood to replace the run's
    whole content, which would drop drawings and typed breaks -- confirm
    against the python-docx documentation. Runs with such content are
    therefore never merged.
    """
    # lastRenderedPageBreak is presumably only a cached layout hint.
    allowed = {"rPr", "t", "tab", "lastRenderedPageBreak"}
    return all(
        isinstance(child.tag, str) and child.tag.rpartition('}')[2] in allowed
        for child in run._element
    )


def merge_elements(doc):
    """Merge adjacent, identically formatted runs inside each paragraph.

    Within a paragraph, the text of a run is moved into the first run of its
    group when both have the same formatting (see ``is_same_formatting``); the
    emptied run is left in place with empty text. A group ends at any inner
    content that is not a run (e.g. a hyperlink) and at any run that carries
    non-text content, so text is never moved across such items.

    Args:
        doc: Object exposing ``paragraphs`` whose items provide
            ``iter_inner_content()``.

    Returns:
        The same ``doc`` object.
    """
    for para in doc.paragraphs:
        anchor = None  # first run of the current merge group
        for element in para.iter_inner_content():
            if not isinstance(element, docx.text.run.Run) or not _is_plain_text_run(element):
                # Barrier: merging across it would reorder or destroy content.
                anchor = None
                continue
            if anchor is not None and is_same_formatting(anchor, element):
                anchor.text += element.text
                element.text = ""
            else:
                anchor = element
    return doc
371
+
372
def translate_docx(input_file, source_lang = "English", target_lang="Vietnamese", output_num = ''):
    """Translate a Word document using batch processing and save a copy.

    Steps: merge identically formatted runs, translate body paragraphs,
    translate table cells, write both back, translate headers and footers,
    then save next to the input file as
    ``{output_num}{target_lang}_translated_{original file name}``.

    Args:
        input_file: Path of the ``.docx`` file to translate.
        source_lang: Name of the source language.
        target_lang: Name of the target language.
        output_num: Optional prefix for the output file name.

    Returns:
        str: Path of the saved, translated document (previously the path was
        computed but not handed back to the caller).
    """
    doc = Document(input_file)
    output_name = f"{output_num}{target_lang}_translated_{os.path.basename(input_file)}"
    output_file = os.path.join(os.path.dirname(input_file), output_name)

    doc = merge_elements(doc)

    # All text is extracted before anything is written back, so extraction
    # always sees the original (untranslated) document.
    print('Translating paragraphs.')
    para_texts = get_text_elements_para(doc)
    translated_para = full_translate(para_texts, source_lang = source_lang, target_lang = target_lang)
    print('Done translating pararaphs.')

    print('Translating tables.')
    table_texts = get_text_elements_table(doc)
    translated_tables = full_translate(table_texts, source_lang = source_lang, target_lang = target_lang)
    print('Done translating tables.')

    print('Inserting paragaphs')
    doc, _ = translate_paragraphs(doc, translated_para)
    print('Inserting tables.')
    doc = translate_tables(doc, translated_tables)

    translate_header_footer(doc, source_lang, target_lang)
    print('Done translating headers & footers.')

    doc.save(output_file)
    print(f"Translation complete! Saved as {output_file}")
    return output_file