mintlee committed on
Commit
70517a9
·
1 Parent(s): 804add3
Files changed (4) hide show
  1. pages/upload.py +1 -1
  2. test.ipynb +7 -7
  3. word/word_translate.ipynb +580 -0
  4. word/word_translate.py +280 -82
pages/upload.py CHANGED
@@ -26,7 +26,7 @@ def process_file(file, file_type):
26
  st.write(f"📂 File ID: {file_id}")
27
 
28
  if file_type == "PPTX":
29
- final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
30
  progress_bar.progress(60)
31
  elif file_type == "Excel":
32
  final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
 
26
  st.write(f"📂 File ID: {file_id}")
27
 
28
  if file_type == "PPTX":
29
+ final_id = translate_pptx(file_id, file_name, source_lang = source_lang, target_lang = target_lang, slides_per_batch=5)
30
  progress_bar.progress(60)
31
  elif file_type == "Excel":
32
  final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
test.ipynb CHANGED
@@ -30,7 +30,7 @@
30
  },
31
  {
32
  "cell_type": "code",
33
- "execution_count": 2,
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
@@ -235,10 +235,10 @@
235
  "name": "stdout",
236
  "output_type": "stream",
237
  "text": [
238
- "✅ Đã xóa 14 file trong collection 'root_file'\n",
239
- "✅ Đã xóa 1 file trong collection 'final_pptx'\n",
240
- "✅ Đã xóa 12 file trong collection 'original_xml'\n",
241
- "✅ Đã xóa 8 file trong collection 'final_xml'\n"
242
  ]
243
  }
244
  ],
@@ -668,7 +668,7 @@
668
  ],
669
  "metadata": {
670
  "kernelspec": {
671
- "display_name": "base",
672
  "language": "python",
673
  "name": "python3"
674
  },
@@ -682,7 +682,7 @@
682
  "name": "python",
683
  "nbconvert_exporter": "python",
684
  "pygments_lexer": "ipython3",
685
- "version": "3.12.9"
686
  }
687
  },
688
  "nbformat": 4,
 
30
  },
31
  {
32
  "cell_type": "code",
33
+ "execution_count": 3,
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
 
235
  "name": "stdout",
236
  "output_type": "stream",
237
  "text": [
238
+ "✅ Đã xóa 4 file trong collection 'root_file'\n",
239
+ "✅ Đã xóa 0 file trong collection 'final_pptx'\n",
240
+ "✅ Đã xóa 1 file trong collection 'original_xml'\n",
241
+ "✅ Đã xóa 1 file trong collection 'final_xml'\n"
242
  ]
243
  }
244
  ],
 
668
  ],
669
  "metadata": {
670
  "kernelspec": {
671
+ "display_name": "machine_translate",
672
  "language": "python",
673
  "name": "python3"
674
  },
 
682
  "name": "python",
683
  "nbconvert_exporter": "python",
684
  "pygments_lexer": "ipython3",
685
+ "version": "3.10.16"
686
  }
687
  },
688
  "nbformat": 4,
word/word_translate.ipynb ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 14,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import docx\n",
11
+ "from docx import Document\n",
12
+ "import google.generativeai as genai\n",
13
+ "import ast\n",
14
+ "import json\n",
15
+ "import re\n",
16
+ "import time\n",
17
+ "\n",
18
+ "genai.configure(api_key=\"AIzaSyAk1LTwWMZyTfPAKmsn6JzFtI1MpnI7FH8\")\n",
19
+ "target_language = 'English' \n",
20
+ "source_language = 'VietNamese'\n",
21
+ "\n",
22
+ "time_spent_sleeping = 0\n",
23
+ "mismatches = 0\n",
24
+ "\n",
25
+ "def batch_translate(texts, source_lang = 'English', target_lang=\"Vietnamese\"):\n",
26
+ " \"\"\" Translates multiple text segments in a single API call. \"\"\"\n",
27
+ " if not texts:\n",
28
+ " return texts # Skip if empty\n",
29
+ " \n",
30
+ " system_prompt = \"\"\"\n",
31
+ " Translate the contents of a JSON file from the specified source language to the specified target language while preserving the structure, spaces, and context of the original text.\n",
32
+ "\n",
33
+ " Instructions:\n",
34
+ " 1. You will be given three inputs: source language, target language, and a JSON file.\n",
35
+ " 2. The JSON file contains a Python dictionary where each key is an integer, and each value is a string.\n",
36
+ " 3. Ensure one-to-one correspondence—each input item must correspond to exactly one output item with the same number of items.\n",
37
+ " 4. Preserve spaces before or after strings. Do not remove, merge, split, or omit any strings.\n",
38
+ " 5. Translate paragraphs and ensure the translation makes sense when text is put together.\n",
39
+ " 6. Translate split words so that the word is not split in the translation.\n",
40
+ " 7. Return a JSON object that is a Python dictionary containing as many items as the original JSON file, with keys and order preserved.\n",
41
+ " 8. The output must be a syntactically correct Python dictionary.\n",
42
+ "\n",
43
+ " Additional Examples:\n",
44
+ " **Input 1**: \n",
45
+ " - Source language: English\n",
46
+ " - Target language: Vietnamese\n",
47
+ " - JSON file: \n",
48
+ " ```json\n",
49
+ " {\"0\": \"My name is \", \"1\": \"Huy\", \"2\": \".\", \"3\": \" Today is \", \"4\": \"a \", \"5\": \"good day\", \"6\": \".\", \"7\": \"\"}\n",
50
+ " ```\n",
51
+ " **Output 1**:\n",
52
+ " ```json\n",
53
+ " {\"0\": \"Tên tôi là \", \"1\": \"Huy\", \"2\": \".\", \"3\": \" Hôm nay là \", \"4\": \"một \", \"5\": \"ngày đẹp\", \"6\": \".\", \"7\": \"\"}\n",
54
+ " ```\n",
55
+ "\n",
56
+ " **Input 2**: \n",
57
+ " - Source language: English\n",
58
+ " - Target language: Spanish\n",
59
+ " - JSON file: \n",
60
+ " ```json\n",
61
+ " {\"0\": \"The sky is \", \"1\": \"blue\", \"2\": \".\", \"3\": \" Water is \", \"4\": \"essential\", \"5\": \" for \", \"6\": \"life\", \"7\": \".\"}\n",
62
+ " ```\n",
63
+ " **Output 2**:\n",
64
+ " ```json\n",
65
+ " {\"0\": \"El cielo es \", \"1\": \"azul\", \"2\": \".\", \"3\": \" El agua es \", \"4\": \"esencial\", \"5\": \" para \", \"6\": \"la vida\", \"7\": \".\"}\n",
66
+ " ```\n",
67
+ "\n",
68
+ " **Input 3**: \n",
69
+ " - Source language: English\n",
70
+ " - Target language: French \n",
71
+ " - JSON file: \n",
72
+ " ```json\n",
73
+ " {\"0\": \"The quick brown \", \"1\": \"fox \", \"2\": \"jumps \", \"3\": \"over \", \"4\": \"the \", \"5\": \"lazy \", \"6\": \"dog\", \"7\": \".\"}\n",
74
+ " ```\n",
75
+ " **Output 3**:\n",
76
+ " ```json\n",
77
+ " {\"0\": \"Le renard brun \", \"1\": \"rapide \", \"2\": \"saute \", \"3\": \"par-dessus \", \"4\": \"le \", \"5\": \"chien \", \"6\": \"paresseux\", \"7\": \".\"}\n",
78
+ " ```\n",
79
+ "\n",
80
+ " Perform the translation and return the result as specified above. Do not include any additional text other than the translated JSON object.\n",
81
+ " \"\"\"\n",
82
+ " json_data = json.dumps({i: t for i, t in enumerate(texts)})\n",
83
+ " user_prompt = f\"Source language: {source_lang}. Target language: {target_lang}. JSON file: {json_data}\" \n",
84
+ " \n",
85
+ " model = genai.GenerativeModel('gemini-2.0-flash')\n",
86
+ " response = model.generate_content(contents = system_prompt.strip() + \"\\n\" + user_prompt.strip(), generation_config={\n",
87
+ " 'temperature': 1, # Adjust temperature for desired creativity\n",
88
+ " 'top_p': 1,\n",
89
+ " 'top_k': 1,})\n",
90
+ " # response_dict = ast.literal_eval(response.text.strip().strip(\"json```\").strip(\"```\").strip().strip(\"\\\"\"))\n",
91
+ " # print(len(texts), len(list(response_dict.values())))\n",
92
+ " # return list(response_dict.values())\n",
93
+ "\n",
94
+ " return response\n",
95
+ "\n",
96
+ "def response_to_dict(response):\n",
97
+ " return list(ast.literal_eval(response.text.strip().strip(\"json```\").strip(\"```\").strip().strip(\"\\\"\")).values())\n",
98
+ "\n",
99
+ "def fix_translate(texts, translated_text):\n",
100
+ " \"\"\" Translates multiple text segments in a single API call. \"\"\"\n",
101
+ " if not texts:\n",
102
+ " return texts # Skip if empty\n",
103
+ " \n",
104
+ " system_prompt = \"\"\"\n",
105
+ " You are given the original JSON dictionary and the translated response text. Your task is to ensure that the translated text is in the correct format and has the same number of items as the original JSON dictionary.\n",
106
+ "\n",
107
+ " Steps to follow:\n",
108
+ " 1. Parse the original and translated JSON dictionaries.\n",
109
+ " 2. Ensure that the keys in both dictionaries are strings (i.e., \"1\" instead of 1).\n",
110
+ " 3. Compare the number of items in both dictionaries.\n",
111
+ " 4. If the number of items in the translated dictionary is not equal to the number of items in the original dictionary, adjust the translated dictionary by:\n",
112
+ " a. Adding missing items with empty strings if there are fewer items.\n",
113
+ " b. Merging or splitting items to ensure correspondence with the original items if there are more items.\n",
114
+ " 5. Ensure that each item in the translated dictionary is in the correct order, with the same key as the original item.\n",
115
+ " 6. Preserve any leading or trailing spaces in the original strings.\n",
116
+ " 7. Ensure the output is a syntactically correct Python dictionary, with proper opening and closing braces.\n",
117
+ " 8. If the translated dictionary is already correct, return it as is.\n",
118
+ " 9. Return the corrected JSON dictionary in proper Python dictionary format.\n",
119
+ "\n",
120
+ " Example Inputs and Outputs:\n",
121
+ "\n",
122
+ " **Input:**\n",
123
+ " - Original JSON dictionary:\n",
124
+ " ```json\n",
125
+ " {\"0\": \"My name is \", \"1\": \"Huy\", \"2\": \".\", \"3\": \" Today is \", \"4\": \"a \", \"5\": \"good day\", \"6\": \".\", \"7\": \"\"}\n",
126
+ " ```\n",
127
+ " - Translated response text with fewer items:\n",
128
+ " ```json\n",
129
+ " {\"0\": \"Tên tôi là \", \"1\": \"Huy\", \"2\": \".\", \"3\": \"Hôm nay \", \"4\": \"là một \", \"5\": \"ngày đẹp\", \"6\": \".\"}\n",
130
+ " ```\n",
131
+ "\n",
132
+ " **Output:**\n",
133
+ " ```json\n",
134
+ " {\"0\": \"Tên tôi là \", \"1\": \"Huy\", \"2\": \".\", \"3\": \"Hôm nay \", \"4\": \"là một \", \"5\": \"ngày đẹp\", \"6\": \".\", \"7\": \"\"}\n",
135
+ " ```\n",
136
+ "\n",
137
+ " **Input:**\n",
138
+ " - Original JSON dictionary:\n",
139
+ " ```json\n",
140
+ " {\"0\": \"The sky is \", \"1\": \"blue\", \"2\": \".\", \"3\": \" Water is \", \"4\": \"essential\", \"5\": \" for \", \"6\": \"life\", \"7\": \".\"}\n",
141
+ " ```\n",
142
+ " - Translated response text with more items:\n",
143
+ " ```json\n",
144
+ " {\"0\": \"El cielo es \", \"1\": \"azul\", \"2\": \".\", \"3\": \" El agua es \", \"4\": \"esencial\", \"5\": \" para \", \"6\": \"la\", \"7\": \" vida\", \"8\": \".\"}\n",
145
+ " ```\n",
146
+ "\n",
147
+ " **Output:**\n",
148
+ " ```json\n",
149
+ " {\"0\": \"El cielo es \", \"1\": \"azul\", \"2\": \".\", \"3\": \" El agua es \", \"4\": \"esencial\", \"5\": \" para \", \"6\": \"la vida\", \"7\": \".\"}\n",
150
+ " ```\n",
151
+ "\n",
152
+ " **Input:**\n",
153
+ " - Original JSON dictionary:\n",
154
+ " ```json\n",
155
+ " {\"0\": \"The quick brown \", \"1\": \"fox \", \"2\": \"jumps \", \"3\": \"over \", \"4\": \"the \", \"5\": \"lazy \", \"6\": \"dog\", \"7\": \".\"}\n",
156
+ " ```\n",
157
+ " - Translated response text with issues:\n",
158
+ " ```json\n",
159
+ " {\"0\": \"Le renard \", \"1\": \"brun \", 2: \"rapide \", 3: \"saute \", 4: \"par-dessus \", \"5\": \"le \", \"6\": \"chien \", \"7\": \"paresseux\", 8: \".\"}\n",
160
+ " ```\n",
161
+ "\n",
162
+ " **Output:**\n",
163
+ " ```json\n",
164
+ " {\"0\": \"Le renard brun \", \"1\": \"rapide \", \"2\": \"saute \", \"3\": \"par-dessus \", \"4\": \"le \", \"5\": \"chien \", \"6\": \"paresseux\", \"7\": \".\"}\n",
165
+ " ```\n",
166
+ "\n",
167
+ " **Input:**\n",
168
+ " - Original JSON dictionary:\n",
169
+ " ```json\n",
170
+ " {\"0\": \"The quick brown \", \"1\": \"fox \", \"2\": \"jumps \", \"3\": \"over \", \"4\": \"the \", \"5\": \"lazy \", \"6\": \"dog.\"}\n",
171
+ " ```\n",
172
+ " - Translated response text with wrong formatting:\n",
173
+ " ```json\n",
174
+ " {\"0\": \"Le renard brun \", \"1\": \"rapide \", \"2\": \"saute \", \"3\": \"par-dessus \", \"4\": \"le \", \"5\": \"chien \", \"6\": \"paresseux\".}\n",
175
+ " ```\n",
176
+ "\n",
177
+ " **Output:**\n",
178
+ " ```json\n",
179
+ " {\"0\": \"Le renard brun \", \"1\": \"rapide \", \"2\": \"saute \", \"3\": \"par-dessus \", \"4\": \"le \", \"5\": \"chien \", \"6\": \"paresseux.\"}\n",
180
+ " ```\n",
181
+ "\n",
182
+ " Perform the corrections and return the result as a properly formatted Python dictionary.\n",
183
+ "\"\"\"\n",
184
+ " json_data = json.dumps({i: t for i, t in enumerate(texts)})\n",
185
+ " user_prompt = f\"Original JSON dictionary: {json_data}. Translated response text: {translated_text}\" \n",
186
+ " \n",
187
+ " model = genai.GenerativeModel('gemini-2.0-flash')\n",
188
+ " response = model.generate_content(contents = system_prompt.strip() + \"\\n\" + user_prompt.strip(), generation_config={\n",
189
+ " 'temperature': 1, # Adjust temperature for desired creativity\n",
190
+ " 'top_p': 1,\n",
191
+ " 'top_k': 1,})\n",
192
+ " return response_to_dict(response)\n",
193
+ " # return response \n",
194
+ "\n",
195
+ "def brute_force_fix(batch, translated_batch):\n",
196
+ " if len(batch) > len(translated_batch):\n",
197
+ " translated_batch += [\"\"] * (len(batch) - len(translated_batch))\n",
198
+ " elif len(batch) < len(translated_batch):\n",
199
+ " translated_batch = translated_batch[:len(batch)]\n",
200
+ " return translated_batch\n",
201
+ "\n",
202
+ "def batch_translate_loop(batch, source_lang, target_lang):\n",
203
+ " translated_batch_response = batch_translate(batch, source_lang, target_lang)\n",
204
+ " try:\n",
205
+ " translated_batch = response_to_dict(translated_batch_response)\n",
206
+ " assert(len(translated_batch) == len(batch))\n",
207
+ "\n",
208
+ " except:\n",
209
+ " for i in range(10):\n",
210
+ " print(f'I am ChatGPT and I am retarded, retrying translation time {i}:')\n",
211
+ " try: \n",
212
+ " translated_batch = fix_translate(batch, translated_batch_response.text.strip().strip(\"json```\").strip(\"```\").strip().strip(\"\\\"\"))\n",
213
+ " assert(len(translated_batch) == len(batch))\n",
214
+ " break\n",
215
+ " except:\n",
216
+ " pass\n",
217
+ " try: \n",
218
+ " translated_batch = fix_translate(batch, translated_batch_response.text.strip().strip(\"json```\").strip(\"```\").strip().strip(\"\\\"\"))\n",
219
+ " except:\n",
220
+ " try:\n",
221
+ " translated_batch = response_to_dict(translated_batch_response)\n",
222
+ " except:\n",
223
+ " raise ValueError(\"The translated batch is not a list.\")\n",
224
+ " if len(translated_batch) != len(batch):\n",
225
+ " print(\"Length mismatch after translation. Brute Force Fixing...\")\n",
226
+ " translated_batch = brute_force_fix(batch, translated_batch)\n",
227
+ " global mismatches\n",
228
+ " mismatches += 1\n",
229
+ " print(len(batch), len(translated_batch))\n",
230
+ " return translated_batch\n",
231
+ "\n",
232
+ "def full_translate(texts, source_lang = 'English', target_lang=\"Vietnamese\"):\n",
233
+ " full_translated_texts = []\n",
234
+ " batch = []\n",
235
+ " word_count = 0\n",
236
+ " global time_spent_sleeping\n",
237
+ "\n",
238
+ " for string in texts:\n",
239
+ " if len(string.split()) + word_count >= 2000:\n",
240
+ " print('Translating a batch.')\n",
241
+ "\n",
242
+ " translated_batch = batch_translate_loop(batch, source_lang, target_lang)\n",
243
+ " full_translated_texts += translated_batch\n",
244
+ " \n",
245
+ " time.sleep(1)\n",
246
+ " time_spent_sleeping += 1\n",
247
+ " batch = []\n",
248
+ " word_count = 0\n",
249
+ " batch.append(string)\n",
250
+ " word_count += len(string)\n",
251
+ " \n",
252
+ " print('Translating a batch.')\n",
253
+ " if len(batch) == 0:\n",
254
+ " return full_translated_texts\n",
255
+ " \n",
256
+ " translated_batch = batch_translate_loop(batch, source_lang, target_lang)\n",
257
+ " full_translated_texts += translated_batch\n",
258
+ " \n",
259
+ " return full_translated_texts\n",
260
+ "\n",
261
+ "def merge_runs(runs):\n",
262
+ " \"\"\" Merges adjacent runs with the same style. \"\"\"\n",
263
+ " merged_runs = []\n",
264
+ " for run in runs:\n",
265
+ " if (merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run) and \n",
266
+ " run.style == merged_runs[-1].style and \n",
267
+ " merged_runs[-1].bold == run.bold and\n",
268
+ " merged_runs[-1].italic == run.italic and\n",
269
+ " merged_runs[-1].underline == run.underline and \n",
270
+ " merged_runs[-1].font.size == run.font.size and\n",
271
+ " merged_runs[-1].font.color.rgb == run.font.color.rgb and\n",
272
+ " merged_runs[-1].font.name == run.font.name):\n",
273
+ " merged_runs[-1].text += run.text\n",
274
+ " else:\n",
275
+ " merged_runs.append(run)\n",
276
+ " return merged_runs\n",
277
+ "\n",
278
+ "NS_W = \"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}\"\n",
279
+ "def translate_header_footer(doc, source_lang, target_lang):\n",
280
+ " head_foot = []\n",
281
+ " for section in doc.sections:\n",
282
+ " for header in section.header.paragraphs:\n",
283
+ " for run in header.runs:\n",
284
+ " head_foot.append(run.text) \n",
285
+ " for footer in section.footer.paragraphs:\n",
286
+ " for run in footer.runs:\n",
287
+ " head_foot.append(run.text) \n",
288
+ " translated_head_foot = full_translate(head_foot, source_lang, target_lang)\n",
289
+ "\n",
290
+ " i = 0\n",
291
+ " for section in doc.sections:\n",
292
+ " for header in section.header.paragraphs:\n",
293
+ " for run in header.runs:\n",
294
+ " run.text = translated_head_foot[i]\n",
295
+ " i += 1\n",
296
+ " for footer in section.footer.paragraphs:\n",
297
+ " for run in footer.runs:\n",
298
+ " run.text = translated_head_foot[i]\n",
299
+ " i += 1 \n",
300
+ "\n",
301
+ "def get_text_elements_para(doc):\n",
302
+ " para_texts = []\n",
303
+ " for para in doc.paragraphs:\n",
304
+ " for element in para._element.iter():\n",
305
+ " if element.tag.endswith('t'):\n",
306
+ " if element.text:\n",
307
+ " emoji_pattern = r'[\\U00010000-\\U0010FFFF]' \n",
308
+ " # Split the text but keep emojis as separate elements\n",
309
+ " parts = re.split(f'({emoji_pattern})', element.text)\n",
310
+ " for part in parts:\n",
311
+ " if re.match(emoji_pattern, part):\n",
312
+ " continue\n",
313
+ " if len(part.strip()) != 0:\n",
314
+ " para_texts.append(part)\n",
315
+ "\n",
316
+ " return para_texts\n",
317
+ "\n",
318
+ "def get_text_elements_table(doc):\n",
319
+ " table_texts = []\n",
320
+ " for table in doc.tables:\n",
321
+ " for row in table.rows:\n",
322
+ " for cell in row.cells:\n",
323
+ " table_texts += get_text_elements_para(cell)\n",
324
+ " return table_texts\n",
325
+ "\n",
326
+ "def translate_paragraphs(doc, translated_texts, i = 0):\n",
327
+ " for para in doc.paragraphs:\n",
328
+ " for element in para._element.iter():\n",
329
+ " if element.tag.endswith('t'):\n",
330
+ " if element.text:\n",
331
+ " emoji_pattern = r'[\\U00010000-\\U0010FFFF]' \n",
332
+ " # Split the text but keep emojis as separate elements\n",
333
+ " parts = re.split(f'({emoji_pattern})', element.text)\n",
334
+ " for j in range(len(parts)):\n",
335
+ " if re.match(emoji_pattern, parts[j]):\n",
336
+ " continue\n",
337
+ " if len(parts[j].strip()) != 0: \n",
338
+ " translated_text = translated_texts[i]\n",
339
+ " i += 1\n",
340
+ " parts[j] = translated_text\n",
341
+ " element.text = \"\".join(parts) \n",
342
+ " return doc, i\n",
343
+ "\n",
344
+ "def translate_tables(doc, translated_texts):\n",
345
+ " i = 0\n",
346
+ " for table in doc.tables:\n",
347
+ " for row in table.rows:\n",
348
+ " for cell in row.cells:\n",
349
+ " cell, i = translate_paragraphs(cell, translated_texts, i)\n",
350
+ " return doc\n",
351
+ "\n",
352
+ "def is_same_formatting(text1, text2):\n",
353
+ " \"\"\"\n",
354
+ " Check if two texts have the same formatting.\n",
355
+ " \"\"\"\n",
356
+ " return (text1.bold == text2.bold \\\n",
357
+ " and text1.italic == text2.italic \\\n",
358
+ " and text1.underline == text2.underline \\\n",
359
+ " and text1.font.size == text2.font.size \\\n",
360
+ " and text1.font.color.rgb == text2.font.color.rgb \\\n",
361
+ " and text1.font.name == text2.font.name)\n",
362
+ " \n",
363
+ "def merge_elements(doc):\n",
364
+ " for para in doc.paragraphs:\n",
365
+ " current_run = []\n",
366
+ " for element in para.iter_inner_content():\n",
367
+ " if isinstance(element, docx.text.run.Run):\n",
368
+ " if current_run == []:\n",
369
+ " current_run = [element]\n",
370
+ " elif is_same_formatting(current_run[0], element):\n",
371
+ " current_run[0].text += element.text\n",
372
+ " element.text = \"\"\n",
373
+ " else:\n",
374
+ " current_run = [element]\n",
375
+ " return doc\n",
376
+ "\n",
377
+ "def translate_docx(input_file, source_lang = \"English\", target_lang=\"Vietnamese\", output_num = ''):\n",
378
+ " \"\"\" Translates a Word document efficiently using batch processing. \"\"\"\n",
379
+ " doc = Document(input_file)\n",
380
+ " output_file = os.path.join(os.path.dirname(input_file), f\"{output_num}{target_language}_translated_{os.path.basename(input_file)}\")\n",
381
+ "\n",
382
+ " doc = merge_elements(doc)\n",
383
+ " \n",
384
+ " print('Translating paragraphs.')\n",
385
+ " para_texts = get_text_elements_para(doc)\n",
386
+ " translated_para = full_translate(para_texts, source_lang = source_lang, target_lang = target_lang)\n",
387
+ " print('Done translating pararaphs.')\n",
388
+ "\n",
389
+ " print('Translating tables.')\n",
390
+ " table_texts = get_text_elements_table(doc)\n",
391
+ " translated_tables = full_translate(table_texts, source_lang = source_lang, target_lang = target_lang)\n",
392
+ " print('Done translating tables.')\n",
393
+ "\n",
394
+ " print('Inserting paragaphs')\n",
395
+ " doc, _ = translate_paragraphs(doc, translated_para)\n",
396
+ " print('Inserting tables.')\n",
397
+ " doc = translate_tables(doc, translated_tables)\n",
398
+ "\n",
399
+ " translate_header_footer(doc, source_lang, target_lang)\n",
400
+ " print('Done translating headers & footers.')\n",
401
+ "\n",
402
+ " doc.save(output_file)\n",
403
+ " print(f\"Translation complete! Saved as {output_file}\")\n",
404
+ "\n",
405
+ " # return para_texts, translated_para\n",
406
+ "\n",
407
+ "\n",
408
+ "# input_file = r\"C:\\Users\\huyvu\\Downloads\\wordnet-an-electronic-lexical-database-language-speech-and-communication.9780262061971.33119-1.docx\"\n",
409
+ "input_file = r\"D:\\Show_me_everything\\Machine Translation\\input\\test1.docx\"\n",
410
+ "# input_file = r\"C:\\Users\\huyvu\\Documents\\Machine Translation\\Machine-Translation\\data\\input\\Data Engineering Practice.docx\""
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 11,
416
+ "metadata": {},
417
+ "outputs": [],
418
+ "source": [
419
+ "input_file = r\"D:\\Show_me_everything\\Machine Translation\\input\\Tổng quan cuộc thi_ Data Unlock.docx\""
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 15,
425
+ "metadata": {},
426
+ "outputs": [
427
+ {
428
+ "name": "stdout",
429
+ "output_type": "stream",
430
+ "text": [
431
+ "Translating paragraphs.\n",
432
+ "Translating a batch.\n",
433
+ "37 37\n",
434
+ "Translating a batch.\n",
435
+ "27 27\n",
436
+ "Translating a batch.\n",
437
+ "28 28\n",
438
+ "Translating a batch.\n",
439
+ "20 20\n",
440
+ "Translating a batch.\n",
441
+ "16 16\n",
442
+ "Translating a batch.\n",
443
+ "23 23\n",
444
+ "Translating a batch.\n",
445
+ "37 37\n",
446
+ "Translating a batch.\n",
447
+ "32 32\n",
448
+ "Translating a batch.\n",
449
+ "28 28\n",
450
+ "Translating a batch.\n",
451
+ "13 13\n",
452
+ "Translating a batch.\n",
453
+ "36 36\n",
454
+ "Translating a batch.\n",
455
+ "23 23\n",
456
+ "Translating a batch.\n",
457
+ "25 25\n",
458
+ "Translating a batch.\n",
459
+ "10 10\n",
460
+ "Translating a batch.\n",
461
+ "14 14\n",
462
+ "Translating a batch.\n",
463
+ "13 13\n",
464
+ "Translating a batch.\n",
465
+ "11 11\n",
466
+ "Translating a batch.\n",
467
+ "18 18\n",
468
+ "Translating a batch.\n",
469
+ "14 14\n",
470
+ "Translating a batch.\n",
471
+ "23 23\n",
472
+ "Translating a batch.\n",
473
+ "24 24\n",
474
+ "Translating a batch.\n",
475
+ "21 21\n",
476
+ "Translating a batch.\n",
477
+ "10 10\n",
478
+ "Translating a batch.\n",
479
+ "23 23\n",
480
+ "Translating a batch.\n",
481
+ "12 12\n",
482
+ "Translating a batch.\n",
483
+ "18 18\n",
484
+ "Translating a batch.\n",
485
+ "17 17\n",
486
+ "Translating a batch.\n",
487
+ "29 29\n",
488
+ "Translating a batch.\n",
489
+ "15 15\n",
490
+ "Translating a batch.\n",
491
+ "21 21\n",
492
+ "Translating a batch.\n",
493
+ "2 2\n",
494
+ "Done translating pararaphs.\n",
495
+ "Translating tables.\n",
496
+ "Translating a batch.\n",
497
+ "21 21\n",
498
+ "Done translating tables.\n",
499
+ "Inserting paragaphs\n",
500
+ "Inserting tables.\n",
501
+ "Translating a batch.\n",
502
+ "Done translating headers & footers.\n",
503
+ "Translation complete! Saved as D:\\Show_me_everything\\Machine Translation\\input\\English_translated_test1.docx\n"
504
+ ]
505
+ }
506
+ ],
507
+ "source": [
508
+ "translate_docx(input_file, source_lang = source_language, target_lang = target_language)"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": 16,
514
+ "metadata": {},
515
+ "outputs": [
516
+ {
517
+ "data": {
518
+ "text/plain": [
519
+ "30"
520
+ ]
521
+ },
522
+ "execution_count": 16,
523
+ "metadata": {},
524
+ "output_type": "execute_result"
525
+ }
526
+ ],
527
+ "source": [
528
+ "time_spent_sleeping"
529
+ ]
530
+ },
531
+ {
532
+ "cell_type": "code",
533
+ "execution_count": 5,
534
+ "metadata": {},
535
+ "outputs": [
536
+ {
537
+ "data": {
538
+ "text/plain": [
539
+ "0"
540
+ ]
541
+ },
542
+ "execution_count": 5,
543
+ "metadata": {},
544
+ "output_type": "execute_result"
545
+ }
546
+ ],
547
+ "source": [
548
+ "mismatches"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "execution_count": null,
554
+ "metadata": {},
555
+ "outputs": [],
556
+ "source": []
557
+ }
558
+ ],
559
+ "metadata": {
560
+ "kernelspec": {
561
+ "display_name": "machine_translate",
562
+ "language": "python",
563
+ "name": "python3"
564
+ },
565
+ "language_info": {
566
+ "codemirror_mode": {
567
+ "name": "ipython",
568
+ "version": 3
569
+ },
570
+ "file_extension": ".py",
571
+ "mimetype": "text/x-python",
572
+ "name": "python",
573
+ "nbconvert_exporter": "python",
574
+ "pygments_lexer": "ipython3",
575
+ "version": "3.10.16"
576
+ }
577
+ },
578
+ "nbformat": 4,
579
+ "nbformat_minor": 2
580
+ }
word/word_translate.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import docx
2
  from docx import Document
3
  import google.generativeai as genai
@@ -5,79 +6,250 @@ import ast
5
  import json
6
  import re
7
  import dotenv
8
- import os
9
- import io
10
-
11
  from pymongo import MongoClient
12
  from gridfs import GridFS
13
  from docx import Document
 
 
14
 
15
  dotenv.load_dotenv(".env")
16
  api_key = os.getenv("GEMINI_API_KEY")
17
- genai.configure(api_key=api_key)
18
- model = genai.GenerativeModel("gemini-2.0-flash")
19
 
20
- def batch_translate(texts, source_lang = "English", target_lang="Vietnamese"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  """ Translates multiple text segments in a single API call. """
22
  if not texts:
23
  return texts # Skip if empty
24
 
25
- system_prompt = """ You are given three inputs: source language, target language and a json file.
26
- - Your task is to translate the JSON file from the source language (you have to detect the source language yourself) to the target language.
27
- - The original JSON file contains a Python array of objects, each with "index" and "text" keys.
28
- - Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
29
- - The array contains text that makes up whole paragraphs. Make sure that the translation makes sense when the text is put together and retains the same context.
30
- - This is very important: Empty spaces should be left as is. For example: From English, "Hello " should be translated into Vietnamese as "Xin chào ", with the same space at the end.
31
- - Very frequently there are spaces before or after a string. Do not remove these spaces.
32
- - If the source language is English and the target language is Vietnamese and a string contains "'s" in the possessive sense, translate it as "của".
33
- - Example: [["WorldQuant's"], ["Mission"]] should be translated as [["Nhiệm vụ"], ["của WorldQuant"]]
34
- - Words might be split into multiple continuous arrays. Translate them such that the translation corresponds to the full word.
35
- - If a word is split up into multiple arrays, the translation should be such that the word is not split up.
36
- - Exampe: ['Tesla sells its pro', 'ducts'] should be translated as ['Tesla bán sản phẩm của mình', ''.]. Note that the number of elements in the output is the same as the input.
37
- - Example: [["Hello"], ["World"]] should be translated as [["Xin chào"], ["Thế giới"]]
38
- - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
39
- - Return a JSON object that is a Python array.
40
- - Each object in the array is a dictionary with two keys: "index" and "text".
41
- - The text should be the translated version of the text in the original object, and the index should stay consistent.
42
- - The number of objects in the output MUST the same as the number of objects in the input.
43
- - The format of the output should look exactly like the example.
44
- - Example:
45
- **Input**: Target language: Vietnamese. JSON file:
46
- [{"index": 0, "text": ["My name is "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 3, "text": ["Today is "]}, {"index": 4, "text": ["a"]}, {"index": 5, "text": ["good day"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]'
47
- **Output**: [{"index": 0, "text": ["Tên tôi là "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 4, "text": ["Hôm nay là "]}, {"index": 3, "text": ["Một"]}, {"index": 5, "text": ["ngày đẹp"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]
48
- - Return the result of translation according to the format. Do NOT return code for translating.
49
- """
50
- json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
51
- user_prompt = f"Source languag: {source_lang}. Target language: {target_lang}. JSON file: {json_data}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  model = genai.GenerativeModel('gemini-2.0-flash')
54
  response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
55
  'temperature': 1, # Adjust temperature for desired creativity
56
  'top_p': 1,
57
  'top_k': 1,})
58
- response_dict = ast.literal_eval(response.text.strip().strip("json```").strip("```").strip())
59
- if len(response_dict) > 0:
60
- if isinstance(response_dict[0]['text'], list):
61
- translated_texts = [i['text'][0] for i in sorted(response_dict, key = lambda x: x['index'])]
62
- elif isinstance(response_dict[0]['text'], str):
63
- translated_texts = [i['text'] for i in sorted(response_dict, key = lambda x: x['index'])]
64
- return translated_texts
65
-
66
- def full_translate(texts, source_lang, target_lang="Vietnamese"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  full_translated_texts = []
68
  batch = []
69
  word_count = 0
 
70
 
71
  for string in texts:
72
  if len(string.split()) + word_count >= 1000:
73
  print('Translating a batch.')
74
- full_translated_texts += batch_translate(batch, source_lang, target_lang)
 
 
 
 
 
75
  batch = []
76
  word_count = 0
77
  batch.append(string)
78
- word_count += len(string.split())
79
-
80
- full_translated_texts += batch_translate(batch, source_lang, target_lang)
 
 
 
 
 
 
81
  return full_translated_texts
82
 
83
  def merge_runs(runs):
@@ -98,8 +270,7 @@ def merge_runs(runs):
98
  return merged_runs
99
 
100
  NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
101
-
102
- def translate_header_footer(doc, target_lang):
103
  head_foot = []
104
  for section in doc.sections:
105
  for header in section.header.paragraphs:
@@ -108,7 +279,7 @@ def translate_header_footer(doc, target_lang):
108
  for footer in section.footer.paragraphs:
109
  for run in footer.runs:
110
  head_foot.append(run.text)
111
- translated_head_foot = full_translate(head_foot, target_lang)
112
 
113
  i = 0
114
  for section in doc.sections:
@@ -119,8 +290,8 @@ def translate_header_footer(doc, target_lang):
119
  for footer in section.footer.paragraphs:
120
  for run in footer.runs:
121
  run.text = translated_head_foot[i]
122
- i += 1
123
-
124
  def get_text_elements_para(doc):
125
  para_texts = []
126
  for para in doc.paragraphs:
@@ -133,7 +304,9 @@ def get_text_elements_para(doc):
133
  for part in parts:
134
  if re.match(emoji_pattern, part):
135
  continue
136
- para_texts.append(part)
 
 
137
  return para_texts
138
 
139
  def get_text_elements_table(doc):
@@ -155,9 +328,10 @@ def translate_paragraphs(doc, translated_texts, i = 0):
155
  for j in range(len(parts)):
156
  if re.match(emoji_pattern, parts[j]):
157
  continue
158
- translated_text = translated_texts[i]
159
- i += 1
160
- parts[j] = translated_text
 
161
  element.text = "".join(parts)
162
  return doc, i
163
 
@@ -169,36 +343,60 @@ def translate_tables(doc, translated_texts):
169
  cell, i = translate_paragraphs(cell, translated_texts, i)
170
  return doc
171
 
172
- def translate_docx_from_mongodb(file_id, source_lang, target_lang="Vietnamese"):
173
- # Kết nối MongoDB
174
- client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
175
- db = client["word"]
176
- fs_input = GridFS(db, collection="root_file")
177
- fs_output = GridFS(db, collection="final_file")
 
 
 
 
178
 
179
- # Lấy file từ MongoDB
180
- file_data = fs_input.get(file_id).read()
181
- original_file = fs_input.get(file_id).filename # Lấy tên gốc của file
182
- doc = Document(io.BytesIO(file_data))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- # Lấy nội dung và dịch
185
  para_texts = get_text_elements_para(doc)
186
- translated_para = full_translate(para_texts, source_lang, target_lang)
187
-
 
 
188
  table_texts = get_text_elements_table(doc)
189
- translated_tables = full_translate(table_texts, source_lang, target_lang)
190
-
191
- # Cập nhật nội dung dịch vào document
 
192
  doc, _ = translate_paragraphs(doc, translated_para)
 
193
  doc = translate_tables(doc, translated_tables)
194
- translate_header_footer(doc, target_lang)
195
-
196
- # Lưu file dịch vào MongoDB với cùng tên gốc
197
- output_stream = io.BytesIO()
198
- doc.save(output_stream)
199
- output_stream.seek(0)
200
-
201
- translated_file_id = fs_output.put(output_stream, filename=original_file)
202
- client.close()
203
-
204
- return translated_file_id
 
# Standard library
import ast
import json
import os
import re
import time
from io import BytesIO

# Third-party
import docx
import dotenv
import google.generativeai as genai
from docx import Document
from docx import Document
from gridfs import GridFS
from pymongo import MongoClient
13
+
14
 
15
# Load the Gemini API key from the local .env file.
dotenv.load_dotenv(".env")
api_key = os.getenv("GEMINI_API_KEY")
# BUG FIX: the key was loaded but never handed to the SDK; without this call
# every generate_content request fails to authenticate (the SDK only
# auto-reads GOOGLE_API_KEY, not GEMINI_API_KEY).
# NOTE(review): if configuration happens elsewhere in the package this is a
# harmless re-configure — confirm against the callers.
if api_key:
    genai.configure(api_key=api_key)
 
 
17
 
18
+
19
def batch_translate(texts, source_lang = 'English', target_lang="Vietnamese"):
    """Send one batch of text segments to Gemini in a single API call.

    Parameters
    ----------
    texts : list[str]
        Segments to translate; order is significant and must be preserved
        by the model (the prompt enforces one-to-one correspondence).
    source_lang, target_lang : str
        Language names interpolated into the user prompt.

    Returns
    -------
    The raw Gemini response object; parse it with ``response_to_dict``.
    NOTE(review): when ``texts`` is empty the input list itself is returned
    instead of a response object — callers must not parse that value.
    """
    if not texts:
        return texts  # Skip if empty

    system_prompt = """
    Translate the contents of a JSON file from the specified source language to the specified target language while preserving the structure, spaces, and context of the original text.

    Instructions:
    1. You will be given three inputs: source language, target language, and a JSON file.
    2. The JSON file contains a Python dictionary where each key is an integer, and each value is a string.
    3. Ensure one-to-one correspondence—each input item must correspond to exactly one output item with the same number of items.
    4. Preserve spaces before or after strings. Do not remove, merge, split, or omit any strings.
    5. Translate paragraphs and ensure the translation makes sense when text is put together.
    6. Translate split words so that the word is not split in the translation.
    7. Return a JSON object that is a Python dictionary containing as many items as the original JSON file, with keys and order preserved.
    8. The output must be a syntactically correct Python dictionary.

    Additional Examples:
    **Input 1**:
    - Source language: English
    - Target language: Vietnamese
    - JSON file:
    ```json
    {"0": "My name is ", "1": "Huy", "2": ".", "3": " Today is ", "4": "a ", "5": "good day", "6": ".", "7": ""}
    ```
    **Output 1**:
    ```json
    {"0": "Tên tôi là ", "1": "Huy", "2": ".", "3": " Hôm nay là ", "4": "một ", "5": "ngày đẹp", "6": ".", "7": ""}
    ```

    **Input 2**:
    - Source language: English
    - Target language: Spanish
    - JSON file:
    ```json
    {"0": "The sky is ", "1": "blue", "2": ".", "3": " Water is ", "4": "essential", "5": " for ", "6": "life", "7": "."}
    ```
    **Output 2**:
    ```json
    {"0": "El cielo es ", "1": "azul", "2": ".", "3": " El agua es ", "4": "esencial", "5": " para ", "6": "la vida", "7": "."}
    ```

    **Input 3**:
    - Source language: English
    - Target language: French
    - JSON file:
    ```json
    {"0": "The quick brown ", "1": "fox ", "2": "jumps ", "3": "over ", "4": "the ", "5": "lazy ", "6": "dog", "7": "."}
    ```
    **Output 3**:
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux", "7": "."}
    ```

    Perform the translation and return the result as specified above. Do not include any additional text other than the translated JSON object.
    """
    # Keys are ints here; json.dumps coerces them to strings ("0", "1", ...),
    # which is exactly the shape the prompt's examples use.
    json_data = json.dumps({i: t for i, t in enumerate(texts)})
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON file: {json_data}"

    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
        'temperature': 1, # Adjust temperature for desired creativity
        'top_p': 1,
        'top_k': 1,})

    return response
89
+
90
def response_to_dict(response):
    """Extract the translated strings, in key order, from a Gemini response.

    The model wraps its dictionary in a ```json ... ``` fence.  The previous
    implementation used ``.strip("json```")``, which treats the argument as a
    *character set* and can silently eat leading/trailing payload characters.
    Instead we locate the outermost brace span and literal-eval that.

    Raises
    ------
    ValueError / SyntaxError
        If the response text does not contain a valid Python dict literal.
    """
    payload = response.text.strip()
    # Grab everything between the first '{' and the last '}' — robust to
    # code fences, stray quotes, or prose around the dictionary.
    match = re.search(r"\{.*\}", payload, re.DOTALL)
    if match:
        payload = match.group(0)
    return list(ast.literal_eval(payload).values())
92
+
93
def fix_translate(texts, translated_text):
    """Ask Gemini to repair a malformed translation so it matches the originals.

    BUG FIX (docs): the previous docstring was copied from ``batch_translate``
    and described the wrong contract — this function does not translate, it
    reconciles an already-translated payload with the original segment count.

    Parameters
    ----------
    texts : list[str]
        The original, untranslated segments (defines the expected item count).
    translated_text : str
        The raw (possibly malformed) dictionary text returned by the model.

    Returns
    -------
    list[str]
        The corrected translations, parsed via ``response_to_dict``.
        NOTE(review): when ``texts`` is empty the input list itself is
        returned — callers must handle that case.
    """
    if not texts:
        return texts  # Skip if empty

    system_prompt = """
    You are given the original JSON dictionary and the translated response text. Your task is to ensure that the translated text is in the correct format and has the same number of items as the original JSON dictionary.

    Steps to follow:
    1. Parse the original and translated JSON dictionaries.
    2. Ensure that the keys in both dictionaries are strings (i.e., "1" instead of 1).
    3. Compare the number of items in both dictionaries.
    4. If the number of items in the translated dictionary is not equal to the number of items in the original dictionary, adjust the translated dictionary by:
    a. Adding missing items with empty strings if there are fewer items.
    b. Merging or splitting items to ensure correspondence with the original items if there are more items.
    5. Ensure that each item in the translated dictionary is in the correct order, with the same key as the original item.
    6. Preserve any leading or trailing spaces in the original strings.
    7. Ensure the output is a syntactically correct Python dictionary, with proper opening and closing braces.
    8. If the translated dictionary is already correct, return it as is.
    9. Return the corrected JSON dictionary in proper Python dictionary format.

    Example Inputs and Outputs:

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "My name is ", "1": "Huy", "2": ".", "3": " Today is ", "4": "a ", "5": "good day", "6": ".", "7": ""}
    ```
    - Translated response text with fewer items:
    ```json
    {"0": "Tên tôi là ", "1": "Huy", "2": ".", "3": "Hôm nay ", "4": "là một ", "5": "ngày đẹp", "6": "."}
    ```

    **Output:**
    ```json
    {"0": "Tên tôi là ", "1": "Huy", "2": ".", "3": "Hôm nay ", "4": "là một ", "5": "ngày đẹp", "6": ".", "7": ""}
    ```

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "The sky is ", "1": "blue", "2": ".", "3": " Water is ", "4": "essential", "5": " for ", "6": "life", "7": "."}
    ```
    - Translated response text with more items:
    ```json
    {"0": "El cielo es ", "1": "azul", "2": ".", "3": " El agua es ", "4": "esencial", "5": " para ", "6": "la", "7": " vida", "8": "."}
    ```

    **Output:**
    ```json
    {"0": "El cielo es ", "1": "azul", "2": ".", "3": " El agua es ", "4": "esencial", "5": " para ", "6": "la vida", "7": "."}
    ```

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "The quick brown ", "1": "fox ", "2": "jumps ", "3": "over ", "4": "the ", "5": "lazy ", "6": "dog", "7": "."}
    ```
    - Translated response text with issues:
    ```json
    {"0": "Le renard ", "1": "brun ", 2: "rapide ", 3: "saute ", 4: "par-dessus ", "5": "le ", "6": "chien ", "7": "paresseux", 8: "."}
    ```

    **Output:**
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux", "7": "."}
    ```

    **Input:**
    - Original JSON dictionary:
    ```json
    {"0": "The quick brown ", "1": "fox ", "2": "jumps ", "3": "over ", "4": "the ", "5": "lazy ", "6": "dog."}
    ```
    - Translated response text with wrong formatting:
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux".}
    ```

    **Output:**
    ```json
    {"0": "Le renard brun ", "1": "rapide ", "2": "saute ", "3": "par-dessus ", "4": "le ", "5": "chien ", "6": "paresseux."}
    ```

    Perform the corrections and return the result as a properly formatted Python dictionary.
    """
    json_data = json.dumps({i: t for i, t in enumerate(texts)})
    user_prompt = f"Original JSON dictionary: {json_data}. Translated response text: {translated_text}"

    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
        'temperature': 1, # Adjust temperature for desired creativity
        'top_p': 1,
        'top_k': 1,})
    return response_to_dict(response)
188
+
189
def brute_force_fix(batch, translated_batch):
    """Force ``translated_batch`` to the same length as ``batch``.

    Shorter results are padded with empty strings; longer results are
    truncated. Used as a last resort when the model's output count cannot
    be reconciled with the input count.
    """
    target = len(batch)
    if len(translated_batch) < target:
        # Pad in place with empty placeholders for the missing items.
        translated_batch += [""] * (target - len(translated_batch))
    elif len(translated_batch) > target:
        # Drop the surplus trailing items.
        translated_batch = translated_batch[:target]
    return translated_batch
195
+
196
def batch_translate_loop(batch, source_lang, target_lang):
    """Translate one batch and repair malformed model output.

    Strategy: translate, parse; on parse failure or item-count mismatch,
    ask the model (``fix_translate``) to repair the payload, retrying up to
    10 times; finally pad/truncate with ``brute_force_fix`` so the caller
    always gets exactly ``len(batch)`` items.

    Fixes over the previous version:
    - ``mismatches`` was a global that was never initialised, raising
      NameError on the first mismatch; it is now initialised lazily.
    - After a *successful* retry, an extra unconditional ``fix_translate``
      call overwrote the good result and wasted an API call; removed.
    - Bare ``except:`` narrowed to ``except Exception`` and the log message
      made professional.

    Raises
    ------
    ValueError
        If no parseable translation could be obtained at all.
    """
    response = batch_translate(batch, source_lang, target_lang)
    translated_batch = None
    try:
        translated_batch = response_to_dict(response)
        if len(translated_batch) != len(batch):
            raise ValueError("length mismatch")
    except Exception:
        raw_text = response.text.strip().strip("json```").strip("```").strip().strip("\"")
        for attempt in range(10):
            print(f'Malformed translation output, retry attempt {attempt}:')
            try:
                translated_batch = fix_translate(batch, raw_text)
                if len(translated_batch) == len(batch):
                    break
            except Exception:
                pass
        if translated_batch is None:
            # Every fix attempt failed to parse; fall back to the raw response.
            try:
                translated_batch = response_to_dict(response)
            except Exception:
                raise ValueError("The translated batch is not a list.")
    if len(translated_batch) != len(batch):
        print("Length mismatch after translation. Brute Force Fixing...")
        translated_batch = brute_force_fix(batch, translated_batch)
        global mismatches
        # Lazily initialise the module-level counter so a missing definition
        # does not raise NameError.
        mismatches = globals().get("mismatches", 0) + 1
    print(len(batch), len(translated_batch))
    return translated_batch
225
+
226
def full_translate(texts, source_lang = 'English', target_lang="Vietnamese"):
    """Translate a list of text segments in word-count-limited batches.

    Segments are accumulated until a batch reaches ~1000 words, then the
    batch is sent through ``batch_translate_loop``. The returned list has
    the same length and order as ``texts``.

    Fixes over the previous version:
    - ``word_count += len(string)`` added *characters* while the threshold
      compares *words*; words are now counted consistently.
    - ``time_spent_sleeping`` was a global that was never initialised
      (NameError on first use); initialised lazily.
    - Guard against translating an empty batch when the very first segment
      alone exceeds the threshold.
    """
    full_translated_texts = []
    batch = []
    word_count = 0
    global time_spent_sleeping

    for string in texts:
        if len(string.split()) + word_count >= 1000:
            if batch:
                print('Translating a batch.')
                full_translated_texts += batch_translate_loop(batch, source_lang, target_lang)
                time.sleep(3)  # crude rate limiting between Gemini calls
                time_spent_sleeping = globals().get("time_spent_sleeping", 0) + 3
            batch = []
            word_count = 0
        batch.append(string)
        word_count += len(string.split())

    print('Translating a batch.')
    if len(batch) == 0:
        return full_translated_texts

    full_translated_texts += batch_translate_loop(batch, source_lang, target_lang)
    return full_translated_texts
254
 
255
  def merge_runs(runs):
 
270
  return merged_runs
271
 
272
  NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
273
+ def translate_header_footer(doc, source_lang, target_lang):
 
274
  head_foot = []
275
  for section in doc.sections:
276
  for header in section.header.paragraphs:
 
279
  for footer in section.footer.paragraphs:
280
  for run in footer.runs:
281
  head_foot.append(run.text)
282
+ translated_head_foot = full_translate(head_foot, source_lang, target_lang)
283
 
284
  i = 0
285
  for section in doc.sections:
 
290
  for footer in section.footer.paragraphs:
291
  for run in footer.runs:
292
  run.text = translated_head_foot[i]
293
+ i += 1
294
+
295
  def get_text_elements_para(doc):
296
  para_texts = []
297
  for para in doc.paragraphs:
 
304
  for part in parts:
305
  if re.match(emoji_pattern, part):
306
  continue
307
+ if len(part.strip()) != 0:
308
+ para_texts.append(part)
309
+
310
  return para_texts
311
 
312
  def get_text_elements_table(doc):
 
328
  for j in range(len(parts)):
329
  if re.match(emoji_pattern, parts[j]):
330
  continue
331
+ if len(parts[j].strip()) != 0:
332
+ translated_text = translated_texts[i]
333
+ i += 1
334
+ parts[j] = translated_text
335
  element.text = "".join(parts)
336
  return doc, i
337
 
 
343
  cell, i = translate_paragraphs(cell, translated_texts, i)
344
  return doc
345
 
346
def is_same_formatting(text1, text2):
    """
    Check if two texts have the same formatting.

    Two runs match when bold, italic, underline, font size, font colour
    and font name are all equal.
    """
    font1, font2 = text1.font, text2.font
    if text1.bold != text2.bold or text1.italic != text2.italic:
        return False
    if text1.underline != text2.underline:
        return False
    return (font1.size == font2.size
            and font1.color.rgb == font2.color.rgb
            and font1.name == font2.name)
356
 
357
def merge_elements(doc):
    """Coalesce consecutive identically formatted runs inside each paragraph.

    Text from each following run with the same formatting (per
    ``is_same_formatting``) is appended to the first run of the streak and
    the following run's text is blanked. The document is modified in place
    and also returned.
    """
    for paragraph in doc.paragraphs:
        anchor = None  # first run of the current same-formatting streak
        for item in paragraph.iter_inner_content():
            if not isinstance(item, docx.text.run.Run):
                # Non-run content (e.g. hyperlinks) is left untouched.
                continue
            if anchor is not None and is_same_formatting(anchor, item):
                anchor.text += item.text
                item.text = ""
            else:
                anchor = item
    return doc
370
+
371
+ def translate_docx(word_id, source_lang = "English", target_lang="Vietnamese", output_num = ''):
372
+ """ Translates a Word document efficiently using batch processing. """
373
+
374
+ client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
375
+ db = client['pptx']
376
+ fs = GridFS(db, collection='root_file')
377
+ word_file = fs.get(word_id)
378
+
379
+ doc = Document(BytesIO(word_file.read()))
380
+ output_file = os.path.join(os.path.dirname(input_file), f"{output_num}{target_language}_translated_{os.path.basename(input_file)}")
381
+
382
+ doc = merge_elements(doc)
383
 
384
+ print('Translating paragraphs.')
385
  para_texts = get_text_elements_para(doc)
386
+ translated_para = full_translate(para_texts, source_lang = source_lang, target_lang = target_lang)
387
+ print('Done translating pararaphs.')
388
+
389
+ print('Translating tables.')
390
  table_texts = get_text_elements_table(doc)
391
+ translated_tables = full_translate(table_texts, source_lang = source_lang, target_lang = target_lang)
392
+ print('Done translating tables.')
393
+
394
+ print('Inserting paragaphs')
395
  doc, _ = translate_paragraphs(doc, translated_para)
396
+ print('Inserting tables.')
397
  doc = translate_tables(doc, translated_tables)
398
+
399
+ translate_header_footer(doc, source_lang, target_lang)
400
+ print('Done translating headers & footers.')
401
+
402
+ doc.save(output_file)