# MT_deploy/word/word_translate.py
import ast
import io
import json
import os
import re

import docx
import dotenv
import google.generativeai as genai
from docx import Document
from gridfs import GridFS
from pymongo import MongoClient
dotenv.load_dotenv(".env")
api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.0-flash")
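
# The .env file is expected to define GEMINI_API_KEY (see os.getenv above); if it
# is missing, api_key is None and the Gemini calls below will not be authorized
# by this script unless credentials are supplied some other way.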
def batch_translate(texts, target_lang="Vietnamese"):
""" Translates multiple text segments in a single API call. """
if not texts:
return texts # Skip if empty
system_prompt = """ You are given three inputs: source language, target language and a json file.
- Your task is to translate the JSON file from the source language (you have to detect the source language yourself) to the target language.
- The original JSON file contains a Python array of objects, each with "index" and "text" keys.
- Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.
- The array contains text that makes up whole paragraphs. Make sure that the translation makes sense when the text is put together and retains the same context.
- This is very important: Empty spaces should be left as is. For example: From English, "Hello " should be translated into Vietnamese as "Xin chào ", with the same space at the end.
- Very frequently there are spaces before or after a string. Do not remove these spaces.
- If the source language is English and the target language is Vietnamese and a string contains "'s" in the possessive sense, translate it as "của".
- Example: [["WorldQuant's"], ["Mission"]] should be translated as [["Nhiệm vụ"], ["của WorldQuant"]]
- Words might be split into multiple continuous arrays. Translate them such that the translation corresponds to the full word.
- If a word is split up into multiple arrays, the translation should be such that the word is not split up.
- Example: ['Tesla sells its pro', 'ducts'] should be translated as ['Tesla bán sản phẩm của mình', '']. Note that the number of elements in the output is the same as the input.
- Example: [["Hello"], ["World"]] should be translated as [["Xin chào"], ["Thế giới"]]
- Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
- Return a JSON object that is a Python array.
- Each object in the array is a dictionary with two keys: "index" and "text".
- The text should be the translated version of the text in the original object, and the index should stay consistent.
- The number of objects in the output MUST be the same as the number of objects in the input.
- The format of the output should look exactly like the example.
- Example:
**Input**: Target language: Vietnamese. JSON file:
[{"index": 0, "text": ["My name is "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 3, "text": ["Today is "]}, {"index": 4, "text": ["a"]}, {"index": 5, "text": ["good day"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]'
**Output**: [{"index": 0, "text": ["Tên tôi là "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 4, "text": ["Hôm nay là "]}, {"index": 3, "text": ["Một"]}, {"index": 5, "text": ["ngày đẹp"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]
- Return the result of translation according to the format. Do NOT return code for translating.
"""
json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"
    # Reuse the module-level Gemini model and strip any Markdown code fence before parsing.
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 1,  # Adjust temperature for desired creativity
            'top_p': 1,
            'top_k': 1,
        },
    )
    cleaned = re.sub(r'^```(?:json)?\s*|\s*```$', '', response.text.strip())
    response_dict = ast.literal_eval(cleaned)
    translated_texts = []
    if len(response_dict) > 0:
        if isinstance(response_dict[0]['text'], list):
            translated_texts = [item['text'][0] for item in sorted(response_dict, key=lambda x: x['index'])]
        elif isinstance(response_dict[0]['text'], str):
            translated_texts = [item['text'] for item in sorted(response_dict, key=lambda x: x['index'])]
    return translated_texts
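
# Usage sketch (illustrative only, requires a valid GEMINI_API_KEY in .env):
#   batch_translate(["My name is ", "Huy", "."], target_lang="Vietnamese")
# is expected to return a list of the same length, e.g. ["Tên tôi là ", "Huy", "."],
# preserving leading/trailing spaces as instructed in the system prompt.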
def full_translate(texts, target_lang="Vietnamese"):
full_translated_texts = []
batch = []
word_count = 0
for string in texts:
if len(string.split()) + word_count >= 1000:
print('Translating a batch.')
full_translated_texts += batch_translate(batch, target_lang)
batch = []
word_count = 0
batch.append(string)
word_count += len(string.split())
full_translated_texts += batch_translate(batch, target_lang)
return full_translated_texts
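
# Batching note: full_translate flushes a batch once it reaches roughly 1000 words.
# The threshold is a heuristic chosen in this script to keep each Gemini request
# comfortably small, not a documented API limit.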
def merge_runs(runs):
""" Merges adjacent runs with the same style. """
merged_runs = []
for run in runs:
if (merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run) and
run.style == merged_runs[-1].style and
merged_runs[-1].bold == run.bold and
merged_runs[-1].italic == run.italic and
merged_runs[-1].underline == run.underline and
merged_runs[-1].font.size == run.font.size and
merged_runs[-1].font.color.rgb == run.font.color.rgb and
merged_runs[-1].font.name == run.font.name):
merged_runs[-1].text += run.text
else:
merged_runs.append(run)
return merged_runs
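
# Note: merge_runs is not called elsewhere in this module. A hypothetical use is
# merge_runs(paragraph.runs) to coalesce adjacent runs that share style, bold,
# italic, underline, size, color, and font before extracting text, which would
# reduce the number of fragments sent for translation.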
NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
def translate_header_footer(doc, target_lang):
head_foot = []
for section in doc.sections:
for header in section.header.paragraphs:
for run in header.runs:
head_foot.append(run.text)
for footer in section.footer.paragraphs:
for run in footer.runs:
head_foot.append(run.text)
translated_head_foot = full_translate(head_foot, target_lang)
i = 0
for section in doc.sections:
for header in section.header.paragraphs:
for run in header.runs:
run.text = translated_head_foot[i]
i += 1
for footer in section.footer.paragraphs:
for run in footer.runs:
run.text = translated_head_foot[i]
i += 1
def get_text_elements_para(doc):
para_texts = []
for para in doc.paragraphs:
for element in para._element.iter():
if element.tag.endswith('t'):
if element.text:
emoji_pattern = r'[\U00010000-\U0010FFFF]'
# Split the text but keep emojis as separate elements
parts = re.split(f'({emoji_pattern})', element.text)
for part in parts:
if re.match(emoji_pattern, part):
continue
para_texts.append(part)
return para_texts
def get_text_elements_table(doc):
table_texts = []
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
table_texts += get_text_elements_para(cell)
return table_texts
def translate_paragraphs(doc, translated_texts, i = 0):
for para in doc.paragraphs:
for element in para._element.iter():
if element.tag.endswith('t'):
if element.text:
emoji_pattern = r'[\U00010000-\U0010FFFF]'
# Split the text but keep emojis as separate elements
parts = re.split(f'({emoji_pattern})', element.text)
for j in range(len(parts)):
if re.match(emoji_pattern, parts[j]):
continue
translated_text = translated_texts[i]
i += 1
parts[j] = translated_text
element.text = "".join(parts)
return doc, i
def translate_tables(doc, translated_texts):
i = 0
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
cell, i = translate_paragraphs(cell, translated_texts, i)
return doc
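
# Correspondence note: translate_paragraphs and translate_tables must visit the
# text (w:t) elements in exactly the same order as get_text_elements_para and
# get_text_elements_table, so translated_texts[i] maps back onto the i-th
# extracted fragment; emoji parts are skipped identically on both passes.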
def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
    # Connect to MongoDB
client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
db = client["word"]
fs_input = GridFS(db, collection="root_file")
fs_output = GridFS(db, collection="final_file")
    # Fetch the file from MongoDB
    file_data = fs_input.get(file_id).read()
    original_file = fs_input.get(file_id).filename  # Keep the file's original name
doc = Document(io.BytesIO(file_data))
    # Extract the text content and translate it
para_texts = get_text_elements_para(doc)
translated_para = full_translate(para_texts, target_lang)
table_texts = get_text_elements_table(doc)
translated_tables = full_translate(table_texts, target_lang)
    # Write the translated content back into the document
doc, _ = translate_paragraphs(doc, translated_para)
doc = translate_tables(doc, translated_tables)
translate_header_footer(doc, target_lang)
    # Save the translated file to MongoDB under the same original name
output_stream = io.BytesIO()
doc.save(output_stream)
output_stream.seek(0)
translated_file_id = fs_output.put(output_stream, filename=original_file)
client.close()
return translated_file_id
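
# Minimal usage sketch. The ObjectId below is a placeholder, not a real GridFS id;
# it assumes a .docx file has already been uploaded to the "root_file" bucket.
if __name__ == "__main__":
    from bson import ObjectId

    example_file_id = ObjectId("000000000000000000000000")  # hypothetical id
    new_id = translate_docx_from_mongodb(example_file_id, target_lang="Vietnamese")
    print(f"Translated file stored with id: {new_id}")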