# Spaces: mintlee / MT_deploy (Running). File size: 10,053 Bytes

import docx
from docx import Document
import google.generativeai as genai
import ast
import json
import re
import dotenv
import os
import io

from pymongo import MongoClient
from gridfs import GridFS
from docx import Document

# Load variables from a local ".env" file into the process environment, then
# read the Gemini API key from the environment.
dotenv.load_dotenv(".env")
api_key = os.getenv("GEMINI_API_KEY")
# Configure the google.generativeai client with the key read above
# (os.getenv returns None when the variable is not set).
genai.configure(api_key=api_key)
# Module-level model handle.
# NOTE(review): batch_translate builds its own local model object, so this one
# looks unused in the visible code — confirm before relying on it.
model = genai.GenerativeModel("gemini-2.0-flash")

def _parse_model_reply(raw_text):
    """Decode the model's reply into Python objects.

    The reply is expected to be a JSON array, possibly wrapped in a Markdown
    code fence or surrounded by stray text. The outermost ``[...]`` span is
    extracted and decoded as JSON; if that fails, the span is parsed as a
    Python literal (the model occasionally answers with single-quoted
    strings). Exceptions from ``ast.literal_eval`` propagate when neither
    decoder accepts the payload.
    """
    payload = raw_text.strip()
    start, end = payload.find("["), payload.rfind("]")
    if 0 <= start < end:
        payload = payload[start:end + 1]
    try:
        # strict=False tolerates raw control characters inside strings.
        return json.loads(payload, strict=False)
    except json.JSONDecodeError:
        return ast.literal_eval(payload)


def _coerce_index(value):
    """Return ``value`` as an int, or None when it cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _entry_text(entry, fallback):
    """Return the translated string of one reply item, or ``fallback``.

    The "text" value may be a plain string or a list whose first element is
    the string (both shapes are produced by the model); an empty list yields
    an empty string.
    """
    value = entry.get("text")
    if isinstance(value, list):
        value = value[0] if value else ""
    return value if isinstance(value, str) else fallback


def _align_translations(parsed, texts):
    """Map the parsed reply onto ``texts`` so the result has the same length.

    When the reply has exactly one usable item per input, the items are
    ordered by their "index" value and returned in that order. Otherwise each
    item is written to the slot named by its index and every slot the reply
    did not cover keeps its source text, so callers that consume the result
    positionally never run out of items.
    """
    if not isinstance(parsed, list):
        raise ValueError(
            f"Expected a JSON array from the model, got {type(parsed).__name__}"
        )

    indexed = []
    for entry in parsed:
        if isinstance(entry, dict):
            position = _coerce_index(entry.get("index"))
            if position is not None:
                indexed.append((position, entry))

    if len(indexed) == len(texts):
        indexed.sort(key=lambda pair: pair[0])
        return [
            _entry_text(entry, source)
            for (_, entry), source in zip(indexed, texts)
        ]

    print(
        f"Warning: expected {len(texts)} translated items, got {len(indexed)}; "
        "keeping source text for missing items."
    )
    aligned = list(texts)
    for position, entry in indexed:
        if 0 <= position < len(aligned):
            aligned[position] = _entry_text(entry, aligned[position])
    return aligned


def batch_translate(texts, target_lang="Vietnamese"):
    """Translate a list of text segments with a single Gemini API call.

    Args:
        texts: Sequence of source strings. A falsy value (empty list, None)
            is returned unchanged without calling the API.
        target_lang: Name of the language to translate into.

    Returns:
        A list of translated strings with exactly ``len(texts)`` items, in
        input order. Items the model failed to return keep their source text.

    Raises:
        ValueError: If the reply decodes to something other than an array.
        SyntaxError, ValueError: From ``ast.literal_eval`` when the reply can
            be decoded neither as JSON nor as a Python literal.
    """
    if not texts:
        return texts  # Nothing to translate; avoid a pointless API call.

    # NOTE(review): the worked example at the end of this prompt lists
    # index 4 before index 3 with their texts swapped, which contradicts the
    # "index should stay consistent" rule stated above it. The prompt is left
    # untouched here because editing it changes model behaviour — confirm and
    # fix deliberately.
    system_prompt = """ You are given three inputs: source language, target language and a json file.
            - Your task is to translate the JSON file from the source language (you have to detect the source language yourself) to the target language.
            - The original JSON file contains a Python array of objects, each with "index" and "text" keys.  
            - Ensure **one-to-one correspondence** — the output must have exactly as many items as the input.  
            - The array contains text that makes up whole paragraphs. Make sure that the translation makes sense when the text is put together and retains the same context.
            - This is very important: Empty spaces should be left as is. For example: From English, "Hello " should be translated into Vietnamese as "Xin chào ", with the same space at the end.
            - Very frequently there are spaces before or after a string. Do not remove these spaces. 
            - If the source language is English and the target language is Vietnamese and a string contains "'s" in the possessive sense, translate it as "của". 
            - Example: [["WorldQuant's"], ["Mission"]] should be translated as [["Nhiệm vụ"], ["của WorldQuant"]]
            - Words might be split into multiple continuous arrays. Translate them such that the translation corresponds to the full word.
            - If a word is split up into multiple arrays, the translation should be such that the word is not split up.
            - Exampe: ['Tesla sells its pro', 'ducts'] should be translated as ['Tesla bán sản phẩm của mình', ''.]. Note that the number of elements in the output is the same as the input.
            - Example: [["Hello"], ["World"]] should be translated as [["Xin chào"], ["Thế giới"]]
            - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.  
            - Return a JSON object that is a Python array.
            - Each object in the array is a dictionary with two keys: "index" and "text".
            - The text should be the translated version of the text in the original object, and the index should stay consistent.
            - The number of objects in the output MUST the same as the number of objects in the input.
            - The format of the output should look exactly like the example.
            - Example:
            **Input**: Target language: Vietnamese. JSON file: 
            [{"index": 0, "text": ["My name is "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 3, "text": ["Today is "]}, {"index": 4, "text": ["a"]}, {"index": 5, "text": ["good day"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]'
            **Output**: [{"index": 0, "text": ["Tên tôi là "]}, {"index": 1, "text": ["Huy"]}, {"index": 2, "text": ["."]}, {"index": 4, "text": ["Hôm nay là "]}, {"index": 3, "text": ["Một"]}, {"index": 5, "text": ["ngày đẹp"]}, {"index": 6, "text": ["."]}, {"index": 7, "text": [""]}]
            - Return the result of translation according to the format. Do NOT return code for translating.
            """
    json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"

    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 1,
            'top_p': 1,
            'top_k': 1,
        },
    )

    parsed = _parse_model_reply(response.text)
    return _align_translations(parsed, texts)

def full_translate(texts, target_lang="Vietnamese"):
    """Translate ``texts`` in word-count-limited batches.

    Segments are accumulated until adding the next one would bring the
    running whitespace-separated word count to 1000 or more; the pending
    batch is then sent to ``batch_translate`` and a new batch is started with
    that segment. Whatever remains at the end is sent as a final batch.

    Returns the concatenated translations of all batches, in input order.
    """
    results = []
    pending = []
    pending_words = 0

    for segment in texts:
        segment_words = len(segment.split())
        if segment_words + pending_words >= 1000:
            # Flush before adding, so the current segment opens the next batch.
            print('Translating a batch.')
            results.extend(batch_translate(pending, target_lang))
            pending, pending_words = [], 0
        pending.append(segment)
        pending_words += segment_words

    results.extend(batch_translate(pending, target_lang))
    return results

def merge_runs(runs):
    """Collapse consecutive runs that share the same formatting.

    Two neighbouring items are merged when both are ``docx.text.run.Run``
    objects and agree on style, bold, italic, underline, font size, font
    colour (rgb) and font name. Merging appends the later run's text to the
    earlier run object (which is mutated in place) and leaves the later run
    out of the result. Returns the list of surviving items.
    """
    def _same_formatting(previous, current):
        # Comparison order mirrors the attribute list in the docstring.
        return (
            current.style == previous.style
            and previous.bold == current.bold
            and previous.italic == current.italic
            and previous.underline == current.underline
            and previous.font.size == current.font.size
            and previous.font.color.rgb == current.font.color.rgb
            and previous.font.name == current.font.name
        )

    collapsed = []
    for current in runs:
        if not collapsed:
            # The first item has nothing to merge into.
            collapsed.append(current)
            continue

        previous = collapsed[-1]
        both_are_runs = (
            isinstance(current, docx.text.run.Run)
            and isinstance(previous, docx.text.run.Run)
        )
        if both_are_runs and _same_formatting(previous, current):
            previous.text += current.text
        else:
            collapsed.append(current)
    return collapsed

# Namespace prefix (in "{uri}" form) built from the WordprocessingML "main"
# namespace URI; prepending it to a local name gives a fully-qualified tag.
# NOTE(review): not referenced anywhere in the visible code — confirm whether
# it is still needed.
NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

def translate_header_footer(doc, target_lang):
    """Translate the text of every header and footer run of ``doc`` in place.

    Runs are gathered section by section (header paragraphs first, then
    footer paragraphs), their texts are translated together through
    ``full_translate``, and each run then receives the translation at the
    same position. Returns None.
    """
    collected_runs = []
    for section in doc.sections:
        for part in (section.header, section.footer):
            for paragraph in part.paragraphs:
                collected_runs.extend(paragraph.runs)

    translations = full_translate(
        [run.text for run in collected_runs], target_lang
    )

    # Index explicitly so a short translation list fails at the first
    # missing item, exactly where a positional lookup would.
    for position, run in enumerate(collected_runs):
        run.text = translations[position]
                
# Fully-qualified tag of a WordprocessingML text node (<w:t>). Matching the
# exact tag, rather than any tag whose name merely ends in "t", keeps field
# codes (w:instrText), deleted text (w:delText), drawing offsets
# (wp:posOffset) and similar nodes out of the translation round-trip.
_W_TEXT_TAG = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t"

# Any single code point outside the Basic Multilingual Plane; this is the
# range the module treats as "emoji" and leaves untranslated.
_EMOJI_RE = re.compile(r'[\U00010000-\U0010FFFF]')
# Same range with a capture group, so re.split keeps the matches as items.
_EMOJI_SPLIT_RE = re.compile(r'([\U00010000-\U0010FFFF])')


def _iter_text_nodes(container):
    """Yield every non-empty <w:t> element under ``container.paragraphs``.

    Shared by extraction and write-back so both walk exactly the same nodes
    in exactly the same order.
    """
    for para in container.paragraphs:
        for element in para._element.iter(_W_TEXT_TAG):
            if element.text:
                yield element


def get_text_elements_para(doc):
    """Collect the translatable text pieces of ``doc.paragraphs``.

    Each non-empty <w:t> node is split around emoji characters; the emoji
    themselves are skipped and every other piece (including empty strings
    produced by the split) is appended in document order.

    Args:
        doc: Any object exposing ``paragraphs`` (a document or a table cell).

    Returns:
        A list of strings, positionally matched by ``translate_paragraphs``.
    """
    para_texts = []
    for element in _iter_text_nodes(doc):
        for part in _EMOJI_SPLIT_RE.split(element.text):
            if _EMOJI_RE.match(part):
                continue
            para_texts.append(part)
    return para_texts


def get_text_elements_table(doc):
    """Collect the translatable text pieces of every cell of ``doc.tables``.

    Cells are visited table by table, row by row, in ``row.cells`` order, and
    each cell is processed with ``get_text_elements_para``.
    """
    table_texts = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                table_texts += get_text_elements_para(cell)
    return table_texts


def translate_paragraphs(doc, translated_texts, i = 0):
    """Write translations back into the <w:t> nodes of ``doc.paragraphs``.

    Walks the same nodes and pieces as ``get_text_elements_para`` and replaces
    each non-emoji piece with ``translated_texts[i]``, advancing ``i`` by one
    per piece. Emoji pieces stay where they were.

    Args:
        doc: Any object exposing ``paragraphs`` (a document or a table cell).
        translated_texts: Translations in extraction order.
        i: Position in ``translated_texts`` to start consuming from.

    Returns:
        A ``(doc, i)`` tuple where ``i`` is the next unconsumed position.

    Raises:
        IndexError: If ``translated_texts`` has fewer items than pieces.
    """
    for element in _iter_text_nodes(doc):
        parts = _EMOJI_SPLIT_RE.split(element.text)
        for j, part in enumerate(parts):
            if _EMOJI_RE.match(part):
                continue
            parts[j] = translated_texts[i]
            i += 1
        element.text = "".join(parts)
    return doc, i


def translate_tables(doc, translated_texts):
    """Write translations back into every cell of ``doc.tables``.

    Cells are visited in the same order as ``get_text_elements_table`` and a
    single running position into ``translated_texts`` is carried across them.
    Returns ``doc``.
    """
    i = 0
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell, i = translate_paragraphs(cell, translated_texts, i)
    return doc

def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
    """Translate a .docx stored in GridFS and store the translated copy.

    The source file is read from the "root_file" GridFS collection of the
    "word" database, its body paragraphs, tables and headers/footers are
    translated in place, and the result is written to the "final_file"
    GridFS collection under the original filename.

    The connection string is taken from the ``MONGODB_URI`` environment
    variable when it is set; otherwise the built-in default is used.

    Args:
        file_id: GridFS id of the source document in "root_file".
        target_lang: Name of the language to translate into.

    Returns:
        The GridFS id of the translated file in "final_file".

    Errors raised by pymongo/gridfs, python-docx or the translation helpers
    propagate to the caller; the MongoDB client is closed in every case.
    """
    # NOTE(review): the fallback below embeds credentials in source code.
    # Prefer supplying MONGODB_URI through the environment and removing the
    # literal once deployments are updated.
    mongo_uri = os.getenv(
        "MONGODB_URI",
        "mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
    )

    # Connect to MongoDB.
    client = MongoClient(mongo_uri)
    try:
        db = client["word"]
        fs_input = GridFS(db, collection="root_file")
        fs_output = GridFS(db, collection="final_file")

        # Fetch the stored file once; reuse the handle for data and filename.
        stored_file = fs_input.get(file_id)
        file_data = stored_file.read()
        original_file = stored_file.filename  # Keep the original file name.
        doc = Document(io.BytesIO(file_data))

        # Extract the text and translate it.
        para_texts = get_text_elements_para(doc)
        translated_para = full_translate(para_texts, target_lang)

        table_texts = get_text_elements_table(doc)
        translated_tables = full_translate(table_texts, target_lang)

        # Write the translated content back into the document.
        doc, _ = translate_paragraphs(doc, translated_para)
        doc = translate_tables(doc, translated_tables)
        translate_header_footer(doc, target_lang)

        # Save the translated file to MongoDB under the same original name.
        output_stream = io.BytesIO()
        doc.save(output_stream)
        output_stream.seek(0)

        return fs_output.put(output_stream, filename=original_file)
    finally:
        # Release the connection even when reading, translating or saving fails.
        client.close()