import os
import ast
import io
import json
from copy import deepcopy

import docx
from docx import Document
from docx.oxml import OxmlElement
from google import genai  # Gemini client used for LLM translation
from gridfs import GridFS
from pymongo import MongoClient

# Read the Gemini API key from the environment instead of hard-coding it in the source.
gemini_api = os.getenv("GEMINI_API_KEY")
target_language = 'vi'
source_language = 'en'
def batch_translate(texts, source_lang='en', target_lang="fr"):
    """Translate multiple text segments in a single API call."""
    if not texts:
        return texts  # Skip if empty
    prompt = f"""
    Translate the following JSON array from {source_lang} into {target_lang} while preserving names, links, symbols, and formatting:
    {json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])}
    - The input is a JSON array of objects, each with "index" and "text" keys.
    - Ensure **one-to-one correspondence**: the output must have exactly as many items as the input.
    - Do **not** merge, split, or omit strings. Each input object corresponds to exactly one output object.
    - Return only valid JSON: an array of translated objects with the same keys.
    - If the input array is empty, return an empty array.
    """
    client = genai.Client(api_key=gemini_api)
    response = client.models.generate_content(
        model="gemini-2.0-flash", contents=prompt)
    # Strip optional Markdown code fences before parsing the model output.
    raw = response.text.strip()
    if raw.startswith("```"):
        raw = raw.strip("`").strip()
        if raw.startswith("json"):
            raw = raw[len("json"):].strip()
    try:
        translated_output = json.loads(raw)
    except (json.JSONDecodeError, ValueError):
        # Fall back to a Python-literal parse if the model returned non-strict JSON.
        translated_output = ast.literal_eval(raw)
    return [item["text"] for item in translated_output]
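# Illustrative expectation (not executed here): with a valid API key configured,
# a call such as batch_translate(["Hello", "Thank you"], "en", "fr") should return
# a list of the same length, e.g. ["Bonjour", "Merci"]; the exact wording depends
# on the model's output.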
def merge_runs(runs):
    """Merge adjacent runs that share identical character formatting.

    The text of each group accumulates on the group's first run; hyperlinks and
    runs with different formatting are kept as separate items.
    """
    merged_runs = []
    for run in runs:
        if (
            merged_runs and
            isinstance(run, docx.text.run.Run) and
            isinstance(merged_runs[-1], docx.text.run.Run) and
            run.style == merged_runs[-1].style and
            merged_runs[-1].bold == run.bold and
            merged_runs[-1].italic == run.italic and
            merged_runs[-1].underline == run.underline and
            merged_runs[-1].font.size == run.font.size and
            merged_runs[-1].font.color.rgb == run.font.color.rgb and
            merged_runs[-1].font.name == run.font.name
        ):
            merged_runs[-1].text += run.text
        else:
            merged_runs.append(run)
    return merged_runs
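# Illustrative example: three consecutive runs "Hel", "lo ", "world" that share the
# same style, bold/italic/underline, size, colour, and font are collapsed into a
# single run whose text reads "Hello world", so the translator receives one coherent
# segment instead of three fragments.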
# WordprocessingML namespace prefix, used to locate <w:drawing> and <w:pict> elements.
NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

def translate_paragraphs(doc, source_lang, target_lang):
    # Collect the text of every (merged) run in the body paragraphs.
    paragraphs = []
    for para in doc.paragraphs:
        for run in merge_runs(para.iter_inner_content()):
            if isinstance(run, docx.text.run.Run):
                paragraphs.append(run.text)

    # Translate in batches of roughly 5,000 characters per request.
    translated_paragraphs = []
    temp_batch = []
    chars = 0
    for text in paragraphs:
        if len(text) + chars > 5000:
            translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)
            temp_batch = []
            chars = 0
        chars += len(text)
        temp_batch.append(text)
    translated_paragraphs += batch_translate(temp_batch, source_lang, target_lang)

    if len(translated_paragraphs) > 0:
        # Write the translated text back, preserving run formatting.
        para_index = 0
        for para in doc.paragraphs:
            original_para = deepcopy(para)
            para.clear()  # Remove text while keeping paragraph properties
            for run in merge_runs(original_para.iter_inner_content()):
                if isinstance(run, docx.text.run.Run):
                    translated_text = translated_paragraphs[para_index]
                    # Drop any characters that cannot be encoded as UTF-8.
                    translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')
                    drawing = run._element.find(f".//{NS_W}drawing")
                    pict = run._element.find(f".//{NS_W}pict")
                    # Create a new run with the translated text and copy the formatting.
                    new_run = para.add_run(translated_text)
                    new_run.style = run.style
                    # Re-attach any inline image carried by the original run.
                    if drawing is not None:
                        new_run._element.append(drawing)
                    elif pict is not None:
                        new_run._element.append(pict)
                    # Copy character formatting from the original run.
                    new_run.bold = run.bold
                    new_run.italic = run.italic
                    new_run.underline = run.underline
                    new_run.font.size = run.font.size
                    new_run.font.color.rgb = run.font.color.rgb
                    new_run.font.name = run.font.name
                    para_index += 1
                elif isinstance(run, docx.text.hyperlink.Hyperlink):
                    # Re-attach hyperlinks unchanged (their text is not translated).
                    parent = run._element
                    tag = parent.tag.split("}")[-1]
                    # Recreate the hyperlink element under the w: namespace.
                    new_hyperlink = OxmlElement(f"w:{tag}")
                    for attr in parent.attrib:
                        new_hyperlink.set(attr, parent.get(attr))
                    for child in parent:
                        new_hyperlink.append(child)
                    para._element.append(new_hyperlink)
def translate_tables(doc, source_lang, target_lang):
    # Collect the text of every (merged) run in table cells.
    table_texts = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    for run in merge_runs(para.iter_inner_content()):
                        if isinstance(run, docx.text.run.Run):
                            table_texts.append(run.text)

    # Translate in batches of roughly 5,000 characters per request.
    translated_tables = []
    temp_batch = []
    chars = 0
    for text in table_texts:
        if len(text) + chars > 5000:
            translated_tables += batch_translate(temp_batch, source_lang, target_lang)
            temp_batch = []
            chars = 0
        chars += len(text)
        temp_batch.append(text)
    translated_tables += batch_translate(temp_batch, source_lang, target_lang)

    if len(translated_tables) > 0:
        # Write the translated text back cell by cell, preserving run formatting.
        table_index = 0
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for para in cell.paragraphs:
                        original_para = deepcopy(para)
                        para.clear()  # Remove text while keeping paragraph properties
                        for run in merge_runs(original_para.iter_inner_content()):
                            if isinstance(run, docx.text.run.Run):
                                translated_text = translated_tables[table_index]
                                # Drop any characters that cannot be encoded as UTF-8.
                                translated_text = translated_text.encode('utf-8', 'ignore').decode('utf-8')
                                drawing = run._element.find(f".//{NS_W}drawing")
                                pict = run._element.find(f".//{NS_W}pict")
                                # Create a new run with the translated text and copy the formatting.
                                new_run = para.add_run(translated_text)
                                new_run.style = run.style
                                if drawing is not None:
                                    new_run._element.append(drawing)
                                elif pict is not None:
                                    new_run._element.append(pict)
                                # Copy character formatting from the original run.
                                new_run.bold = run.bold
                                new_run.italic = run.italic
                                new_run.underline = run.underline
                                new_run.font.size = run.font.size
                                new_run.font.color.rgb = run.font.color.rgb
                                new_run.font.name = run.font.name
                                table_index += 1
                            elif isinstance(run, docx.text.hyperlink.Hyperlink):
                                # Re-attach hyperlinks unchanged (their text is not translated).
                                parent = run._element
                                tag = parent.tag.split("}")[-1]
                                new_hyperlink = OxmlElement(f"w:{tag}")
                                for attr in parent.attrib:
                                    new_hyperlink.set(attr, parent.get(attr))
                                for child in parent:
                                    new_hyperlink.append(child)
                                para._element.append(new_hyperlink)
def translate_header_footer(doc, source_lang, target_lang):
    # Collect header and footer run text from every section.
    head_foot = []
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                head_foot.append(run.text)
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                head_foot.append(run.text)

    translated_head_foot = batch_translate(head_foot, source_lang, target_lang)

    # Write the translations back in the same order they were collected.
    i = 0
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                run.text = translated_head_foot[i]
                i += 1
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                run.text = translated_head_foot[i]
                i += 1
def translate_docx(file_id, source_lang='en', target_lang='fr', db_name='word'):
    # Load the source document from the "root_file" GridFS bucket.
    client = MongoClient('mongodb://localhost:27017/')
    db = client[db_name]
    fs_input = GridFS(db, collection="root_file")
    fs_output = GridFS(db, collection="final_file")
    file_data = fs_input.get(file_id).read()
    input_doc = Document(io.BytesIO(file_data))

    # Translate body paragraphs, tables, then headers and footers in place.
    translate_paragraphs(input_doc, source_lang, target_lang)
    translate_tables(input_doc, source_lang, target_lang)
    translate_header_footer(input_doc, source_lang, target_lang)

    # Save the translated document into the "final_file" GridFS bucket.
    output_stream = io.BytesIO()
    input_doc.save(output_stream)
    output_stream.seek(0)
    translated_file_id = fs_output.put(output_stream, filename=f"{target_lang}_translated.docx")
    print(f"Translation complete! Saved with file ID: {translated_file_id}")
    return translated_file_id
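
# Minimal usage sketch. The MongoDB URI and bucket names match those used above;
# the local file name "sample.docx" is an assumed example and not part of the
# original script.
if __name__ == "__main__":
    client = MongoClient('mongodb://localhost:27017/')
    fs_input = GridFS(client['word'], collection="root_file")
    # Upload a source document into the "root_file" bucket, then translate it.
    with open("sample.docx", "rb") as f:
        file_id = fs_input.put(f, filename="sample.docx")
    translate_docx(file_id, source_lang=source_language, target_lang=target_language)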