"""Translate the text of a Word (.docx) document with the Gemini API.

The module extracts translatable text nodes from body paragraphs, tables,
headers and footers, sends them to the model in JSON batches, and writes the
translations back into the same XML nodes so the document layout is kept.
"""

import ast
import json
import os
import re
import time
from io import BytesIO

import docx
import dotenv
import google.generativeai as genai
from docx import Document

dotenv.load_dotenv(".env")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Running totals that callers may inspect after a translation run.
time_spent_sleeping = 0
mismatches = 0

# Namespace prefix of WordprocessingML element tags (Clark notation).
NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

# Characters outside the Basic Multilingual Plane (emoji and friends) are
# never sent for translation. The capturing group makes ``split`` keep them.
_EMOJI_SPLIT_RE = re.compile(r'([\U00010000-\U0010FFFF])')

# Number of additional attempts after the first failed translation call.
_MAX_RETRIES = 10

# Pause between consecutive batch requests, in seconds.
_SLEEP_SECONDS = 3

# Run children that survive a round trip through ``Run.text``; a run holding
# anything else (drawing, footnote reference, field char, ...) is not merged.
_MERGE_SAFE_TAGS = frozenset(NS_W + name for name in ("rPr", "t", "tab", "cr"))

_SYSTEM_PROMPT = """Translate the string values within the following JSON object .
Follow these instructions carefully:
1. Analyze the entire JSON object to understand the context.
2. Translate *only* the string values.
3. Keep the original keys *exactly* as they are.
4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.
5. Preserve the original JSON structure perfectly.
6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
"""


def batch_translate(texts, source_lang='English', target_lang="Vietnamese"):
    """Translate multiple text segments in a single API call.

    Args:
        texts: List of strings to translate.
        source_lang: Accepted for interface symmetry; the prompt does not
            currently mention it, the model infers the source language.
        target_lang: Language named in the prompt as the translation target.

    Returns:
        ``texts`` itself when it is empty, otherwise the raw response object
        returned by ``generate_content`` (parse it with ``response_to_dict``).
    """
    if not texts:
        return texts  # Skip if empty

    # Keys are the segment positions; json.dumps turns them into "0", "1", ...
    # ensure_ascii=False sends the real characters instead of \uXXXX escapes.
    json_data = json.dumps(
        {i: t for i, t in enumerate(texts)}, ensure_ascii=False
    )
    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"

    model = genai.GenerativeModel(os.getenv("MODEL_VERSION"))
    response = model.generate_content(
        contents=_SYSTEM_PROMPT.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 0.3,  # Adjust temperature for desired creativity
            'top_p': 1,
            'top_k': 1,
        },
    )
    return response


def response_to_dict(response):
    """Parse a model response into the list of translated strings.

    Despite its name this returns a list: the values of the JSON object in
    the response, in the order the model emitted them.

    Args:
        response: Object with a ``text`` attribute holding the model output.

    Returns:
        List of strings, one per value of the returned JSON object.

    Raises:
        ValueError: If no JSON object can be located or parsed.
        SyntaxError: If the fallback Python-literal parser rejects the text.
    """
    raw = response.text
    # Cut from the first "{" to the last "}" so markdown fences, a "json"
    # language tag or surrounding quotes are dropped whatever their form.
    start = raw.find("{")
    end = raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError("No JSON object found in the model response.")
    payload = raw[start:end + 1]

    try:
        # strict=False tolerates raw control characters inside strings.
        parsed = json.loads(payload, strict=False)
    except json.JSONDecodeError:
        # Fall back for Python-style output (single quotes, integer keys).
        parsed = ast.literal_eval(payload)

    if not isinstance(parsed, dict):
        raise ValueError("The model response is not a JSON object.")

    # Later code joins and assigns these values as text, so coerce anything
    # the model turned into a number or null back into a string.
    return [
        value if isinstance(value, str)
        else "" if value is None
        else str(value)
        for value in parsed.values()
    ]


def brute_force_fix(batch, translated_batch):
    """Force ``translated_batch`` to the length of ``batch``.

    Missing entries are padded with empty strings and surplus entries are
    dropped. The input list is not modified.

    Returns:
        A list with exactly ``len(batch)`` items.
    """
    missing = len(batch) - len(translated_batch)
    if missing > 0:
        return list(translated_batch) + [""] * missing
    return list(translated_batch[:len(batch)])


def batch_translate_loop(batch, source_lang, target_lang):
    """Translate one batch, retrying until the result lines up with the input.

    The request is attempted once plus up to ``_MAX_RETRIES`` more times. An
    attempt succeeds when the response parses and has one translation per
    input segment. If no attempt lines up, the most recent parseable result
    is padded or truncated to the right length and ``mismatches`` is
    incremented.

    Args:
        batch: List of strings to translate.
        source_lang: Passed through to ``batch_translate``.
        target_lang: Passed through to ``batch_translate``.

    Returns:
        A list of translated strings with the same length as ``batch``
        (``batch`` itself when it is empty).

    Raises:
        ValueError: If no attempt produced a parseable response.
    """
    global mismatches

    if not batch:
        return batch

    translated_batch = None
    for attempt in range(_MAX_RETRIES + 1):
        if attempt:
            print(f'Translation did not line up, retrying translation time {attempt - 1}:')
        try:
            response = batch_translate(batch, source_lang, target_lang)
            translated_batch = response_to_dict(response)
        except Exception as exc:
            # API errors and unparseable output are both worth a retry;
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print(f"Translation attempt failed: {exc!r}")
            continue
        if len(translated_batch) == len(batch):
            break

    if translated_batch is None:
        raise ValueError("The translated batch is not a list.")

    if len(translated_batch) != len(batch):
        print("Length mismatch after translation. Brute Force Fixing...")
        translated_batch = brute_force_fix(batch, translated_batch)
        mismatches += 1

    print(len(batch), len(translated_batch))
    return translated_batch


def get_batches(texts, limit=2000):
    """Group ``texts`` into batches of fewer than ``limit`` words each.

    Words are counted by whitespace splitting. A single segment that alone
    reaches the limit still gets a batch of its own. Empty batches are never
    produced, so an empty input yields an empty list.

    Args:
        texts: Iterable of strings, kept in order.
        limit: Word budget per batch.

    Returns:
        List of batches, each a non-empty list of strings.
    """
    batches = []
    batch = []
    word_count = 0
    for string in texts:
        words = len(string.split())
        if batch and word_count + words >= limit:
            batches.append(batch)
            batch = []
            word_count = 0
        batch.append(string)
        word_count += words
    if batch:
        batches.append(batch)
    return batches


def full_translate(texts, source_lang='English', target_lang="Vietnamese"):
    """Translate ``texts`` batch by batch, pausing between requests.

    After every batch the function sleeps ``_SLEEP_SECONDS`` and adds that
    amount to the module-level ``time_spent_sleeping`` counter.

    Returns:
        List of translated strings, aligned one-to-one with ``texts``.
    """
    global time_spent_sleeping

    full_translated_texts = []
    for batch in get_batches(texts, limit=2000):
        full_translated_texts += batch_translate_loop(
            batch, source_lang, target_lang
        )
        time.sleep(_SLEEP_SECONDS)
        time_spent_sleeping += _SLEEP_SECONDS
    return full_translated_texts


def is_same_formatting(text1, text2):
    """Check if two runs have the same formatting.

    Compares bold, italic, underline, font size, font colour and font name.
    """
    return (
        text1.bold == text2.bold
        and text1.italic == text2.italic
        and text1.underline == text2.underline
        and text1.font.size == text2.font.size
        and text1.font.color.rgb == text2.font.color.rgb
        and text1.font.name == text2.font.name
    )


def merge_runs(runs):
    """Merge adjacent runs with the same style.

    Items that are not ``Run`` objects are passed through and act as
    separators. The text of a merged run is appended to the run before it.

    Returns:
        List of the surviving items, in order.
    """
    merged_runs = []
    for run in runs:
        previous = merged_runs[-1] if merged_runs else None
        if (
            isinstance(run, docx.text.run.Run)
            and isinstance(previous, docx.text.run.Run)
            and run.style == previous.style
            and is_same_formatting(previous, run)
        ):
            previous.text += run.text
        else:
            merged_runs.append(run)
    return merged_runs


def translate_header_footer(doc, source_lang, target_lang):
    """Translate the run text of every section header and footer in place.

    Runs whose text is empty or whitespace-only are left untouched and are
    not sent to the model.
    """
    runs = []
    for section in doc.sections:
        for part in (section.header, section.footer):
            for para in part.paragraphs:
                runs.extend(run for run in para.runs if run.text.strip())

    translated_head_foot = batch_translate_loop(
        [run.text for run in runs], source_lang, target_lang
    )
    # batch_translate_loop returns exactly one entry per run.
    for run, translated in zip(runs, translated_head_foot):
        run.text = translated


def _iter_text_nodes(container):
    """Yield the non-empty text elements inside ``container.paragraphs``.

    Only elements whose local name is exactly ``t`` are yielded. Matching on
    a bare trailing "t" would also pick up field instructions
    (``w:instrText``), deleted text (``w:delText``) and drawing offsets
    (``wp:posOffset``), which must never be translated.
    """
    for para in container.paragraphs:
        for element in para._element.iter():
            tag = element.tag
            # Comments and processing instructions have a non-string tag.
            if isinstance(tag, str) and tag.endswith('}t') and element.text:
                yield element


def _is_translatable(part):
    """Return True for a split fragment that should go to the translator."""
    if _EMOJI_SPLIT_RE.match(part):
        return False
    return len(part.strip()) != 0


def _iter_unique_cells(doc):
    """Yield each table cell of ``doc`` once, in row order.

    ``row.cells`` repeats a merged cell for every grid position it covers;
    visiting it once keeps extraction and insertion aligned.
    """
    seen = set()  # Holding the elements keeps their identity stable.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                if cell._element in seen:
                    continue
                seen.add(cell._element)
                yield cell


def get_text_elements_para(doc):
    """Collect the translatable text fragments of ``doc.paragraphs``.

    Each text node is split around emoji; emoji and whitespace-only
    fragments are skipped. ``doc`` may be any object with a ``paragraphs``
    attribute (a document or a table cell).

    Returns:
        List of strings in document order.
    """
    para_texts = []
    for element in _iter_text_nodes(doc):
        # Split the text but keep emojis as separate elements
        parts = _EMOJI_SPLIT_RE.split(element.text)
        para_texts.extend(part for part in parts if _is_translatable(part))
    return para_texts


def get_text_elements_table(doc):
    """Collect the translatable text fragments of every table cell."""
    table_texts = []
    for cell in _iter_unique_cells(doc):
        table_texts += get_text_elements_para(cell)
    return table_texts


def translate_paragraphs(doc, translated_texts, i=0):
    """Write translations back into the text nodes of ``doc.paragraphs``.

    Walks the same fragments, in the same order, as
    ``get_text_elements_para`` and replaces each with
    ``translated_texts[i]``, advancing ``i``.

    Args:
        doc: Object with a ``paragraphs`` attribute.
        translated_texts: Translations aligned with the extracted fragments.
        i: Index of the first translation to consume.

    Returns:
        Tuple ``(doc, i)`` where ``i`` is the next unused index.
    """
    for element in _iter_text_nodes(doc):
        # Split the text but keep emojis as separate elements
        parts = _EMOJI_SPLIT_RE.split(element.text)
        for j, part in enumerate(parts):
            if _is_translatable(part):
                parts[j] = translated_texts[i]
                i += 1
        element.text = "".join(parts)
    return doc, i


def translate_tables(doc, translated_texts):
    """Write translations back into every table cell of ``doc``.

    ``translated_texts`` must be aligned with ``get_text_elements_table``.

    Returns:
        The same ``doc`` object.
    """
    i = 0
    for cell in _iter_unique_cells(doc):
        _, i = translate_paragraphs(cell, translated_texts, i)
    return doc


def _is_plain_text_run(run):
    """Return True when rewriting ``run.text`` cannot lose run content.

    Assigning ``run.text`` replaces the children of the run, so only runs
    made of text, tabs and plain line breaks are safe to merge or blank.
    """
    for child in run._element:
        tag = child.tag
        if tag in _MERGE_SAFE_TAGS:
            continue
        # A break without a type (or an explicit text-wrapping break) is a
        # line break; page and column breaks must be preserved as they are.
        if tag == NS_W + "br" and child.get(NS_W + "type") in (None, "textWrapping"):
            continue
        return False
    return True


def _merge_paragraph_runs(para):
    """Fold consecutive same-format plain runs of ``para`` into the first."""
    anchor = None
    for element in para.iter_inner_content():
        if not isinstance(element, docx.text.run.Run) or not _is_plain_text_run(element):
            # A hyperlink or a run with non-text content ends the group;
            # merging across it would move text to the wrong side of it.
            anchor = None
            continue
        if anchor is not None and is_same_formatting(anchor, element):
            anchor.text += element.text
            element.text = ""
        else:
            anchor = element


def merge_elements(doc):
    """Merge adjacent runs with identical formatting throughout ``doc``.

    Word often splits one sentence into many runs; joining them gives the
    translator whole phrases. The merged text is moved into the first run of
    each group and the following runs are emptied (not removed). Body
    paragraphs and the paragraphs of top-level table cells are processed.

    Returns:
        The same ``doc`` object.
    """
    for para in doc.paragraphs:
        _merge_paragraph_runs(para)
    for cell in _iter_unique_cells(doc):
        for para in cell.paragraphs:
            _merge_paragraph_runs(para)
    return doc


def translate_docx(uploaded_file, file_name, source_lang="English", target_lang="Vietnamese"):
    """Translate a Word document and return it as an in-memory stream.

    Args:
        uploaded_file: Path or file-like object accepted by ``Document``
            (for example a Streamlit ``UploadedFile``).
        file_name: Returned unchanged alongside the stream.
        source_lang: Passed through to the translation calls.
        target_lang: Language to translate into.

    Returns:
        Tuple ``(output_stream, file_name)`` where ``output_stream`` is a
        ``BytesIO`` positioned at the start of the translated document.
    """
    doc = Document(uploaded_file)
    doc = merge_elements(doc)

    print('Translating paragraphs.')
    para_texts = get_text_elements_para(doc)
    translated_para = full_translate(para_texts, source_lang=source_lang, target_lang=target_lang)
    print('Done translating paragraphs.')

    print('Translating tables.')
    table_texts = get_text_elements_table(doc)
    translated_tables = full_translate(table_texts, source_lang=source_lang, target_lang=target_lang)
    print('Done translating tables.')

    print('Inserting paragraphs.')
    doc, _ = translate_paragraphs(doc, translated_para)
    print('Inserting tables.')
    doc = translate_tables(doc, translated_tables)

    translate_header_footer(doc, source_lang, target_lang)
    print('Done translating headers & footers.')

    output_stream = BytesIO()
    doc.save(output_stream)
    output_stream.seek(0)
    return output_stream, file_name