import os
import docx
from docx import Document
import google.generativeai as genai
import ast
import json
import re
import time
import dotenv
from io import BytesIO

dotenv.load_dotenv(".env")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

time_spent_sleeping = 0
mismatches = 0

def batch_translate(texts, source_lang='English', target_lang="Vietnamese"):
    """Translates multiple text segments in a single API call."""
    if not texts:
        return texts  # Skip if empty
    system_prompt = """Translate the string values within the following JSON object.
Follow these instructions carefully:
1. Analyze the entire JSON object to understand the context.
2. Translate *only* the string values.
3. Keep the original keys *exactly* as they are.
4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.
5. Preserve the original JSON structure perfectly.
6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
"""
    json_data = json.dumps({i: t for i, t in enumerate(texts)})
    user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"
    model = genai.GenerativeModel(os.getenv("MODEL_VERSION"))
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 0.3,  # low temperature for consistent, literal translations
            'top_p': 1,
            'top_k': 1,
        },
    )
    return response

def response_to_dict(response):
    # Strip any markdown code fence the model adds despite the instructions,
    # then parse the JSON-like payload into a list of translated strings.
    raw = re.sub(r'^```(?:json)?\s*|\s*```$', '', response.text.strip()).strip().strip('"')
    return list(ast.literal_eval(raw).values())
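
# A minimal sketch of the expected round trip (the reply text below is
# hypothetical -- real model output will vary):
#   response.text == '{"0": "Xin chào", "1": "Chào buổi sáng"}'
#   response_to_dict(response) -> ["Xin chào", "Chào buổi sáng"]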

def brute_force_fix(batch, translated_batch):
    # Last-resort alignment: pad a short translation list with empty strings,
    # or truncate an over-long one, so lengths always match the source batch.
    if len(batch) > len(translated_batch):
        translated_batch += [""] * (len(batch) - len(translated_batch))
    elif len(batch) < len(translated_batch):
        translated_batch = translated_batch[:len(batch)]
    return translated_batch
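
# Example of the fallback (hypothetical values):
#   brute_force_fix(["a", "b", "c"], ["x"])  -> ["x", "", ""]
#   brute_force_fix(["a"], ["x", "y"])       -> ["x"]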

def batch_translate_loop(batch, source_lang, target_lang):
    if not batch:
        return batch
    translated_batch = None  # guards against NameError if every attempt fails
    translated_batch_response = batch_translate(batch, source_lang, target_lang)
    try:
        translated_batch = response_to_dict(translated_batch_response)
        assert len(translated_batch) == len(batch)
    except Exception:
        for i in range(10):
            print(f"Malformed or mismatched response, retrying translation (attempt {i + 1}/10)...")
            try:
                translated_batch_response = batch_translate(batch, source_lang, target_lang)
                translated_batch = response_to_dict(translated_batch_response)
                assert len(translated_batch) == len(batch)
                break
            except Exception:
                pass
    if not isinstance(translated_batch, list):
        raise ValueError("The translated batch is not a list.")
    if len(translated_batch) != len(batch):
        print("Length mismatch after translation. Brute-force fixing...")
        translated_batch = brute_force_fix(batch, translated_batch)
        global mismatches
        mismatches += 1
    print(len(batch), len(translated_batch))
    return translated_batch

def get_batches(texts, limit=2000):
    """Groups strings into batches of at most `limit` words each."""
    batches = []
    batch = []
    word_count = 0
    for string in texts:
        if len(string.split()) + word_count >= limit:
            batches.append(batch)
            batch = []
            word_count = 0
        batch.append(string)
        word_count += len(string.split())  # count words, not characters
    batches.append(batch)
    return batches
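
# Batching example (hypothetical inputs): a new batch starts once adding the
# next string would reach the word limit:
#   get_batches(["a b c", "d e", "f"], limit=4)  -> [["a b c"], ["d e", "f"]]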

def full_translate(texts, source_lang='English', target_lang="Vietnamese"):
    full_translated_texts = []
    batches = get_batches(texts, limit=2000)
    global time_spent_sleeping
    for batch in batches:
        translated_batch = batch_translate_loop(batch, source_lang, target_lang)
        full_translated_texts += translated_batch
        time.sleep(3)  # crude rate limiting between API calls
        time_spent_sleeping += 3
    return full_translated_texts

def merge_runs(runs):
    """Merges adjacent runs with the same style."""
    merged_runs = []
    for run in runs:
        if (merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run) and
                run.style == merged_runs[-1].style and
                merged_runs[-1].bold == run.bold and
                merged_runs[-1].italic == run.italic and
                merged_runs[-1].underline == run.underline and
                merged_runs[-1].font.size == run.font.size and
                merged_runs[-1].font.color.rgb == run.font.color.rgb and
                merged_runs[-1].font.name == run.font.name):
            merged_runs[-1].text += run.text
        else:
            merged_runs.append(run)
    return merged_runs

NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

def translate_header_footer(doc, source_lang, target_lang):
    head_foot = []
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                head_foot.append(run.text)
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                head_foot.append(run.text)
    translated_head_foot = batch_translate_loop(head_foot, source_lang, target_lang)
    i = 0
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                run.text = translated_head_foot[i]
                i += 1
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                run.text = translated_head_foot[i]
                i += 1

def get_text_elements_para(doc):
    para_texts = []
    emoji_pattern = r'[\U00010000-\U0010FFFF]'
    for para in doc.paragraphs:
        for element in para._element.iter():
            if element.tag == NS_W + 't':  # w:t elements hold the literal text
                if element.text:
                    # Split the text but keep emojis as separate elements
                    parts = re.split(f'({emoji_pattern})', element.text)
                    for part in parts:
                        if re.match(emoji_pattern, part):
                            continue
                        if len(part.strip()) != 0:
                            para_texts.append(part)
    return para_texts

def get_text_elements_table(doc):
    table_texts = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                table_texts += get_text_elements_para(cell)
    return table_texts

def translate_paragraphs(doc, translated_texts, i=0):
    emoji_pattern = r'[\U00010000-\U0010FFFF]'
    for para in doc.paragraphs:
        for element in para._element.iter():
            if element.tag == NS_W + 't':
                if element.text:
                    # Split the text but keep emojis as separate elements
                    parts = re.split(f'({emoji_pattern})', element.text)
                    for j in range(len(parts)):
                        if re.match(emoji_pattern, parts[j]):
                            continue
                        if len(parts[j].strip()) != 0:
                            parts[j] = translated_texts[i]
                            i += 1
                    element.text = "".join(parts)
    return doc, i

def translate_tables(doc, translated_texts):
    i = 0
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _, i = translate_paragraphs(cell, translated_texts, i)
    return doc

def is_same_formatting(run1, run2):
    """
    Check if two runs have the same character formatting.
    """
    return (run1.bold == run2.bold
            and run1.italic == run2.italic
            and run1.underline == run2.underline
            and run1.font.size == run2.font.size
            and run1.font.color.rgb == run2.font.color.rgb
            and run1.font.name == run2.font.name)

def _merge_para_runs(para):
    # Coalesce consecutive identically formatted runs so each stretch of text
    # lives in a single run (and a single w:t element) before translation.
    current_run = []
    for element in para.iter_inner_content():
        if isinstance(element, docx.text.run.Run):
            if current_run == []:
                current_run = [element]
            elif is_same_formatting(current_run[0], element):
                current_run[0].text += element.text
                element.text = ""
            else:
                current_run = [element]

def merge_elements(doc):
    for para in doc.paragraphs:
        _merge_para_runs(para)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    _merge_para_runs(para)
    return doc

def translate_docx(uploaded_file, file_name, source_lang="English", target_lang="Vietnamese"):
    """
    Translates a Word document passed as a Streamlit UploadedFile and returns a BytesIO object.
    """
    doc = Document(uploaded_file)
    doc = merge_elements(doc)
    print('Translating paragraphs.')
    para_texts = get_text_elements_para(doc)
    translated_para = full_translate(para_texts, source_lang=source_lang, target_lang=target_lang)
    print('Done translating paragraphs.')
    print('Translating tables.')
    table_texts = get_text_elements_table(doc)
    translated_tables = full_translate(table_texts, source_lang=source_lang, target_lang=target_lang)
    print('Done translating tables.')
    print('Inserting paragraphs.')
    doc, _ = translate_paragraphs(doc, translated_para)
    print('Inserting tables.')
    doc = translate_tables(doc, translated_tables)
    print('Translating headers & footers.')
    translate_header_footer(doc, source_lang, target_lang)
    print('Done translating headers & footers.')
    output_stream = BytesIO()
    doc.save(output_stream)
    output_stream.seek(0)
    return output_stream, file_name
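
# A minimal local usage sketch (file names here are hypothetical; in the Space
# this function receives a Streamlit UploadedFile rather than a file handle):
if __name__ == "__main__":
    with open("input.docx", "rb") as f:
        stream, name = translate_docx(f, "input_translated.docx")
    with open(name, "wb") as out:
        out.write(stream.getbuffer())
    print(f"Slept {time_spent_sleeping}s between calls; {mismatches} length mismatch(es) patched.")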