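"""Translate .docx files with the Gemini API while preserving formatting.

Merges same-style runs, extracts text from paragraphs, tables, headers, and
footers, batch-translates it as a JSON payload, and writes the results back
into the document in place.
"""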
import os
import json
import re
import time
from io import BytesIO

import docx
from docx import Document
import google.generativeai as genai
import dotenv

# Expects a .env file defining GEMINI_API_KEY and MODEL_VERSION.
dotenv.load_dotenv(".env")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

time_spent_sleeping = 0
mismatches = 0
def batch_translate(texts, source_lang="English", target_lang="Vietnamese"):
    """Translates multiple text segments in a single API call."""
    if not texts:
        return texts  # Skip if empty
    system_prompt = """Translate the string values within the following JSON object.
Follow these instructions carefully:
1. Analyze the entire JSON object to understand the context.
2. Translate *only* the string values.
3. Keep the original keys *exactly* as they are.
4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.
5. Preserve the original JSON structure perfectly.
6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
"""
    json_data = json.dumps({i: t for i, t in enumerate(texts)})
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON object: {json_data}"
    model = genai.GenerativeModel(os.getenv("MODEL_VERSION"))
    response = model.generate_content(
        contents=system_prompt.strip() + "\n" + user_prompt.strip(),
        generation_config={
            'temperature': 0.3,  # Low temperature for more literal, consistent translations
            'top_p': 1,
            'top_k': 1,
        })
    return response
def response_to_dict(response):
    """Parses the model's JSON reply into a list of translated strings."""
    text = response.text.strip()
    # Remove a Markdown code fence if the model added one despite the prompt.
    text = re.sub(r'^```(?:json)?\s*|\s*```$', '', text).strip()
    return list(json.loads(text).values())
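# For example, a response whose .text is '{"0": "Xin chào", "1": "Tạm biệt"}'
# (hypothetical values) yields ["Xin chào", "Tạm biệt"].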
def brute_force_fix(batch, translated_batch):
    """Pads or truncates the translated batch so its length matches the input batch."""
    if len(batch) > len(translated_batch):
        translated_batch += [""] * (len(batch) - len(translated_batch))
    elif len(batch) < len(translated_batch):
        translated_batch = translated_batch[:len(batch)]
    return translated_batch
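# For example, brute_force_fix(["a", "b"], ["x"]) returns ["x", ""].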
def batch_translate_loop(batch, source_lang, target_lang):
    """Translates one batch, retrying up to 10 times on parse or length failures."""
    if not batch:
        return batch
    translated_batch = None
    for attempt in range(10):
        try:
            translated_batch_response = batch_translate(batch, source_lang, target_lang)
            translated_batch = response_to_dict(translated_batch_response)
            assert len(translated_batch) == len(batch)
            break
        except Exception:
            print(f"Translation attempt {attempt + 1} failed, retrying...")
    if not isinstance(translated_batch, list):
        raise ValueError("The translated batch is not a list.")
    if len(translated_batch) != len(batch):
        print("Length mismatch after translation. Brute-force fixing...")
        translated_batch = brute_force_fix(batch, translated_batch)
        global mismatches
        mismatches += 1
    print(len(batch), len(translated_batch))  # Debug: input vs. output counts
    return translated_batch
def get_batches(texts, limit=2000):
    """Splits texts into batches of at most roughly `limit` words each."""
    batches = []
    batch = []
    word_count = 0
    for string in texts:
        if len(string.split()) + word_count >= limit:
            batches.append(batch)
            batch = []
            word_count = 0
        batch.append(string)
        word_count += len(string.split())  # Count words, not characters
    if batch:
        batches.append(batch)
    return batches
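# For example (hypothetical strings), with limit=4:
#   get_batches(["one two", "three four five"], limit=4)
# returns [["one two"], ["three four five"]]: the second string would push
# the running word count to 5, so it starts a new batch.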
def full_translate(texts, source_lang="English", target_lang="Vietnamese"):
    """Translates all texts batch by batch, pausing between API calls."""
    full_translated_texts = []
    batches = get_batches(texts, limit=2000)
    global time_spent_sleeping
    for batch in batches:
        translated_batch = batch_translate_loop(batch, source_lang, target_lang)
        full_translated_texts += translated_batch
        time.sleep(3)  # Crude rate limiting between API calls
        time_spent_sleeping += 3
    return full_translated_texts
def merge_runs(runs):
    """ Merges adjacent runs with the same style. """
    merged_runs = []
    for run in runs:
        if (merged_runs and isinstance(run, docx.text.run.Run) and isinstance(merged_runs[-1], docx.text.run.Run) and
                run.style == merged_runs[-1].style and
                merged_runs[-1].bold == run.bold and
                merged_runs[-1].italic == run.italic and
                merged_runs[-1].underline == run.underline and
                merged_runs[-1].font.size == run.font.size and
                merged_runs[-1].font.color.rgb == run.font.color.rgb and
                merged_runs[-1].font.name == run.font.name):
            merged_runs[-1].text += run.text
        else:
            merged_runs.append(run)
    return merged_runs
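# Note: translate_docx below uses merge_elements rather than merge_runs;
# this list-based variant is kept as an alternative merging helper.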
# WordprocessingML namespace prefix for matching raw XML tags like w:t.
NS_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
def translate_header_footer(doc, source_lang, target_lang):
    """Translates every header and footer run across the document's sections in place."""
    head_foot = []
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                head_foot.append(run.text)
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                head_foot.append(run.text)
    translated_head_foot = batch_translate_loop(head_foot, source_lang, target_lang)
    i = 0
    for section in doc.sections:
        for header in section.header.paragraphs:
            for run in header.runs:
                run.text = translated_head_foot[i]
                i += 1
        for footer in section.footer.paragraphs:
            for run in footer.runs:
                run.text = translated_head_foot[i]
                i += 1
def get_text_elements_para(doc):
    """Collects translatable text parts from every w:t element in the paragraphs."""
    para_texts = []
    emoji_pattern = r'[\U00010000-\U0010FFFF]'
    for para in doc.paragraphs:
        for element in para._element.iter():
            # Match w:t exactly; endswith('t') would also catch w:instrText etc.
            if element.tag == NS_W + 't' and element.text:
                # Split the text but keep emojis as separate elements
                parts = re.split(f'({emoji_pattern})', element.text)
                for part in parts:
                    if re.match(emoji_pattern, part):
                        continue
                    if len(part.strip()) != 0:
                        para_texts.append(part)
    return para_texts
def get_text_elements_table(doc):
    """Collects translatable text parts from every cell of every table."""
    table_texts = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                table_texts += get_text_elements_para(cell)
    return table_texts
def translate_paragraphs(doc, translated_texts, i=0):
    """Writes translated texts back into the w:t elements, leaving emojis in place."""
    emoji_pattern = r'[\U00010000-\U0010FFFF]'
    for para in doc.paragraphs:
        for element in para._element.iter():
            if element.tag == NS_W + 't' and element.text:
                # Split the text but keep emojis as separate elements
                parts = re.split(f'({emoji_pattern})', element.text)
                for j in range(len(parts)):
                    if re.match(emoji_pattern, parts[j]):
                        continue
                    if len(parts[j].strip()) != 0:
                        parts[j] = translated_texts[i]
                        i += 1
                element.text = "".join(parts)
    return doc, i
def translate_tables(doc, translated_texts):
    i = 0
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _, i = translate_paragraphs(cell, translated_texts, i)
    return doc
def is_same_formatting(run1, run2):
    """
    Check if two runs have the same formatting.
    """
    return (run1.bold == run2.bold
            and run1.italic == run2.italic
            and run1.underline == run2.underline
            and run1.font.size == run2.font.size
            and run1.font.color.rgb == run2.font.color.rgb
            and run1.font.name == run2.font.name)
def merge_elements(doc):
    """Merges adjacent runs with identical formatting so text is not split mid-sentence."""
    def merge_paragraph_runs(para):
        current_run = []
        for element in para.iter_inner_content():
            if isinstance(element, docx.text.run.Run):
                if current_run == []:
                    current_run = [element]
                elif is_same_formatting(current_run[0], element):
                    # Fold this run's text into the first run of the group and empty it.
                    current_run[0].text += element.text
                    element.text = ""
                else:
                    current_run = [element]
    for para in doc.paragraphs:
        merge_paragraph_runs(para)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    merge_paragraph_runs(para)
    return doc
def translate_docx(uploaded_file, file_name, source_lang="English", target_lang="Vietnamese"):
    """
    Translates a Word document passed as a Streamlit UploadedFile and returns a BytesIO object.
    """
    doc = Document(uploaded_file)
    doc = merge_elements(doc)
    print('Translating paragraphs.')
    para_texts = get_text_elements_para(doc)
    translated_para = full_translate(para_texts, source_lang=source_lang, target_lang=target_lang)
    print('Done translating paragraphs.')
    print('Translating tables.')
    table_texts = get_text_elements_table(doc)
    translated_tables = full_translate(table_texts, source_lang=source_lang, target_lang=target_lang)
    print('Done translating tables.')
    print('Inserting paragraphs.')
    doc, _ = translate_paragraphs(doc, translated_para)
    print('Inserting tables.')
    doc = translate_tables(doc, translated_tables)
    print('Translating headers & footers.')
    translate_header_footer(doc, source_lang, target_lang)
    print('Done translating headers & footers.')
    output_stream = BytesIO()
    doc.save(output_stream)
    output_stream.seek(0)
    return output_stream, file_name
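# Minimal usage sketch (hypothetical paths; assumes GEMINI_API_KEY and
# MODEL_VERSION are set in .env). Outside Streamlit, any file-like object
# or path accepted by docx.Document works in place of an UploadedFile.
if __name__ == "__main__":
    with open("input.docx", "rb") as f:
        stream, name = translate_docx(f, "input_translated.docx",
                                      source_lang="English",
                                      target_lang="Vietnamese")
    with open(name, "wb") as out:
        out.write(stream.getvalue())
    print(f"Saved {name}; slept {time_spent_sleeping}s; {mismatches} length mismatches.")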