Spaces:
Running
Running
import os | |
import zipfile | |
import google.generativeai as genai | |
import tempfile | |
import io | |
import json | |
import time | |
from google.api_core.exceptions import ResourceExhausted | |
import re | |
# The Gemini API key must never be committed to source control: it is read
# from the GOOGLE_API_KEY environment variable at import time instead.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
def unzip_office_file(pptx_file: io.BytesIO) -> str:
    """Extract a PPTX archive into a freshly created temporary directory.

    Args:
        pptx_file: The PPTX file contents as an in-memory ``BytesIO``; it is
            opened with ``zipfile.ZipFile``.

    Returns:
        Path of the new temporary directory (name prefixed with
        ``pptx_extract_``) that holds the extracted archive contents.
        The directory is not removed by this function on success.

    Raises:
        zipfile.BadZipFile: If ``pptx_file`` is not a valid ZIP archive.
    """
    import shutil  # Local import: only needed to clean up after a failure.

    # Temporary directory that receives the extracted content.
    output_dir = tempfile.mkdtemp(prefix="pptx_extract_")
    try:
        with zipfile.ZipFile(pptx_file, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    except Exception:
        # Do not leave an empty or half-populated directory behind when the
        # archive is invalid or extraction fails midway.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise
    return output_dir
def translate_single_text(text: str, source_lang: str = 'English', target_lang: str = "Vietnamese",
                          max_retries: int = 5, base_delay: float = 5.0) -> str:
    """Translate one string with the Gemini model, retrying on quota errors.

    Args:
        text: Text to translate. Empty or whitespace-only input is returned
            as ``""`` without calling the API.
        source_lang: Name of the source language, inserted into the prompt.
        target_lang: Name of the target language, inserted into the prompt.
        max_retries: Number of retries after the first attempt, so at most
            ``max_retries + 1`` requests are made.
        base_delay: Base back-off in seconds; the wait before retry ``n``
            (0-based) is ``base_delay * 2 ** n``.

    Returns:
        The stripped translated text, or ``""`` when the input is blank, the
        response carries no content parts, an unexpected error occurs, or
        every attempt hit ``ResourceExhausted``.
    """
    if not text or not text.strip():
        return ""  # Skip empty or whitespace-only strings.

    # The prompt is identical for every attempt, so build it once.
    system_prompt = (
        "You are a translation engine.\n"
        f"Translate the following text accurately from {source_lang} to {target_lang}.\n"
        "Provide *only* the translated text as a single string.\n"
        "Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."
    )
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
    full_prompt = system_prompt.strip() + "\n\n" + user_prompt.strip()

    for attempt in range(max_retries + 1):
        try:
            # Created inside the try so that a construction failure is
            # reported and swallowed like any other unexpected error.
            model = genai.GenerativeModel('gemini-2.0-flash')  # or 'gemini-1.5-flash'
            response = model.generate_content(
                contents=full_prompt,
                generation_config={
                    'temperature': 0.2,
                    'top_p': 1.0,
                    'top_k': 1,
                }
            )
            if response.candidates and response.candidates[0].content.parts:
                parts = response.candidates[0].content.parts
                return "".join(part.text for part in parts if hasattr(part, 'text')).strip()
            print(f"[!] Không nhận được nội dung hợp lệ từ API cho văn bản: '{text[:50]}...'")
            return ""
        except ResourceExhausted:
            if attempt == max_retries:
                # No attempt is left: give up now instead of sleeping for
                # the longest back-off interval and then returning anyway.
                break
            wait_time = base_delay * (2 ** attempt)
            print(f"[429] Quota exceeded khi dịch '{text[:50]}...'. Thử lại sau {wait_time:.1f}s (lần {attempt + 1}/{max_retries + 1}).")
            time.sleep(wait_time)
        except Exception as e:
            # Best effort: any other failure yields an empty translation.
            print(f"[!] Lỗi không mong muốn khi dịch '{text[:50]}...': {e}")
            return ""

    print(f"[x] Bỏ qua sau {max_retries + 1} lần thử không thành công cho '{text[:50]}...'.")
    return ""
def preprocess_text(text_list):
    """Map every element of a list to its position.

    Returns ``{index: element}`` with integer indices starting at 0 for a
    non-empty ``list``. An empty list, or any argument that is not a
    ``list``, yields an empty dict.
    """
    if isinstance(text_list, list) and text_list:
        return dict(enumerate(text_list))
    return {}
def translate_text(text_dict, source_lang='English', target_lang="Vietnamese", max_retries=5, base_delay: float = 5.0):
    """Translate all values of a dict with a single Gemini request.

    The dict is serialised to JSON (keys become strings), sent to the model
    with instructions to translate only the string values, and the JSON
    reply is parsed back with its keys converted to ``int``.

    Args:
        text_dict: Mapping of index to text. Reply keys are converted with
            ``int()``, so only integer keys can be matched on the way back.
        source_lang: Name of the source language, inserted into the prompt.
        target_lang: Name of the target language, inserted into the prompt.
        max_retries: Maximum number of requests; ``0`` makes no request.
        base_delay: Base back-off in seconds; the wait after failed attempt
            ``n`` (1-based) is ``base_delay * 2 ** (n - 1)``.

    Returns:
        A dict with exactly the keys of ``text_dict``. Keys missing from the
        reply map to ``""``. ``{}`` is returned when ``text_dict`` is empty
        or not a dict.
    """
    def _dict_to_json_string(d):
        """Serialise ``d`` as compact JSON with string keys; ``"{}"`` on error."""
        json_compatible = {str(k): v for k, v in d.items()}
        try:
            return json.dumps(json_compatible, ensure_ascii=False, separators=(',', ':'))
        except Exception as e:
            print(f"Internal Error (_dict_to_json_string): {e}")
            return "{}"

    def _json_string_to_dict(s):
        """Parse a JSON object and return it with integer keys.

        Non-integer keys are reported and dropped; anything that is not a
        JSON object yields ``{}``.
        """
        res_dict = {}
        if not s or not isinstance(s, str):
            return {}
        try:
            raw = json.loads(s)
            if not isinstance(raw, dict):
                print(f"LLM response is not a JSON object: {s}")
                return {}
            for k_str, v in raw.items():
                try:
                    res_dict[int(k_str)] = v
                except ValueError:
                    print(f"Non-integer key '{k_str}' in LLM response.")
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
        except Exception as e:
            print(f"General error: {e}")
        return res_dict

    if not isinstance(text_dict, dict):
        print("translate_text_dict expected a dict, got:", type(text_dict))
        return {}
    if not text_dict:
        return {}

    json_input_string = _dict_to_json_string(text_dict)
    if json_input_string == "{}":
        # text_dict is non-empty here, so "{}" means serialisation failed.
        print("Empty or invalid dictionary input.")
        return {key: "" for key in text_dict}

    system_prompt = (
        "Translate the string values within the following JSON object .\n"
        "Follow these instructions carefully:\n"
        "1. Analyze the entire JSON object to understand the context.\n"
        "2. Translate *only* the string values.\n"
        "3. Keep the original keys *exactly* as they are.\n"
        "4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.\n"
        "5. Preserve the original JSON structure perfectly.\n"
        "6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.\n"
    )
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string}\n\nTranslated JSON Output:"
    # The prompt is the same for every attempt, so build it once.
    full_prompt = f"{system_prompt.strip()}\n\n{user_prompt.strip()}"

    raw_translated_json_string = "{}"
    retry_count = 0
    while retry_count < max_retries:
        # Start each attempt from the neutral value so that text left over
        # from a failed attempt cannot leak into this one.
        raw_translated_json_string = "{}"
        try:
            model = genai.GenerativeModel('gemini-2.0-flash')
            response = model.generate_content(
                contents=full_prompt,
                generation_config={
                    'temperature': 0.3,
                    'top_p': 1,
                    'top_k': 1,
                }
            )
            if response and response.parts and hasattr(response.parts[0], 'text'):
                raw_translated_json_string = response.parts[0].text.strip()
            elif hasattr(response, 'text'):
                raw_translated_json_string = response.text.strip()
            # Clean markdown wrappers if present
            raw_translated_json_string = re.sub(r"^```(?:json)?|```$", "", raw_translated_json_string).strip()
            if not raw_translated_json_string:
                # An empty reply is a failed attempt. Raising sends it
                # through the same counting/back-off path as API errors;
                # previously it was neither counted nor delayed, so the
                # loop could spin forever.
                raise ValueError("empty response from API")
            break  # Success, exit retry loop
        except Exception as e:
            retry_count += 1
            if retry_count < max_retries:
                wait_time = base_delay * (2 ** (retry_count - 1))
                print(f"[Retry {retry_count}] Lỗi gọi API: {e}. Thử lại sau {wait_time:.2f} giây.")
                time.sleep(wait_time)
            else:
                # Last attempt failed: report it, but do not sleep before
                # giving up.
                print(f"[Retry {retry_count}] Lỗi gọi API: {e}.")

    if retry_count == max_retries:
        print("❌ Hết số lần thử lại. Trả về JSON rỗng.")
        raw_translated_json_string = "{}"

    print(raw_translated_json_string)
    translated_intermediate_dict = _json_string_to_dict(raw_translated_json_string)

    # Return exactly the requested keys; anything the model dropped becomes "".
    final_translated_dict = {key: translated_intermediate_dict.get(key, "") for key in text_dict}
    missing_keys = [key for key in text_dict if key not in translated_intermediate_dict]
    if missing_keys:
        print(f"Cảnh báo: Thiếu keys: {sorted(missing_keys)}.")
    extra_keys = set(translated_intermediate_dict.keys()) - set(text_dict.keys())
    if extra_keys:
        print(f"Cảnh báo: Có keys không mong đợi: {sorted(extra_keys)}.")
    return final_translated_dict
# Function 3: Dictionary -> List | |
def postprocess_text(translated_dict):
    """Convert an ``{index: text}`` dict back into a list ordered by index.

    Each value is placed at the list position given by its key, so the
    result has ``max_index + 1`` elements and any index without an entry is
    filled with ``""``. Keys are converted with ``int()``; keys that cannot
    be converted are reported and skipped, and negative indices are reported
    and ignored. If two keys convert to the same index, the entry that comes
    later in the dict wins.

    Returns:
        The list of texts, or ``[]`` when the argument is not a dict, is
        empty, has no usable key, or an unexpected error occurs.
    """
    if not isinstance(translated_dict, dict):
        print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
        return []
    if not translated_dict:
        return []

    try:
        indexed_items = []
        for k, v in translated_dict.items():
            try:
                indexed_items.append((int(k), v))
            except (ValueError, TypeError):
                print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")
        if not indexed_items:
            print("Warning: No sortable items found in dictionary for postprocessing.")
            return []

        expected_length = max(index for index, _ in indexed_items) + 1
        # Always place by index. Comparing only the item count against the
        # maximum index misses negative or duplicate indices and used to
        # return values at the wrong positions.
        result_list = [""] * expected_length
        filled = set()
        for index, text in indexed_items:
            if 0 <= index < expected_length:
                result_list[index] = text
                filled.add(index)
            else:
                print(f"Warning: postprocess ignoring out-of-range index {index}.")
        if len(filled) != expected_length:
            print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(filled)}.")
        return result_list
    except Exception as e:
        # Best effort: callers get an empty list rather than an exception.
        print(f"Error during postprocessing sorting/list creation: {e}")
        return []  # Return empty list on error