"""Helpers for unpacking PPTX archives and translating text with the Gemini API."""
import io
import json
import os
import shutil
import tempfile
import zipfile

import google.generativeai as genai
# The API key is read from the environment so that no credential lives in the
# source file. The key that used to be embedded here has been exposed and
# should be revoked.
# NOTE(review): when GOOGLE_API_KEY is unset this passes None; the SDK is
# expected to reject later requests with an authentication error - confirm
# against the google-generativeai documentation.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
def unzip_office_file(pptx_file: io.BytesIO) -> str:
    """Extract a PPTX (ZIP) archive into a fresh temporary directory.

    Args:
        pptx_file: In-memory PPTX content. Anything ``zipfile.ZipFile``
            accepts as its first argument works.

    Returns:
        Path of the newly created temporary directory holding the extracted
        archive members. The directory is not removed automatically; the
        caller owns it.

    Raises:
        zipfile.BadZipFile: If the data is not a valid ZIP archive. The
            temporary directory is removed before the error propagates.
    """
    # Temporary directory that receives the extracted content.
    output_dir = tempfile.mkdtemp(prefix="pptx_extract_")
    try:
        with zipfile.ZipFile(pptx_file, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    except Exception:
        # Do not leave an empty or half-populated directory behind on failure.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise
    return output_dir
def translate_single_text(text, source_lang='English', target_lang="Vietnamese"):
    """Translate a single piece of text with the Gemini model.

    Args:
        text: Text to translate. Falsy values (``""``, ``None``) are not sent
            to the model.
        source_lang: Name of the language ``text`` is written in.
        target_lang: Name of the language to translate into.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``""`` when
        ``text`` is falsy or when anything goes wrong while calling the API
        (the error is printed, not raised).
    """
    if not text:
        return ""  # Nothing to translate.

    try:
        model = genai.GenerativeModel('gemini-2.0-flash')

        # Minimal prompt: ask for the translation only, with no decoration.
        system_prompt_simple = (
            "You are a translation engine.\n"
            f"Translate the following text accurately from {source_lang} to {target_lang}.\n"
            "Provide *only* the translated text as a single string.\n"
            "Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."
        )
        user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
        full_prompt = system_prompt_simple.strip() + "\n\n" + user_prompt.strip()

        response = model.generate_content(
            contents=full_prompt,
            generation_config={
                # NOTE(review): with top_k=1 the temperature presumably has
                # little effect on the output - confirm against the SDK docs.
                'temperature': 0.7,
                'top_p': 1,
                'top_k': 1,
            },
        )
        return response.text.strip()
    except Exception as e:
        # Deliberate best effort: a failed translation yields an empty string
        # instead of aborting the caller.
        print(f"Lỗi trong quá trình dịch (translate_single_text): {e}")
        return ""
def preprocess_text(text_list):
    """Map each element of a list to its position.

    Args:
        text_list: List of strings (elements are stored as-is, whatever
            their type).

    Returns:
        ``{index: element}`` with 0-based integer keys in list order, or an
        empty dict when ``text_list`` is not a ``list`` or is empty.
    """
    if not isinstance(text_list, list):
        return {}
    # An empty list naturally produces an empty dict.
    return dict(enumerate(text_list))
# Instruction block placed ahead of every batch translation request.
_JSON_TRANSLATION_INSTRUCTIONS = (
    "Translate the string values within the following JSON object .\n"
    "Follow these instructions carefully:\n"
    "1. Analyze the entire JSON object to understand the context.\n"
    "2. Translate *only* the string values.\n"
    "3. Keep the original keys *exactly* as they are.\n"
    "4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.\n"
    "5. Preserve the original JSON structure perfectly.\n"
    "6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```."
)


def _dict_to_json_string(d):
    """Serialize ``d`` as compact JSON with stringified keys; "{}" on failure."""
    json_compatible = {str(k): v for k, v in d.items()}
    try:
        return json.dumps(json_compatible, ensure_ascii=False, separators=(',', ':'))
    except (TypeError, ValueError) as e:
        print(f"Internal Error (_dict_to_json_string): {e}")
        return "{}"


def _json_string_to_dict(s):
    """Decode the model's reply into a dict with string keys; {} if unusable."""
    if not s or not isinstance(s, str):
        return {}
    try:
        raw = json.loads(s)
    except (json.JSONDecodeError, RecursionError) as e:
        print(f"Internal Error (_json_string_to_dict): Failed decoding JSON '{s}'. Error: {e}")
        return {}
    if not isinstance(raw, dict):
        print(f"Internal Warning (_json_string_to_dict): LLM response is not a JSON object: {s}")
        return {}
    return raw


def _response_text(response):
    """Return the text of a generate_content response, or "" when it has none."""
    parts = response.parts if response else None
    if parts:
        texts = [part.text for part in parts if hasattr(part, 'text')]
        if texts:
            # Join every text part: reading only the first one would cut a
            # multi-part reply short and leave invalid JSON.
            return "".join(t for t in texts if t).strip()
        print(f"Warning: Received response part without text attribute: {parts[0]}")
        return str(parts[0])
    if response and hasattr(response, 'text'):
        return response.text.strip()
    print(f"Warning: Received unexpected or empty response format from API: {response}")
    return ""


def _strip_json_fence(raw):
    """Drop a Markdown code fence and any chatter surrounding the JSON object."""
    cleaned = raw.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:]
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()
    if cleaned and not cleaned.startswith("{"):
        # The model added text around the object; keep the outermost {...}.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if 0 <= start < end:
            cleaned = cleaned[start:end + 1]
    return cleaned


def _canonical_label(label, known_labels):
    """Match a reply key to one of the request's key labels, or return None."""
    if label in known_labels:
        return label
    try:
        # Tolerate harmless reformatting of numeric keys (" 1", "01").
        normalized = str(int(label))
    except ValueError:
        return None
    return normalized if normalized in known_labels else None


def translate_text(text_dict, source_lang='English', target_lang="Vietnamese"):
    """Translate the values of a dictionary in one model request.

    The dictionary is sent to the model as a JSON object and the reply is
    parsed back, so every text keeps its association with its key.

    Args:
        text_dict: ``{key: text}``. Keys are normally the integer indices
            produced by ``preprocess_text``; any key with a distinct
            ``str()`` form is supported.
        source_lang: Name of the language the texts are written in.
        target_lang: Name of the language to translate into.

    Returns:
        A dict with exactly the keys of ``text_dict`` in the same order. A
        key the model did not return, or every key when the request or
        parsing failed, maps to ``""``. Returns ``{}`` when ``text_dict`` is
        empty or not a dict. Errors are printed, never raised.
    """
    if not isinstance(text_dict, dict):
        print("Warning: translate_text_dict expected a dict, received:", type(text_dict))
        return {}
    if not text_dict:
        return {}

    # 1. Serialize the input for the prompt.
    json_input_string = _dict_to_json_string(text_dict)
    print(f"Input JSON String: {json_input_string}")  # Debugging output
    if json_input_string == "{}":
        print("Skipping translation due to empty input dictionary or conversion error.")
        return {key: "" for key in text_dict}

    # 2. Build the prompt.
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string} \n\n Translated JSON Output:"
    full_prompt = f"{_JSON_TRANSLATION_INSTRUCTIONS}\n\n{user_prompt.strip()}"

    # 3. Call the model. Deliberate best effort: any failure degrades to an
    #    empty reply so the caller still receives every key.
    try:
        model = genai.GenerativeModel('gemini-2.0-flash')
        response = model.generate_content(
            contents=full_prompt,
            generation_config={
                'temperature': 0.3,  # Low temperature for instruction adherence
                'top_p': 1,
                'top_k': 1,
            },
        )
        raw_translated_json_string = _strip_json_fence(_response_text(response)) or "{}"
    except Exception as e:
        print(f"Lỗi trong quá trình gọi API dịch: {e}")
        raw_translated_json_string = "{}"
    print(raw_translated_json_string)

    # 4. Parse the reply and map its string keys back to the original keys.
    #    JSON keys are always strings, so matching is done on str(key) rather
    #    than by forcing reply keys through int(), which lost non-int keys.
    known_labels = {str(key) for key in text_dict}
    translated_by_label = {}
    extra_labels = []
    for label, value in _json_string_to_dict(raw_translated_json_string).items():
        canonical = _canonical_label(label, known_labels)
        if canonical is None:
            extra_labels.append(label)
        else:
            translated_by_label[canonical] = value

    # 5. Rebuild the result on the ORIGINAL keys so none is ever dropped.
    final_translated_dict = {}
    missing_keys = []
    for key in text_dict:
        label = str(key)
        if label in translated_by_label:
            final_translated_dict[key] = translated_by_label[label]
        else:
            final_translated_dict[key] = ""
            missing_keys.append(key)

    if missing_keys:
        # Reported in input order; sorting could fail on mixed key types.
        print(f"Warning: LLM response was missing keys: {missing_keys}. Filled with empty strings.")
    if extra_labels:
        print(f"Warning: LLM response contained unexpected extra keys: {sorted(extra_labels)}. These were ignored.")

    return final_translated_dict
# Dictionary -> List
def postprocess_text(translated_dict):
    """Turn ``{index: text}`` back into a list ordered by index.

    Args:
        translated_dict: Mapping whose keys are, or can be converted with
            ``int()`` to, 0-based positions.

    Returns:
        A list of length ``max_index + 1`` in which every text sits at its
        own index and positions without an entry hold ``""``. Keys that are
        not convertible to an integer, or that are negative, are skipped with
        a warning. When two keys convert to the same index, the one iterated
        last wins. Returns ``[]`` when the argument is not a dict, is empty,
        or contains no usable key.
    """
    if not isinstance(translated_dict, dict):
        print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
        return []
    if not translated_dict:
        return []

    by_index = {}
    for k, v in translated_dict.items():
        try:
            index = int(k)
        except (ValueError, TypeError, OverflowError):
            print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")
            continue
        if index < 0:
            # A negative index has no position in the output list.
            print(f"Warning: postprocess cannot place negative index '{k}', skipping.")
            continue
        by_index[index] = v

    if not by_index:
        print("Warning: No sortable items found in dictionary for postprocessing.")
        return []

    expected_length = max(by_index) + 1
    if len(by_index) != expected_length:
        print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(by_index)}.")

    # Always place by index: trusting the item count alone mis-ordered the
    # result whenever negative or duplicate indices hid a gap.
    return [by_index.get(i, "") for i in range(expected_length)]