Spaces:
Running
Running
| import os | |
| import zipfile | |
| import google.generativeai as genai | |
| import tempfile | |
| import io | |
| import json | |
| import time | |
| from google.api_core.exceptions import ResourceExhausted | |
| import re | |
# SECURITY: never commit API keys to source control. The key is read from the
# GOOGLE_API_KEY environment variable instead; the key that used to be
# hard-coded on this line is publicly exposed and should be revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
def unzip_office_file(pptx_file: io.BytesIO):
    """Extract a zipped Office document (e.g. PPTX) into a new temp directory.

    Args:
        pptx_file: In-memory file object holding the zipped document.

    Returns:
        Path of the newly created temporary directory that contains the
        extracted files. The caller is responsible for deleting it.

    Raises:
        zipfile.BadZipFile: If ``pptx_file`` is not a valid zip archive. The
            temporary directory is removed before the error propagates.
    """
    import shutil  # local import: only needed to clean up after a failure

    output_dir = tempfile.mkdtemp(prefix="pptx_extract_")
    try:
        with zipfile.ZipFile(pptx_file, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    except BaseException:
        # Do not leak the (possibly half-filled) directory when extraction fails.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise
    return output_dir
def translate_single_text(text: str, source_lang: str = 'English', target_lang: str = "Vietnamese",
                          max_retries: int = 5, base_delay: float = 5.0) -> str:
    """Translate one piece of text with the Gemini API.

    Quota errors (``ResourceExhausted``) are retried with exponential backoff;
    any other error aborts immediately.

    Args:
        text: Text to translate.
        source_lang: Name of the language ``text`` is written in.
        target_lang: Name of the language to translate into.
        max_retries: Number of retries after the first attempt, so at most
            ``max_retries + 1`` API calls are made.
        base_delay: Seconds to wait before the first retry; the wait doubles
            before every further retry.

    Returns:
        The stripped translation, or ``""`` when ``text`` is blank, the API
        returns no content, an unexpected error occurs, or every attempt hit
        the quota limit.
    """
    if not text or not text.strip():
        return ""  # nothing to translate

    # The prompt does not change between attempts, so build it once.
    system_prompt = (
        "You are a translation engine.\n"
        f"Translate the following text accurately from {source_lang} to {target_lang}.\n"
        "Provide *only* the translated text as a single string.\n"
        "Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."
    )
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
    full_prompt = system_prompt.strip() + "\n\n" + user_prompt.strip()

    total_attempts = max_retries + 1
    for attempt in range(total_attempts):
        try:
            model = genai.GenerativeModel('gemini-2.0-flash')
            response = model.generate_content(
                contents=full_prompt,
                generation_config={
                    'temperature': 0.2,
                    'top_p': 1.0,
                    'top_k': 1,
                }
            )
            if response.candidates and response.candidates[0].content.parts:
                parts = response.candidates[0].content.parts
                return "".join(part.text for part in parts if hasattr(part, 'text')).strip()
            print(f"[!] Không nhận được nội dung hợp lệ từ API cho văn bản: '{text[:50]}...'")
            return ""
        except ResourceExhausted:
            if attempt == max_retries:
                # Last attempt: give up right away instead of sleeping for a
                # retry that will never happen.
                break
            wait_time = base_delay * (2 ** attempt)
            print(f"[429] Quota exceeded khi dịch '{text[:50]}...'. Thử lại sau {wait_time:.1f}s (lần {attempt + 1}/{total_attempts}).")
            time.sleep(wait_time)
        except Exception as e:
            print(f"[!] Lỗi không mong muốn khi dịch '{text[:50]}...': {e}")
            return ""

    print(f"[x] Bỏ qua sau {total_attempts} lần thử không thành công cho '{text[:50]}...'.")
    return ""
def preprocess_text(text_list):
    """Map every element of a list to its position.

    Returns ``{index: item}`` for a non-empty list. Any other input (an empty
    list or a value that is not a list) yields an empty dict.
    """
    if isinstance(text_list, list) and text_list:
        return dict(enumerate(text_list))
    return {}
def translate_text(text_dict, source_lang='English', target_lang="Vietnamese", max_retries=5, base_delay: float = 5.0):
    """Translate all values of ``text_dict`` with a single Gemini request.

    The dict is serialised to a compact JSON object (keys converted with
    ``str``), sent to the model together with translation instructions, and
    the JSON object found in the reply is mapped back onto the original keys.

    Args:
        text_dict: Mapping of key -> text to translate.
        source_lang: Name of the language the values are written in.
        target_lang: Name of the language to translate into.
        max_retries: Maximum number of API attempts.
        base_delay: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.

    Returns:
        A dict with exactly the keys of ``text_dict``. Keys the reply does not
        contain (or every key, when the request fails) map to ``""``. Returns
        ``{}`` when ``text_dict`` is empty or not a dict.
    """
    if not isinstance(text_dict, dict):
        print("translate_text_dict expected a dict, got:", type(text_dict))
        return {}
    if not text_dict:
        return {}

    # JSON object keys are always strings, so remember which original key each
    # serialised key stands for; this lets non-integer keys survive the round trip.
    keys_by_str = {str(k): k for k in text_dict}

    def _dict_to_json_string(d):
        """Serialise ``d`` as compact JSON with string keys; ``"{}"`` on failure."""
        json_compatible = {str(k): v for k, v in d.items()}
        try:
            return json.dumps(json_compatible, ensure_ascii=False, separators=(',', ':'))
        except Exception as e:
            print(f"Internal Error (_dict_to_json_string): {e}")
            return "{}"

    def _json_string_to_dict(s):
        """Parse the reply; return ``(values by original key, unexpected keys)``."""
        matched, unexpected = {}, []
        if not s or not isinstance(s, str):
            return matched, unexpected
        try:
            raw = json.loads(s)
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return matched, unexpected
        except Exception as e:
            print(f"General error: {e}")
            return matched, unexpected
        if not isinstance(raw, dict):
            print(f"LLM response is not a JSON object: {s}")
            return matched, unexpected
        for k_str, v in raw.items():
            if k_str in keys_by_str:
                matched[keys_by_str[k_str]] = v
                continue
            # Fall back to integer parsing so replies such as "01" or " 1"
            # still resolve to the integer key 1.
            try:
                numeric_key = int(k_str)
            except ValueError:
                print(f"Non-integer key '{k_str}' in LLM response.")
                continue
            if numeric_key in text_dict:
                matched.setdefault(numeric_key, v)  # an exact match takes precedence
            else:
                unexpected.append(numeric_key)
        return matched, unexpected

    json_input_string = _dict_to_json_string(text_dict)
    if json_input_string == "{}":
        print("Empty or invalid dictionary input.")
        return {key: "" for key in text_dict}

    system_prompt = (
        "Translate the string values within the following JSON object .\n"
        "Follow these instructions carefully:\n"
        "1. Analyze the entire JSON object to understand the context.\n"
        "2. Translate *only* the string values.\n"
        "3. Keep the original keys *exactly* as they are.\n"
        "4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.\n"
        "5. Preserve the original JSON structure perfectly.\n"
        "6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.\n"
    )
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string}\n\nTranslated JSON Output:"
    full_prompt = f"{system_prompt.strip()}\n\n{user_prompt.strip()}"

    raw_translated_json_string = "{}"
    succeeded = False
    # A bounded loop guarantees termination even when the API keeps returning
    # empty replies without raising.
    for attempt in range(max_retries):
        try:
            model = genai.GenerativeModel('gemini-2.0-flash')
            response = model.generate_content(
                contents=full_prompt,
                generation_config={
                    'temperature': 0.3,
                    'top_p': 1,
                    'top_k': 1,
                }
            )
            if response and response.parts and hasattr(response.parts[0], 'text'):
                raw_translated_json_string = response.parts[0].text.strip()
            elif hasattr(response, 'text'):
                raw_translated_json_string = response.text.strip()
            # Clean markdown wrappers if present
            raw_translated_json_string = re.sub(r"^```(?:json)?|```$", "", raw_translated_json_string).strip()
            if raw_translated_json_string:
                succeeded = True
                break
            failure = "phản hồi rỗng"
        except Exception as e:
            failure = e

        if attempt + 1 < max_retries:
            wait_time = base_delay * (2 ** attempt)
            print(f"[Retry {attempt+1}] Lỗi gọi API: {failure}. Thử lại sau {wait_time:.2f} giây.")
            time.sleep(wait_time)
        else:
            # Final attempt: report the error but do not sleep before giving up.
            print(f"[Retry {attempt+1}] Lỗi gọi API: {failure}.")

    if not succeeded:
        print("❌ Hết số lần thử lại. Trả về JSON rỗng.")
        raw_translated_json_string = "{}"
    print(raw_translated_json_string)

    translated_intermediate_dict, extra_keys = _json_string_to_dict(raw_translated_json_string)

    final_translated_dict = {}
    missing_keys = []
    for key in text_dict:
        if key in translated_intermediate_dict:
            final_translated_dict[key] = translated_intermediate_dict[key]
        else:
            final_translated_dict[key] = ""
            missing_keys.append(key)
    if missing_keys:
        try:
            shown_missing = sorted(missing_keys)
        except TypeError:
            # Keys of mixed types cannot be compared; order them by their text.
            shown_missing = sorted(missing_keys, key=str)
        print(f"Cảnh báo: Thiếu keys: {shown_missing}.")
    if extra_keys:
        print(f"Cảnh báo: Có keys không mong đợi: {sorted(set(extra_keys))}.")
    return final_translated_dict
| # Function 3: Dictionary -> List | |
def postprocess_text(translated_dict):
    """Turn ``{index: text}`` back into a list ordered by index.

    Keys are converted with ``int``; keys that cannot be converted are skipped
    with a warning. Each value is stored at the position given by its index,
    so missing indices become ``""`` and negative indices are ignored. When
    several keys convert to the same index, the last one wins.

    Args:
        translated_dict: Mapping of index -> translated text.

    Returns:
        A list whose length is the largest index + 1, or ``[]`` when the input
        is not a dict, is empty, has no usable index, or an error occurs.
    """
    if not isinstance(translated_dict, dict):
        print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
        return []
    if not translated_dict:
        return []

    try:
        text_by_index = {}
        for k, v in translated_dict.items():
            try:
                text_by_index[int(k)] = v
            except (ValueError, TypeError):
                print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")

        if not text_by_index:
            print("Warning: No sortable items found in dictionary for postprocessing.")
            return []

        expected_length = max(text_by_index) + 1
        # Only non-negative indices address a slot of the result list.
        placeable = {i: t for i, t in text_by_index.items() if i >= 0}
        if len(placeable) != expected_length:
            print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(placeable)}.")

        # Always place by index (never by sort order) so that gaps, negative
        # indices and duplicates cannot shift texts to the wrong position.
        result_list = [""] * max(expected_length, 0)
        for index, text in placeable.items():
            result_list[index] = text
        return result_list
    except Exception as e:
        print(f"Error during postprocessing sorting/list creation: {e}")
        return []  # Return empty list on error