Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	| import os | |
| import zipfile | |
| import google.generativeai as genai | |
| import tempfile | |
| import io | |
| import json | |
| import time | |
| from google.api_core.exceptions import ResourceExhausted | |
# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead (e.g. a deployment secret named GOOGLE_API_KEY).
# Any key that was previously hard-coded here should be treated as leaked and
# revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
def unzip_office_file(pptx_file: io.BytesIO):
    """Extract a PPTX archive (given as an in-memory stream) into a temp directory.

    Args:
        pptx_file: Seekable binary stream holding the PPTX (zip) content.

    Returns:
        Path of the newly created temporary directory that contains the
        extracted archive members. The caller owns the directory and is
        responsible for deleting it.

    Raises:
        zipfile.BadZipFile: If the stream is not a valid zip archive. Any other
            extraction error is propagated unchanged as well.
    """
    import shutil  # local import: only needed for cleanup on failure

    # Create a temporary directory to hold the extracted content.
    output_dir = tempfile.mkdtemp(prefix="pptx_extract_")
    try:
        with zipfile.ZipFile(pptx_file, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    except Exception:
        # Do not leak the temp directory when the archive cannot be extracted.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise
    return output_dir
def translate_single_text(text: str, source_lang: str = 'English', target_lang: str = "Vietnamese",
                          max_retries: int = 5, retry_delay_seconds: int = 3) -> str:
    """Translate a single string through the Gemini API.

    Quota errors (``ResourceExhausted``) are retried up to ``max_retries``
    times with a fixed pause of ``retry_delay_seconds`` between attempts.

    Args:
        text: Text to translate.
        source_lang: Name of the source language used in the prompt.
        target_lang: Name of the target language used in the prompt.
        max_retries: Number of retries allowed after the first attempt.
        retry_delay_seconds: Seconds to sleep before each retry.

    Returns:
        The translated text, or ``""`` when the input is empty or
        whitespace-only, the API returns no usable content, retries are
        exhausted, or any other error occurs.
    """
    # Skip empty or whitespace-only input.
    if not (text and text.strip()):
        return ""

    attempt = 0
    while attempt <= max_retries:
        try:
            # Alternatives: 'gemini-1.0-pro', 'gemini-1.5-flash'.
            engine = genai.GenerativeModel('gemini-2.0-flash')
            instructions = (
                "You are a translation engine.\n"
                f"Translate the following text accurately from {source_lang} to {target_lang}.\n"
                "Provide *only* the translated text as a single string.\n"
                "Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."
            )
            request = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
            reply = engine.generate_content(
                contents=instructions.strip() + "\n\n" + request.strip(),
                generation_config={
                    'temperature': 0.2,
                    'top_p': 1.0,
                    'top_k': 1,
                },
            )

            # Make sure the reply carries at least one candidate with parts.
            if not (reply.candidates and reply.candidates[0].content.parts):
                print(f"Không nhận được nội dung hợp lệ từ API cho văn bản: '{text[:50]}...'")
                # This condition is not retried; give up with an empty result.
                return ""

            pieces = [
                piece.text
                for piece in reply.candidates[0].content.parts
                if hasattr(piece, 'text')
            ]
            return "".join(pieces).strip()
        except ResourceExhausted as e:  # HTTP 429: too many requests / quota exceeded
            print(f"Lỗi quota (429) khi dịch '{text[:50]}...': {e}. Đang thử lại sau {retry_delay_seconds} giây. Lần thử {attempt + 1}/{max_retries + 1}.")
            if attempt >= max_retries:
                print(f"Đã vượt quá số lần thử lại tối đa ({max_retries + 1}) cho '{text[:50]}...'. Bỏ qua.")
                return ""  # all attempts used up
            time.sleep(retry_delay_seconds)
            attempt += 1
        except Exception as e:
            print(f"Lỗi không mong muốn trong quá trình dịch (translate_single_text) cho '{text[:50]}...': {e}")
            return ""

    # Only reached when the loop body never runs (e.g. negative max_retries).
    return ""
def preprocess_text(text_list):
    """Map each string of a list to its position.

    Args:
        text_list: List of strings.

    Returns:
        A dict ``{index: text}`` keyed by the integer list position, or an
        empty dict when the argument is not a list or is an empty list.
    """
    if isinstance(text_list, list) and text_list:
        return dict(enumerate(text_list))
    return {}
def translate_text(text_dict, source_lang='English', target_lang="Vietnamese"):
    """Translate the values of an ``{index: text}`` dict with one LLM request.

    The dict is serialised to a compact JSON object (keys stringified), sent
    to Gemini together with translation instructions, and the JSON reply is
    parsed back into a dict with integer keys.

    Args:
        text_dict: Mapping of index to source text.
        source_lang: Name of the source language used in the prompt.
        target_lang: Name of the target language used in the prompt.

    Returns:
        A dict with exactly the keys of ``text_dict``. Keys missing from the
        LLM reply (including every key when the API call or JSON parsing
        fails) map to ``""``. Returns ``{}`` when ``text_dict`` is not a dict
        or is empty.
    """
    if not isinstance(text_dict, dict):
        print("Warning: translate_text expected a dict, received:", type(text_dict))
        return {}
    if not text_dict:
        return {}

    def _dict_to_json_string(d):
        """Serialise *d* to compact JSON with string keys; "{}" on failure."""
        try:
            json_compatible = {str(k): v for k, v in d.items()}
            return json.dumps(json_compatible, ensure_ascii=False, separators=(',', ':'))
        except Exception as e:
            print(f"Internal Error (_dict_to_json_string): {e}")
            return "{}"

    def _json_string_to_dict(s):
        """Parse the LLM's JSON object into {int_key: value}; {} on failure."""
        res_dict = {}
        if not s or not isinstance(s, str):
            return {}
        try:
            raw = json.loads(s)
            if not isinstance(raw, dict):
                print(f"Internal Warning (_json_string_to_dict): LLM response is not a JSON object: {s}")
                return {}
            for k_str, v in raw.items():
                try:
                    res_dict[int(k_str)] = v
                except ValueError:
                    print(f"Internal Warning (_json_string_to_dict): Non-integer key '{k_str}' in LLM response.")
        except json.JSONDecodeError as e:
            print(f"Internal Error (_json_string_to_dict): Failed decoding JSON '{s}'. Error: {e}")
        except Exception as e:
            print(f"Internal Error (_json_string_to_dict): {e}")
        return res_dict

    def _strip_code_fences(s):
        """Remove a surrounding markdown code fence; "{}" if nothing is left."""
        if s.startswith("```json"):
            s = s[7:]
        if s.startswith("```"):
            s = s[3:]
        if s.endswith("```"):
            s = s[:-3]
        s = s.strip()
        return s if s else "{}"

    def _sorted_if_possible(keys):
        """Sort keys for display; keep given order if they are not comparable."""
        try:
            return sorted(keys)
        except TypeError:
            return list(keys)

    # 1. Convert the input dictionary to a JSON string.
    json_input_string = _dict_to_json_string(text_dict)
    print(f"Input JSON String: {json_input_string}")  # debugging output
    if json_input_string == "{}":
        print("Skipping translation due to empty input dictionary or conversion error.")
        return {key: "" for key in text_dict}  # same keys, empty values

    # 2. Build the instruction prompt.
    system_prompt = (
        "Translate the string values within the following JSON object .\n"
        "Follow these instructions carefully:\n"
        "1. Analyze the entire JSON object to understand the context.\n"
        "2. Translate *only* the string values.\n"
        "3. Keep the original keys *exactly* as they are.\n"
        "4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.\n"
        "5. Preserve the original JSON structure perfectly.\n"
        "6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.\n"
    )

    # 3. Build the user prompt.
    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string} \n\n Translated JSON Output:"

    # 4. Call the LLM API. Any failure degrades to an empty JSON object so the
    #    caller still receives every key (with empty values).
    raw_translated_json_string = "{}"
    try:
        model = genai.GenerativeModel('gemini-2.0-flash')
        full_prompt = f"{system_prompt.strip()}\n\n{user_prompt.strip()}"
        response = model.generate_content(
            contents=full_prompt,
            generation_config={
                'temperature': 0.3,  # low temperature for instruction adherence
                'top_p': 1,
                'top_k': 1,
            }
        )

        if response and response.parts:
            parts = response.parts
            # Join every text part, as translate_single_text does, so a reply
            # split across several parts is not truncated to the first one.
            texts = [part.text for part in parts if hasattr(part, 'text')]
            if texts:
                raw_translated_json_string = "".join(texts).strip()
            else:
                print(f"Warning: Received response part without text attribute: {parts[0]}")
                try:
                    raw_translated_json_string = str(parts[0])
                except Exception as str_e:
                    print(f"Could not convert response part to string: {str_e}")
        elif response and hasattr(response, 'text'):
            raw_translated_json_string = response.text.strip()
        else:
            print(f"Warning: Received unexpected or empty response format from API: {response}")

        # The model sometimes wraps its answer in a markdown fence despite
        # the instructions; remove it before parsing.
        raw_translated_json_string = _strip_code_fences(raw_translated_json_string)
    except Exception as e:
        print(f"Lỗi trong quá trình gọi API dịch: {e}")
        raw_translated_json_string = "{}"
    print(raw_translated_json_string)

    # 5. Convert the LLM's JSON string back to a dictionary.
    translated_intermediate_dict = _json_string_to_dict(raw_translated_json_string)

    # 6. Validation: the output must have exactly the input's keys.
    final_translated_dict = {}
    missing_keys = []
    for key in text_dict:
        if key in translated_intermediate_dict:
            final_translated_dict[key] = translated_intermediate_dict[key]
        else:
            final_translated_dict[key] = ""  # keep the key, empty translation
            missing_keys.append(key)
    if missing_keys:
        print(f"Warning: LLM response was missing keys: {_sorted_if_possible(missing_keys)}. Filled with empty strings.")
    extra_keys = set(translated_intermediate_dict) - set(text_dict)
    if extra_keys:
        print(f"Warning: LLM response contained unexpected extra keys: {_sorted_if_possible(extra_keys)}. These were ignored.")
    return final_translated_dict
| # Function 3: Dictionary -> List | |
def postprocess_text(translated_dict):
    """Convert an ``{index: translated_text}`` dict back into a list.

    Each value is placed at the list position given by its key (converted
    with ``int``). The list length is the largest index plus one; positions
    with no entry are filled with ``""``.

    Args:
        translated_dict: Mapping of index to translated text.

    Returns:
        The list of texts ordered by index. Returns ``[]`` when the argument
        is not a dict, is empty, or has no key convertible to an integer.
        Keys that are not convertible are skipped with a warning; negative
        indices are ignored. If several keys convert to the same index, the
        one appearing last in the dict wins.
    """
    if not isinstance(translated_dict, dict):
        print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
        return []
    if not translated_dict:
        return []

    indexed_items = []
    for k, v in translated_dict.items():
        try:
            indexed_items.append((int(k), v))
        except (ValueError, TypeError, OverflowError):
            print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")
    if not indexed_items:
        print("Warning: No sortable items found in dictionary for postprocessing.")
        return []

    expected_length = max(index for index, _ in indexed_items) + 1

    # Always place values by index rather than by sorted order. Comparing
    # counts alone is not enough: negative or duplicate indices can make the
    # item count match the expected length while positions are still wrong.
    result_list = [""] * max(expected_length, 0)
    filled = set()
    for index, text in indexed_items:
        if 0 <= index < expected_length:
            result_list[index] = text
            filled.add(index)

    if len(indexed_items) != expected_length or len(filled) != expected_length:
        print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(indexed_items)}.")
    return result_list