import os
import zipfile
import google.generativeai as genai
import tempfile
import io
import json
# Read the API key from the environment instead of hardcoding it in source
# (GOOGLE_API_KEY is the variable name conventionally used by the google.generativeai SDK).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
def unzip_office_file(pptx_file: io.BytesIO):
"""
    Extract the contents of a PPTX file (as a BytesIO object) into a temporary directory.
    Returns the path of the directory containing the extracted contents.
"""
    # Create a temporary directory to hold the extracted contents
output_dir = tempfile.mkdtemp(prefix="pptx_extract_")
    # Extract the contents of the PPTX file (BytesIO) into it
with zipfile.ZipFile(pptx_file, 'r') as zip_ref:
zip_ref.extractall(output_dir)
return output_dir
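# Usage sketch for unzip_office_file. The file name "slides.pptx" is only an
# example; any .pptx opened in binary mode and wrapped in BytesIO works the same way.
#
#     with open("slides.pptx", "rb") as f:
#         extract_dir = unzip_office_file(io.BytesIO(f.read()))
#     # extract_dir now contains ppt/slides/, ppt/media/, docProps/, etc.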
def translate_single_text(text, source_lang='English', target_lang="Vietnamese"):
if not text:
return "" # Bỏ qua nếu chuỗi rỗng
try:
        model = genai.GenerativeModel('gemini-2.0-flash')  # Use the model from the original code if it works well
        # --- Simple prompt that only asks for a translation ---
system_prompt_simple = f"""You are a translation engine.
Translate the following text accurately from {source_lang} to {target_lang}.
Provide *only* the translated text as a single string.
Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."""
user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
full_prompt = system_prompt_simple.strip() + "\n\n" + user_prompt.strip()
response = model.generate_content(
contents=full_prompt,
generation_config={
                'temperature': 0.7,  # Temperature suited to translation (adjust if needed)
'top_p': 1,
'top_k': 1,
}
)
translated_text = response.text.strip()
return translated_text
except Exception as e:
print(f"Lỗi trong quá trình dịch (translate_single_text): {e}")
return "" # Trả về chuỗi rỗng nếu có lỗi
def preprocess_text(text_list):
"""
Converts a list of strings into a dictionary where keys are the
list indices (int) and values are the strings.
"""
if not isinstance(text_list, list):
return {}
if not text_list:
return {}
text_dict = {index: text for index, text in enumerate(text_list)}
return text_dict
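# Example (deterministic, no API call involved):
#
#     preprocess_text(["Title", "Body text"])   # -> {0: "Title", 1: "Body text"}
#     preprocess_text("not a list")             # -> {}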
def translate_text(text_dict, source_lang='English', target_lang="Vietnamese"):
"""
Translates the values of a dictionary {index: text} using an LLM.
It uses an intermediate JSON string format for reliable LLM interaction.
Returns a dictionary {index: translated_text} with the same keys.
"""
if not isinstance(text_dict, dict):
print("Warning: translate_text_dict expected a dict, received:", type(text_dict))
return {}
if not text_dict:
return {}
# --- Internal Helper: Convert Dictionary to JSON String for LLM ---
def _dict_to_json_string(d):
json_compatible = {str(k): v for k, v in d.items()}
try:
return json.dumps(json_compatible, ensure_ascii=False, separators=(',',':'))
except Exception as e:
print(f"Internal Error (_dict_to_json_string): {e}")
return "{}"
# --- Internal Helper: Convert LLM's JSON String Response to Dictionary ---
def _json_string_to_dict(s):
res_dict = {}
if not s or not isinstance(s, str): return {}
try:
raw = json.loads(s)
if not isinstance(raw, dict):
print(f"Internal Warning (_json_string_to_dict): LLM response is not a JSON object: {s}")
return {}
for k_str, v in raw.items():
try:
res_dict[int(k_str)] = v
except ValueError:
print(f"Internal Warning (_json_string_to_dict): Non-integer key '{k_str}' in LLM response.")
except json.JSONDecodeError as e:
print(f"Internal Error (_json_string_to_dict): Failed decoding JSON '{s}'. Error: {e}")
except Exception as e:
print(f"Internal Error (_json_string_to_dict): {e}")
return res_dict
# --- End Internal Helpers ---
# 1. Convert input dictionary to JSON string
json_input_string = _dict_to_json_string(text_dict)
print(f"Input JSON String: {json_input_string}") # Debugging output
if json_input_string == "{}":
print("Skipping translation due to empty input dictionary or conversion error.")
return {key: "" for key in text_dict} # Return original structure with empty values
system_prompt = f"""Translate the string values within the following JSON object .
Follow these instructions carefully:
1. Analyze the entire JSON object to understand the context.
2. Translate *only* the string values.
3. Keep the original keys *exactly* as they are.
4. Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.
5. Preserve the original JSON structure perfectly.
6. Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
"""
# 3. Construct User Prompt
user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string} \n\n Translated JSON Output:"
# 4. Call the LLM API
raw_translated_json_string = "{}" # Default to empty JSON string
try:
model = genai.GenerativeModel('gemini-2.0-flash')
full_prompt = f"{system_prompt.strip()}\n\n{user_prompt.strip()}"
response = model.generate_content(
contents=full_prompt,
generation_config={
'temperature': 0.3, # Low temp for adherence
'top_p': 1,
'top_k': 1,
}
# safety_settings=[...]
)
# Extract text safely and clean
if response and response.parts:
if hasattr(response.parts[0], 'text'):
raw_translated_json_string = response.parts[0].text.strip()
else:
print(f"Warning: Received response part without text attribute: {response.parts[0]}")
try: raw_translated_json_string = str(response.parts[0])
except Exception as str_e: print(f"Could not convert response part to string: {str_e}")
elif response and hasattr(response, 'text'):
raw_translated_json_string = response.text.strip()
else:
print(f"Warning: Received unexpected or empty response format from API: {response}")
# Clean potential markdown backticks
if raw_translated_json_string.startswith("```json"): raw_translated_json_string = raw_translated_json_string[7:]
if raw_translated_json_string.startswith("```"): raw_translated_json_string = raw_translated_json_string[3:]
if raw_translated_json_string.endswith("```"): raw_translated_json_string = raw_translated_json_string[:-3]
raw_translated_json_string = raw_translated_json_string.strip()
# Ensure it's at least plausible JSON before parsing
if not raw_translated_json_string: raw_translated_json_string = "{}"
except Exception as e:
print(f"Lỗi trong quá trình gọi API dịch: {e}")
raw_translated_json_string = "{}" # Ensure empty JSON on error
print(raw_translated_json_string)
# 5. Convert the LLM's JSON string response back to a dictionary
translated_intermediate_dict = _json_string_to_dict(raw_translated_json_string)
# 6. Validation: Ensure output dict has same keys as input dict
final_translated_dict = {}
missing_keys = []
for key in text_dict.keys(): # Iterate using ORIGINAL keys
if key in translated_intermediate_dict:
final_translated_dict[key] = translated_intermediate_dict[key]
else:
final_translated_dict[key] = "" # Preserve key, use empty string if missing
missing_keys.append(key)
if missing_keys:
print(f"Warning: LLM response was missing keys: {sorted(missing_keys)}. Filled with empty strings.")
extra_keys = set(translated_intermediate_dict.keys()) - set(text_dict.keys())
if extra_keys:
print(f"Warning: LLM response contained unexpected extra keys: {sorted(list(extra_keys))}. These were ignored.")
return final_translated_dict
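# Illustrative example; the translated values depend on the model, but the keys
# always mirror the input and any keys missing from the LLM reply come back as "":
#
#     translate_text({0: "Hello", 1: "Goodbye"})
#     # -> {0: "Xin chào", 1: "Tạm biệt"}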
# Function 3: Dictionary -> List
def postprocess_text(translated_dict):
"""
Converts a dictionary {index: translated_text} back into a list of
strings, ordered by the index (key).
"""
if not isinstance(translated_dict, dict):
print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
return []
if not translated_dict:
return []
# Sort the dictionary items by key (index)
try:
# Ensure keys are integers for correct sorting if possible, handle errors
items_to_sort = []
for k, v in translated_dict.items():
try:
items_to_sort.append((int(k), v))
except (ValueError, TypeError):
print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")
continue # Skip non-integer keys for sorting
if not items_to_sort:
print("Warning: No sortable items found in dictionary for postprocessing.")
return []
sorted_items = sorted(items_to_sort)
# Check for gaps in indices (optional but good practice)
expected_length = sorted_items[-1][0] + 1
if len(sorted_items) != expected_length:
print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(sorted_items)}.")
# Reconstruct carefully to handle gaps, filling with empty strings
result_list = [""] * expected_length
for index, text in sorted_items:
if 0 <= index < expected_length:
result_list[index] = text
return result_list
# If no gaps, simply extract values
translated_list = [text for index, text in sorted_items]
return translated_list
except Exception as e:
print(f"Error during postprocessing sorting/list creation: {e}")
        return []  # Return empty list on error
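# End-to-end sketch of the pipeline defined above, assuming a local "slides.pptx"
# (the path is only an example) and a configured API key. Writing the translated
# strings back into the extracted slide XML is outside the scope of this module.
if __name__ == "__main__":
    with open("slides.pptx", "rb") as f:
        pptx_bytes = io.BytesIO(f.read())

    extract_dir = unzip_office_file(pptx_bytes)
    print(f"PPTX extracted to: {extract_dir}")

    # Suppose these strings were collected from the extracted slide XML:
    texts = ["Quarterly Report", "Revenue grew 12% year over year."]

    translated = postprocess_text(translate_text(preprocess_text(texts)))
    print(translated)  # Vietnamese strings, aligned with `texts` by index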