Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

MT_deploy / utils /utils.py

mintlee

add translate sheet_name

58fa02f 5 months ago

raw

history blame

12.2 kB

	import os
	import zipfile
	import google.generativeai as genai
	import tempfile
	import io
	import json
	import time
	from google.api_core.exceptions import ResourceExhausted

	# Security: the API key must not live in source control. Read it from the
	# GOOGLE_API_KEY environment variable instead (os.environ.get returns None
	# when the variable is unset). The key that used to be embedded here has been
	# exposed publicly and should be revoked and replaced.
	genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))


	def unzip_office_file(pptx_file: io.BytesIO):
	    """
	    Extract a zip-based Office file (e.g. a PPTX) into a new temporary directory.

	    Args:
	        pptx_file: The archive as a file-like object (typically ``io.BytesIO``);
	            it is handed directly to ``zipfile.ZipFile``.

	    Returns:
	        The path of the temporary directory (created with the prefix
	        ``pptx_extract_``) that holds the extracted content. The directory is
	        not removed by this function; the caller is responsible for deleting it.

	    Raises:
	        Whatever ``zipfile.ZipFile`` / ``extractall`` raise (for example
	        ``zipfile.BadZipFile`` for input that is not a zip archive). In that
	        case the temporary directory is removed before the error propagates.
	    """
	    # Local import: shutil is only needed for cleanup on the failure path.
	    import shutil

	    # Create the temporary directory that will receive the extracted content.
	    output_dir = tempfile.mkdtemp(prefix="pptx_extract_")

	    try:
	        # Extract every member of the archive into the temporary directory.
	        with zipfile.ZipFile(pptx_file, 'r') as zip_ref:
	            zip_ref.extractall(output_dir)
	    except BaseException:
	        # Do not leave a half-filled directory behind when extraction fails;
	        # the original exception is re-raised unchanged.
	        shutil.rmtree(output_dir, ignore_errors=True)
	        raise

	    return output_dir


	def translate_single_text(text: str, source_lang: str = 'English', target_lang: str = "Vietnamese",
	                          max_retries: int = 5, retry_delay_seconds: int = 3) -> str:
	    """
	    Translate one string with the 'gemini-2.0-flash' model.

	    Args:
	        text: The text to translate.
	        source_lang: Name of the source language inserted into the prompt.
	        target_lang: Name of the target language inserted into the prompt.
	        max_retries: Number of additional attempts made after a
	            ``ResourceExhausted`` error (so at most ``max_retries + 1`` calls).
	        retry_delay_seconds: Fixed pause between two attempts.

	    Returns:
	        The stripped translation, or ``""`` when the input is empty or
	        whitespace-only, the response carries no content parts, the retry
	        budget is used up, or any other exception occurs (it is printed and
	        swallowed).
	    """
	    # Empty or whitespace-only input: nothing to translate.
	    if not (text and text.strip()):
	        return ""

	    attempt = 0
	    while attempt <= max_retries:
	        try:
	            engine = genai.GenerativeModel('gemini-2.0-flash')

	            instructions = f"""You are a translation engine.
	Translate the following text accurately from {source_lang} to {target_lang}.
	Provide only the translated text as a single string.
	Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."""

	            request = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
	            prompt = "\n\n".join((instructions.strip(), request.strip()))

	            sampling = {
	                'temperature': 0.2,
	                'top_p': 1.0,
	                'top_k': 1,
	            }
	            reply = engine.generate_content(contents=prompt, generation_config=sampling)

	            # A usable reply needs at least one candidate with content parts.
	            if not (reply.candidates and reply.candidates[0].content.parts):
	                print(f"Không nhận được nội dung hợp lệ từ API cho văn bản: '{text[:50]}...'")
	                # This case is not retried; give up with an empty result.
	                return ""

	            pieces = [
	                piece.text
	                for piece in reply.candidates[0].content.parts
	                if hasattr(piece, 'text')
	            ]
	            return "".join(pieces).strip()

	        except ResourceExhausted as e:
	            # HTTP 429 (too many requests / quota exceeded): wait and retry.
	            print(f"Lỗi quota (429) khi dịch '{text[:50]}...': {e}. Đang thử lại sau {retry_delay_seconds} giây. Lần thử {attempt + 1}/{max_retries + 1}.")
	            if attempt >= max_retries:
	                print(f"Đã vượt quá số lần thử lại tối đa ({max_retries + 1}) cho '{text[:50]}...'. Bỏ qua.")
	                # Retry budget exhausted.
	                return ""
	            time.sleep(retry_delay_seconds)
	            attempt += 1

	        except Exception as e:
	            print(f"Lỗi không mong muốn trong quá trình dịch (translate_single_text) cho '{text[:50]}...': {e}")
	            return ""

	    # Only reached when the loop body never runs (e.g. a negative max_retries).
	    return ""


	def preprocess_text(text_list):
	    """
	    Map each element of a list to its position.

	    Returns ``{index: element}`` for a non-empty list, and an empty dict for
	    an empty list or for any argument that is not a list.
	    """
	    if isinstance(text_list, list) and text_list:
	        return dict(enumerate(text_list))
	    return {}

	def translate_text(text_dict, source_lang='English', target_lang="Vietnamese"):
	    """
	    Translate the values of a dictionary ``{index: text}`` with one LLM call.

	    The dictionary is serialised to a compact JSON string (keys converted to
	    strings), sent to the 'gemini-2.0-flash' model together with translation
	    instructions, and the JSON answer is parsed back into a dictionary whose
	    keys are converted with ``int``.

	    Args:
	        text_dict: Dictionary whose values are to be translated.
	        source_lang: Name of the source language inserted into the prompt.
	        target_lang: Name of the target language inserted into the prompt.

	    Returns:
	        A dictionary with exactly the keys of ``text_dict``. Keys the model
	        did not return (or everything, when the API call or the JSON parsing
	        fails) map to ``""``. Returns ``{}`` when ``text_dict`` is empty or is
	        not a dict. Errors are printed, never raised.
	    """
	    if not isinstance(text_dict, dict):
	        print("Warning: translate_text_dict expected a dict, received:", type(text_dict))
	        return {}
	    if not text_dict:
	        return {}

	    # --- Internal helper: dictionary -> JSON string for the LLM ---
	    def _dict_to_json_string(d):
	        # JSON object keys must be strings.
	        json_compatible = {str(k): v for k, v in d.items()}
	        try:
	            return json.dumps(json_compatible, ensure_ascii=False, separators=(',', ':'))
	        except Exception as e:
	            print(f"Internal Error (_dict_to_json_string): {e}")
	            return "{}"

	    # --- Internal helper: LLM's JSON string response -> dictionary ---
	    def _json_string_to_dict(s):
	        res_dict = {}
	        if not s or not isinstance(s, str):
	            return {}
	        try:
	            raw = json.loads(s)
	            if not isinstance(raw, dict):
	                print(f"Internal Warning (_json_string_to_dict): LLM response is not a JSON object: {s}")
	                return {}
	            for k_str, v in raw.items():
	                try:
	                    res_dict[int(k_str)] = v
	                except ValueError:
	                    print(f"Internal Warning (_json_string_to_dict): Non-integer key '{k_str}' in LLM response.")
	        except json.JSONDecodeError as e:
	            print(f"Internal Error (_json_string_to_dict): Failed decoding JSON '{s}'. Error: {e}")
	        except Exception as e:
	            print(f"Internal Error (_json_string_to_dict): {e}")
	        return res_dict

	    # --- Internal helper: sort keys for a log message without ever raising ---
	    def _sorted_for_log(keys):
	        try:
	            return sorted(keys)
	        except TypeError:
	            # Keys of mixed, mutually unorderable types (e.g. int and str):
	            # fall back to ordering by their repr so logging cannot crash.
	            return sorted(keys, key=repr)
	    # --- End internal helpers ---

	    # 1. Convert the input dictionary to a JSON string.
	    json_input_string = _dict_to_json_string(text_dict)
	    print(f"Input JSON String: {json_input_string}")  # Debugging output
	    if json_input_string == "{}":
	        print("Skipping translation due to empty input dictionary or conversion error.")
	        return {key: "" for key in text_dict}  # Same keys, empty values

	    # 2. Instructions for the model.
	    system_prompt = """Translate the string values within the following JSON object .

	Follow these instructions carefully:
	1. Analyze the entire JSON object to understand the context.
	2. Translate only the string values.
	3. Keep the original keys exactly as they are.
	4. Do not translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.
	5. Preserve the original JSON structure perfectly.
	6. Your output must be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.

	"""
	    # 3. Construct the user prompt.
	    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string} \n\n Translated JSON Output:"

	    # 4. Call the LLM API.
	    raw_translated_json_string = "{}"  # Default to an empty JSON object
	    try:
	        model = genai.GenerativeModel('gemini-2.0-flash')
	        full_prompt = f"{system_prompt.strip()}\n\n{user_prompt.strip()}"

	        response = model.generate_content(
	            contents=full_prompt,
	            generation_config={
	                'temperature': 0.3,  # Low temperature for adherence
	                'top_p': 1,
	                'top_k': 1,
	            }
	        )

	        # Extract the text. All text parts are concatenated (as in
	        # translate_single_text) so a multi-part answer is not truncated
	        # to its first part.
	        if response and response.parts:
	            text_parts = [part.text for part in response.parts if hasattr(part, 'text')]
	            if text_parts:
	                raw_translated_json_string = "".join(text_parts).strip()
	            else:
	                print(f"Warning: Received response part without text attribute: {response.parts[0]}")
	                try:
	                    raw_translated_json_string = str(response.parts[0])
	                except Exception as str_e:
	                    print(f"Could not convert response part to string: {str_e}")
	        elif response and hasattr(response, 'text'):
	            raw_translated_json_string = response.text.strip()
	        else:
	            print(f"Warning: Received unexpected or empty response format from API: {response}")

	        # Remove a markdown code fence the model may have added despite the instructions.
	        if raw_translated_json_string.startswith("```json"):
	            raw_translated_json_string = raw_translated_json_string[7:]
	        if raw_translated_json_string.startswith("```"):
	            raw_translated_json_string = raw_translated_json_string[3:]
	        if raw_translated_json_string.endswith("```"):
	            raw_translated_json_string = raw_translated_json_string[:-3]
	        raw_translated_json_string = raw_translated_json_string.strip()
	        # Make sure the parser below always receives a non-empty string.
	        if not raw_translated_json_string:
	            raw_translated_json_string = "{}"

	    except Exception as e:
	        print(f"Lỗi trong quá trình gọi API dịch: {e}")
	        raw_translated_json_string = "{}"  # Ensure empty JSON on error

	    print(raw_translated_json_string)
	    # 5. Convert the LLM's JSON string response back to a dictionary.
	    translated_intermediate_dict = _json_string_to_dict(raw_translated_json_string)

	    # 6. Validation: the output must have exactly the keys of the input.
	    final_translated_dict = {}
	    missing_keys = []
	    for key in text_dict:  # Iterate over the ORIGINAL keys
	        if key in translated_intermediate_dict:
	            final_translated_dict[key] = translated_intermediate_dict[key]
	        else:
	            final_translated_dict[key] = ""  # Preserve the key with an empty value
	            missing_keys.append(key)

	    if missing_keys:
	        print(f"Warning: LLM response was missing keys: {_sorted_for_log(missing_keys)}. Filled with empty strings.")

	    extra_keys = set(translated_intermediate_dict) - set(text_dict)
	    if extra_keys:
	        print(f"Warning: LLM response contained unexpected extra keys: {_sorted_for_log(extra_keys)}. These were ignored.")

	    return final_translated_dict

	# Function 3: Dictionary -> List
	def postprocess_text(translated_dict):
	    """
	    Convert a dictionary ``{index: translated_text}`` back into a list in
	    which every value sits at the position given by its index.

	    Keys are converted with ``int``; keys that cannot be converted are
	    skipped with a warning. The list has ``max index + 1`` entries; positions
	    for which no value exists are filled with ``""``. Negative indices have
	    no position in the list and are ignored with a warning. If two keys
	    convert to the same index, the one that comes later in the dictionary wins.

	    Returns:
	        The list of values, or ``[]`` when the argument is not a dict, is
	        empty, has no usable key, or an unexpected error occurs (the error is
	        printed, not raised).
	    """
	    if not isinstance(translated_dict, dict):
	        print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
	        return []
	    if not translated_dict:
	        return []

	    try:
	        # Normalise the keys to integers; skip the ones that are not numeric.
	        by_index = {}
	        for k, v in translated_dict.items():
	            try:
	                by_index[int(k)] = v
	            except (ValueError, TypeError):
	                print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")

	        if not by_index:
	            print("Warning: No sortable items found in dictionary for postprocessing.")
	            return []

	        negative = sorted(index for index in by_index if index < 0)
	        if negative:
	            print(f"Warning: postprocess ignoring negative indices {negative}.")

	        expected_length = max(max(by_index) + 1, 0)
	        placed = {index: text for index, text in by_index.items() if index >= 0}

	        if len(placed) != expected_length:
	            print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(placed)}.")

	        # Always place by index. Comparing the item count with the maximum
	        # index alone cannot detect every gap (negative or duplicate indices
	        # can make the counts match), which would shift values to wrong positions.
	        result_list = [""] * expected_length
	        for index, text in placed.items():
	            result_list[index] = text
	        return result_list

	    except Exception as e:
	        print(f"Error during postprocessing sorting/list creation: {e}")
	        return []  # Return an empty list on error