Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

MT_deploy / powerpoint /xml_handling.py

mintlee

add japanese

d300944 3 months ago

raw

history blame contribute delete

23.9 kB

	from lxml import etree as ET
	import copy # Để tạo bản sao sâu của rPr
	import os
	import traceback # Để in chi tiết lỗi

	# --- XML namespaces, keyed by the prefix used in the XPath queries below ---
	ns = dict(
	    a="http://schemas.openxmlformats.org/drawingml/2006/main",
	    p="http://schemas.openxmlformats.org/presentationml/2006/main",
	    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
	    dgm='http://schemas.openxmlformats.org/drawingml/2006/diagram',
	    pr='http://schemas.openxmlformats.org/package/2006/relationships',
	)

	# --- Register every prefix for serialization, except the 'pr' one ---
	for prefix, uri in ns.items():
	    if prefix == 'pr':
	        continue
	    ET.register_namespace(prefix, uri)


	def _get_paragraph_details(p_element):
	    """Summarize the text carried by one ``<a:p>`` element.

	    Only the direct children of the paragraph are inspected:

	    * ``<a:r>`` (run): the first ``<a:t>`` / ``<a:rPr>`` found anywhere
	      below the run is used.
	    * ``<a:fld>`` (field): only an ``<a:t>`` / ``<a:rPr>`` that is a direct
	      child of the field is used.

	    Returns:
	        ``(text, rPr)`` where ``text`` is the concatenation of all collected
	        texts with surrounding whitespace stripped, and ``rPr`` is the
	        run-properties element of the first child that carried text (it may
	        be ``None``). Returns ``None`` when the paragraph has no text or the
	        stripped text is empty.
	    """
	    run_tag = f"{{{ns['a']}}}r"
	    field_tag = f"{{{ns['a']}}}fld"

	    collected = []
	    anchor_rpr = None
	    anchor_taken = False  # the first text-bearing child decides the rPr, once

	    for node in p_element:
	        if node.tag == run_tag:
	            text_path, rpr_path = './/a:t', './/a:rPr'
	        elif node.tag == field_tag:
	            text_path, rpr_path = './a:t', './a:rPr'
	        else:
	            # Anything else (pPr, br, endParaRPr, ...) carries no text here.
	            continue

	        text_node = node.find(text_path, ns)
	        if text_node is None or text_node.text is None:
	            continue

	        collected.append(text_node.text)
	        if not anchor_taken:
	            # May legitimately be None when the child has no rPr.
	            anchor_rpr = node.find(rpr_path, ns)
	            anchor_taken = True

	    merged = "".join(collected).strip()
	    return (merged, anchor_rpr) if merged else None

	# --- Main extraction entry point (returns one tuple per text paragraph) ---
	def extract_text_from_slide(slide_file):
	    """Extract the text paragraphs of a slide XML part.

	    Text bodies are looked up, in this order, in shapes directly under a
	    ``p:spTree``, shapes nested inside top-level ``p:grpSp`` groups, and
	    table cells (``a:tc``). Each ``<a:p>`` that is a direct child of such a
	    text body is passed to ``_get_paragraph_details``.

	    Args:
	        slide_file: Path of the slide XML file.

	    Returns:
	        A list of ``(text, rPr)`` tuples in document-scan order. The list is
	        empty when the file is missing or cannot be parsed; errors are
	        printed, never raised.
	    """
	    results = []

	    if not os.path.exists(slide_file):
	        print(f"Lỗi: File không tồn tại: {slide_file}")
	        print(f"--- Kết thúc trích xuất file: {slide_file} (Lỗi) ---")
	        return results

	    try:
	        root = ET.parse(slide_file).getroot()
	    except ET.ParseError as err:
	        print(f"Lỗi parse XML file {slide_file}: {err}")
	        print(f"--- Kết thúc trích xuất file: {slide_file} (Lỗi Parse) ---")
	        return results
	    except Exception as err:
	        print(f"Lỗi không xác định khi parse {slide_file}: {err}")
	        print(f"--- Kết thúc trích xuất file: {slide_file} (Lỗi Parse không xác định) ---")
	        return results

	    try:
	        # 1. Gather every container that may own a text body.
	        holders = list(root.findall('.//p:spTree/p:sp', ns))
	        for group in root.findall('.//p:spTree/p:grpSp', ns):
	            holders.extend(group.findall('.//p:sp', ns))
	        holders.extend(root.findall('.//a:tbl//a:tc', ns))

	        # 2. Visit each text body once and harvest its direct <a:p> children.
	        visited_bodies = set()
	        for holder in holders:
	            body = holder.find('./p:txBody', ns)
	            if body is None:
	                body = holder.find('./a:txBody', ns)
	            if body is None or body in visited_bodies:
	                continue

	            for paragraph in body.findall('a:p', ns):
	                info = _get_paragraph_details(paragraph)
	                if info:
	                    results.append(info)

	            visited_bodies.add(body)

	    except Exception as err:
	        # Best effort: keep whatever was collected before the failure.
	        print(f"Lỗi khi tìm kiếm hoặc trích xuất chi tiết <a:p>: {err}")

	    return results


	# Restyling applied to every rewritten paragraph.
	_SLIDE_FONT_SCALE = 0.85      # 'sz' is multiplied by this factor
	_SLIDE_MIN_FONT_SIZE = 100    # lower bound for the scaled 'sz' value
	_SLIDE_BOLD_MAX_CHARS = 10    # texts longer than this lose the 'b' attribute


	def _collect_slide_text_paragraphs(root):
	    """Return the ``<a:p>`` elements that carry text, in extraction order.

	    The scan order (top-level shapes, shapes inside top-level groups, table
	    cells) and the text test (``_get_paragraph_details``) are the ones used
	    by ``extract_text_from_slide``, so index *i* here designates the same
	    paragraph as item *i* of the extracted list.
	    """
	    containers = list(root.findall('.//p:spTree/p:sp', ns))
	    for group in root.findall('.//p:spTree/p:grpSp', ns):
	        containers.extend(group.findall('.//p:sp', ns))
	    containers.extend(root.findall('.//a:tbl//a:tc', ns))

	    seen_bodies = set()
	    paragraphs = []
	    for container in containers:
	        tx_body = container.find('./p:txBody', ns)
	        if tx_body is None:
	            tx_body = container.find('./a:txBody', ns)
	        if tx_body is None or tx_body in seen_bodies:
	            continue
	        seen_bodies.add(tx_body)
	        paragraphs.extend(
	            p_elem
	            for p_elem in tx_body.findall('a:p', ns)
	            if _get_paragraph_details(p_elem) is not None
	        )
	    return paragraphs


	def _build_slide_run_properties(source_rpr, text, i, p_id):
	    """Return the ``<a:rPr>`` to attach to the replacement run.

	    Starts from a deep copy of ``source_rpr`` when it is an ``<a:rPr>``
	    element (an empty ``<a:rPr>`` otherwise), shrinks the ``sz`` attribute
	    when present, and drops ``b`` when ``text`` is longer than
	    ``_SLIDE_BOLD_MAX_CHARS`` characters.
	    """
	    rpr_tag = f"{{{ns['a']}}}rPr"

	    final_rpr = None
	    if source_rpr is not None and ET.iselement(source_rpr) and source_rpr.tag == rpr_tag:
	        try:
	            final_rpr = copy.deepcopy(source_rpr)
	        except Exception as clone_e:
	            print(f"Lỗi sao chép rPr gốc cho <a:p> index {i} (ID {p_id}): {clone_e}")
	    if final_rpr is None:
	        final_rpr = ET.Element(rpr_tag)

	    original_sz_str = final_rpr.get('sz')
	    if original_sz_str:
	        try:
	            scaled = int(int(original_sz_str) * _SLIDE_FONT_SCALE)
	            final_rpr.set('sz', str(max(_SLIDE_MIN_FONT_SIZE, scaled)))
	        except ValueError:
	            print(f"Cảnh báo: Không thể chuyển đổi sz='{original_sz_str}' thành số nguyên cho p_id {p_id}.")

	    if len(text) > _SLIDE_BOLD_MAX_CHARS:
	        final_rpr.attrib.pop('b', None)

	    return final_rpr


	def _swap_paragraph_runs(p_elem, text, run_properties):
	    """Drop the direct runs/fields of ``p_elem`` and add one run with ``text``.

	    Nothing is added when ``text`` is empty. The new run is placed right
	    before ``<a:endParaRPr>`` when the paragraph has one, otherwise last.
	    """
	    for stale in p_elem.findall('a:r', ns) + p_elem.findall('a:fld', ns):
	        p_elem.remove(stale)

	    if not text:
	        return

	    new_r = ET.Element(f"{{{ns['a']}}}r")
	    new_r.append(run_properties)
	    ET.SubElement(new_r, f"{{{ns['a']}}}t").text = text

	    end_para_rpr = p_elem.find('./a:endParaRPr', ns)
	    if end_para_rpr is None:
	        p_elem.append(new_r)
	    else:
	        p_elem.insert(list(p_elem).index(end_para_rpr), new_r)


	def replace_text_in_slide(xml_file_path, list_of_translated_paragraph_data):
	    """Overwrite the text paragraphs of a slide XML part, in place.

	    Paragraphs are selected exactly as ``extract_text_from_slide`` selects
	    them, and item *i* of the data list is written into the *i*-th selected
	    paragraph: its runs and fields are replaced by a single run holding the
	    stripped text. When the two counts differ, only the common prefix is
	    processed and a warning is printed.

	    Args:
	        xml_file_path: Path of the slide XML file; it is rewritten in place.
	        list_of_translated_paragraph_data: Sequence of ``(text, rPr)`` pairs;
	            presumably the output of ``extract_text_from_slide`` with the
	            texts translated (verify against the caller). A ``text`` that is
	            not a ``str`` is treated as empty.

	    Returns:
	        ``True`` when the file was rewritten or had no text paragraph,
	        ``False`` when the file is missing or an error prevented writing.
	        Errors are printed, never raised.
	    """
	    if not os.path.exists(xml_file_path):
	        print(f"Lỗi: Không tìm thấy file XML nguồn '{xml_file_path}'.")
	        return False

	    try:
	        tree = ET.parse(xml_file_path)
	        paragraphs_to_modify = _collect_slide_text_paragraphs(tree.getroot())

	        num_paragraphs_found = len(paragraphs_to_modify)
	        num_data_items = len(list_of_translated_paragraph_data)

	        if num_paragraphs_found == 0:
	            if num_data_items > 0:
	                print(f"Cảnh báo: Đã cung cấp {num_data_items} mục dữ liệu nhưng không có <a:p> nào để áp dụng.")
	            return True

	        num_items_to_process = min(num_paragraphs_found, num_data_items)
	        if num_paragraphs_found != num_data_items:
	            print(f"CẢNH BÁO [...]: Số lượng <a:p> ({num_paragraphs_found}) KHÔNG KHỚP dữ liệu dịch ({num_data_items}).")
	            print(f"=> Sẽ chỉ xử lý {num_items_to_process} mục đầu tiên.")

	        for i, p_elem in enumerate(paragraphs_to_modify[:num_items_to_process]):
	            p_id = hex(id(p_elem))
	            try:
	                translated_text, original_rpr = list_of_translated_paragraph_data[i]
	                text = translated_text.strip() if isinstance(translated_text, str) else ""
	                run_properties = _build_slide_run_properties(original_rpr, text, i, p_id)
	                _swap_paragraph_runs(p_elem, text, run_properties)
	            except (IndexError, ValueError, TypeError) as data_err:
	                # A malformed data item only costs this paragraph.
	                print(f"Lỗi lấy dữ liệu tại index {i}: {data_err}. Bỏ qua mục này.")
	            except Exception as p_replace_err:
	                print(f"Lỗi khi xử lý thay thế cho <a:p> tại index {i} (ID {p_id}): {p_replace_err}")

	        # pretty_print is an lxml-only keyword; fall back when the active
	        # ElementTree implementation rejects it.
	        try:
	            tree.write(xml_file_path, encoding='utf-8', xml_declaration=True, pretty_print=True)
	        except TypeError:
	            tree.write(xml_file_path, encoding='utf-8', xml_declaration=True)
	        return True

	    except ET.ParseError as pe:
	        print(f"Lỗi parse XML file '{xml_file_path}': {pe}")
	        return False
	    except IOError as ioe:
	        print(f"Lỗi I/O với file '{xml_file_path}': {ioe}")
	        return False
	    except Exception as e:
	        print(f"Lỗi nghiêm trọng: {e}")
	        traceback.print_exc()
	        return False

	# --------------------------
	# 2. SmartArt handling
	# --------------------------
	def get_smartart_data_file(rels_file, base_path):
	    """Locate the SmartArt data part referenced by a ``.rels`` file.

	    Scans the ``Relationship`` entries of ``rels_file`` for the
	    ``diagramData`` relationship type. Every ``"../"`` in its ``Target`` is
	    removed and the remainder is joined onto ``base_path``.

	    Args:
	        rels_file: Path of the relationships (``.rels``) XML file.
	        base_path: Directory the cleaned-up target is resolved against.

	    Returns:
	        The normalized path of the first such target that exists on disk,
	        or ``None`` when the rels file is missing, no existing target is
	        found, or an error occurs (errors are printed, never raised).
	    """
	    diagram_data_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData'
	    try:
	        if not os.path.exists(rels_file):
	            return None

	        rels_root = ET.parse(rels_file).getroot()
	        for relationship in rels_root.findall('pr:Relationship', ns):
	            attributes = relationship.attrib
	            target = attributes.get('Target')
	            if attributes.get('Type') != diagram_data_type or not target:
	                continue

	            relative_target = target.replace("../", "")
	            absolute_path = os.path.normpath(os.path.join(base_path, relative_target))
	            if os.path.exists(absolute_path):
	                return absolute_path
	            print(f"Cảnh báo: Tìm thấy relationship SmartArt nhưng file target không tồn tại: {absolute_path}")
	        return None
	    except ET.ParseError as e:
	        print(f"Lỗi parse XML file rels {rels_file}: {e}")
	        return None
	    except Exception as e:
	        print(f"Lỗi khi xử lý file rels {rels_file}: {e}")
	        return None


	def extract_text_from_smartart(xml_file_path):
	    """Extract the text paragraphs of a SmartArt data XML part.

	    Every ``<a:p>`` at any depth is visited. The text of the first ``<a:t>``
	    of each run (``<a:r>``) below it is concatenated; paragraphs whose
	    stripped text is empty are skipped.

	    Args:
	        xml_file_path: Path of the SmartArt data XML file.

	    Returns:
	        A list of ``(text, rPr)`` tuples in document order, where ``rPr`` is
	        the run-properties element of the first run holding non-blank text
	        (``None`` when that run has none). An empty list is returned when
	        the file is missing or cannot be parsed; errors are printed, never
	        raised.
	    """
	    # Checked up front because the exception raised for a missing file
	    # depends on the ElementTree implementation: lxml raises a plain
	    # OSError, which the FileNotFoundError handler below would not catch.
	    if not os.path.exists(xml_file_path):
	        print(f"Lỗi: Không tìm thấy file XML '{xml_file_path}'.")
	        return []

	    paragraph_data = []
	    try:
	        root = ET.parse(xml_file_path).getroot()

	        for p_elem in root.findall('.//a:p', ns):
	            run_texts = []
	            first_rPr = None
	            first_rpr_taken = False

	            for r_elem in p_elem.findall('.//a:r', ns):
	                t_element = r_elem.find('.//a:t', ns)
	                if t_element is None or t_element.text is None:
	                    continue

	                run_texts.append(t_element.text)

	                # Formatting comes from the first run with visible text.
	                if not first_rpr_taken and t_element.text.strip():
	                    first_rPr = r_elem.find('.//a:rPr', ns)
	                    first_rpr_taken = True

	            cleaned_text = "".join(run_texts).strip()
	            if cleaned_text:
	                paragraph_data.append((cleaned_text, first_rPr))

	    except FileNotFoundError:
	        # Still reachable if the file vanishes between the check and parse.
	        print(f"Lỗi: Không tìm thấy file XML '{xml_file_path}'.")
	        return []
	    except ET.ParseError as pe:
	        print(f"Lỗi phân tích cú pháp XML file '{xml_file_path}': {pe}")
	        return []
	    except Exception as e:
	        print(f"Lỗi không xác định khi trích xuất text theo đoạn từ file '{xml_file_path}': {e}")
	        traceback.print_exc()
	        return []

	    return paragraph_data

	# --- Paragraph-by-paragraph replacement for SmartArt data parts ---
	def _smartart_paragraph_has_run_text(p_elem):
	    """Tell whether extraction would report this ``<a:p>``.

	    Uses the same rule as ``extract_text_from_smartart``: the first
	    ``<a:t>`` of every run below the paragraph is concatenated and the
	    result must be non-blank. Text held only by non-run elements (fields,
	    for instance) does not count.
	    """
	    pieces = []
	    for r_elem in p_elem.findall('.//a:r', ns):
	        t_elem = r_elem.find('.//a:t', ns)
	        if t_elem is not None and t_elem.text is not None:
	            pieces.append(t_elem.text)
	    return bool("".join(pieces).strip())


	def replace_text_in_smartart(xml_file_path, list_of_translated_paragraph_data, output_xml_file_path):
	    """Overwrite the text paragraphs of a SmartArt data XML part.

	    Paragraphs are selected with the rule used by
	    ``extract_text_from_smartart`` and item *i* of the data list is written
	    into the *i*-th selected paragraph: all its runs are removed and, when
	    the stripped text is not empty, one run carrying a copy of the supplied
	    ``<a:rPr>`` (or an empty one) is inserted before ``<a:endParaRPr>`` or
	    appended. When the counts differ only the common prefix is processed
	    and warnings are printed.

	    Args:
	        xml_file_path: Path of the SmartArt data XML file to read.
	        list_of_translated_paragraph_data: Sequence of ``(text, rPr)``
	            pairs; presumably the output of ``extract_text_from_smartart``
	            with the texts translated (verify against the caller).
	        output_xml_file_path: Where to write the result; a falsy value
	            means "overwrite ``xml_file_path``".

	    Returns:
	        ``True`` when the output file was written, ``False`` otherwise.
	        Errors are printed, never raised.
	    """
	    if not output_xml_file_path:
	        output_xml_file_path = xml_file_path

	    # Checked up front because lxml reports a missing file as a plain
	    # OSError, which would otherwise be logged below as a write error.
	    if not os.path.exists(xml_file_path):
	        print(f"Lỗi: Không tìm thấy file XML nguồn '{xml_file_path}'.")
	        return False

	    try:
	        tree = ET.parse(xml_file_path)
	        root = tree.getroot()

	        # Runs are located with './/', so their parent is looked up here.
	        parent_map = {child: parent for parent in root.iter() for child in parent}

	        paragraphs_to_modify = [
	            p_elem
	            for p_elem in root.findall('.//a:p', ns)
	            if _smartart_paragraph_has_run_text(p_elem)
	        ]

	        num_data_items = len(list_of_translated_paragraph_data)
	        if len(paragraphs_to_modify) != num_data_items:
	            print(f"Cảnh báo [File: {os.path.basename(xml_file_path)}]: Số lượng <a:p> có text ({len(paragraphs_to_modify)}) "
	                  f"không khớp số lượng dữ liệu dịch ({len(list_of_translated_paragraph_data)}). Thay thế có thể sai lệch.")
	        num_items_to_process = min(len(paragraphs_to_modify), num_data_items)

	        run_tag = f"{{{ns['a']}}}r"
	        rpr_tag = f"{{{ns['a']}}}rPr"
	        text_tag = f"{{{ns['a']}}}t"

	        for i in range(num_items_to_process):
	            p_elem = paragraphs_to_modify[i]
	            translated_text, original_first_rPr = list_of_translated_paragraph_data[i]
	            cleaned_translated_text = translated_text.strip() if translated_text else ""

	            # Remove the old runs wherever they sit below the paragraph.
	            for r_elem in p_elem.findall('.//a:r', ns):
	                parent = parent_map.get(r_elem)
	                if parent is not None:
	                    parent.remove(r_elem)

	            # An empty translation leaves the paragraph without runs.
	            if not cleaned_translated_text:
	                continue

	            # Reuse the original formatting when a real <a:rPr> was given.
	            run_properties = None
	            if original_first_rPr is not None and ET.iselement(original_first_rPr):
	                if original_first_rPr.tag == rpr_tag:
	                    try:
	                        run_properties = copy.deepcopy(original_first_rPr)
	                    except Exception as clone_e:
	                        print(f"Lỗi sao chép rPr cho <a:p> index {i} (data index {i}): {clone_e}")
	                else:
	                    print(f"Cảnh báo: Thẻ rPr gốc không phải <a:rPr> cho p_elem index {i}. Thẻ: {original_first_rPr.tag}")
	            if run_properties is None:
	                run_properties = ET.Element(rpr_tag)

	            new_r = ET.Element(run_tag)
	            new_r.append(run_properties)
	            ET.SubElement(new_r, text_tag).text = cleaned_translated_text

	            # Keep <a:endParaRPr> last: insert the run right before it.
	            end_para_rpr = p_elem.find('./a:endParaRPr', ns)
	            if end_para_rpr is None:
	                p_elem.append(new_r)
	            else:
	                p_elem.insert(list(p_elem).index(end_para_rpr), new_r)

	        if num_items_to_process < num_data_items:
	            print(f"Cảnh báo [File: {os.path.basename(xml_file_path)}]: Còn {num_data_items - num_items_to_process} "
	                  f"mục dữ liệu dịch chưa được sử dụng do số lượng <a:p> không đủ.")

	        # Register the prefixes again right before serializing.
	        for prefix, uri in ns.items():
	            ET.register_namespace(prefix, uri)
	        tree.write(output_xml_file_path, encoding='utf-8', xml_declaration=True)
	        return True

	    except FileNotFoundError:
	        print(f"Lỗi: Không tìm thấy file XML nguồn '{xml_file_path}'.")
	        return False
	    except ET.ParseError as pe:
	        print(f"Lỗi phân tích cú pháp XML file '{xml_file_path}': {pe}")
	        return False
	    except IOError as ioe:
	        print(f"Lỗi I/O khi ghi file '{output_xml_file_path}': {ioe}")
	        return False
	    except Exception as e:
	        print(f"Lỗi nghiêm trọng trong quá trình thay thế text SmartArt (theo đoạn) file '{xml_file_path}': {e}")
	        traceback.print_exc()
	        return False