# mintlee's picture
# add no mongodb
# ff93898
import os
import zipfile
import shutil
from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
from powerpoint.xml_handling import *
from io import BytesIO
def create_pptx_from_dir(temp_dir, pptx_filename):
    """
    Pack an unzipped PPTX directory tree into an in-memory archive.

    Every file found while walking ``temp_dir`` is stored, DEFLATE-compressed,
    under its path relative to ``temp_dir``.

    Returns a ``(buffer, pptx_filename)`` tuple: ``buffer`` is a ``BytesIO``
    rewound to position 0, and ``pptx_filename`` is handed back unchanged.
    """
    archive_bytes = BytesIO()
    with zipfile.ZipFile(archive_bytes, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        # Flatten the directory walk into a single stream of file paths.
        member_paths = (
            os.path.join(current_dir, filename)
            for current_dir, _subdirs, filenames in os.walk(temp_dir)
            for filename in filenames
        )
        for member_path in member_paths:
            archive.write(member_path, os.path.relpath(member_path, temp_dir))
    # Rewind so callers can read the archive from the beginning.
    archive_bytes.seek(0)
    return archive_bytes, pptx_filename
def translate_and_replace_pptx(xml_folder, source_lang='vn', target_lang='en', slides_per_batch=5):
slides_dir = os.path.join(xml_folder, "ppt/slides")
all_slides = sorted([f for f in os.listdir(slides_dir)
if f.startswith("slide") and f.endswith(".xml")],
key=lambda x: int(x[5:-4]))
for i in range(0, len(all_slides), slides_per_batch):
batch_slides = all_slides[i:i + slides_per_batch]
slide_text_mapping = {}
smartart_text_mapping = {}
for slide_file in batch_slides:
slide_index = int(slide_file[5:-4])
slide_path = os.path.join(slides_dir, slide_file)
slide_text_mapping[slide_index] = extract_text_from_slide(slide_path)
rels_file = os.path.join(xml_folder, "ppt/slides/_rels", slide_file + ".rels")
base_path = os.path.join(xml_folder, "ppt")
smartart_data_path = get_smartart_data_file(rels_file, base_path)
if smartart_data_path:
smartart_text_mapping[slide_index] = extract_text_from_smartart(smartart_data_path)
# Gộp text
combined_slide_text_list = []
for slide_index in sorted(slide_text_mapping.keys()):
combined_slide_text_list.extend(slide_text_mapping[slide_index])
combined_smartart_text_list = []
for slide_index in sorted(smartart_text_mapping.keys()):
combined_smartart_text_list.extend(smartart_text_mapping[slide_index])
# Dịch
slide_texts_to_translate = [text for text, _ in combined_slide_text_list]
smartart_texts_to_translate = [text for text, _ in combined_smartart_text_list]
translated_slide_string = translate_text(preprocess_text(slide_texts_to_translate), source_lang, target_lang)
translated_smartart_string = translate_text(preprocess_text(smartart_texts_to_translate), source_lang, target_lang)
translated_slide_texts = postprocess_text(translated_slide_string)
translated_smartart_texts = postprocess_text(translated_smartart_string)
translated_slide_data = []
for i, (original_text, rPr) in enumerate(combined_slide_text_list):
translated_slide_data.append((translated_slide_texts[i] if i < len(translated_slide_texts) else "", rPr))
translated_smartart_data = []
for i, (original_text, rPr) in enumerate(combined_smartart_text_list):
translated_smartart_data.append((translated_smartart_texts[i] if i < len(translated_smartart_texts) else "", rPr))
for slide_index in sorted(slide_text_mapping.keys()):
slide_file = f"slide{slide_index}.xml"
slide_path = os.path.join(slides_dir, slide_file)
num_texts = len(slide_text_mapping[slide_index])
replace_data = translated_slide_data[:num_texts]
replace_text_in_slide(slide_path, replace_data)
translated_slide_data = translated_slide_data[num_texts:]
for slide_index in sorted(smartart_text_mapping.keys()):
rels_file = os.path.join(xml_folder, "ppt/slides/_rels", f"slide{slide_index}.xml.rels")
base_path = os.path.join(xml_folder, "ppt")
smartart_data_path = get_smartart_data_file(rels_file, base_path)
if smartart_data_path:
num_texts = len(smartart_text_mapping[slide_index])
replace_data = translated_smartart_data[:num_texts]
replace_text_in_smartart(smartart_data_path, replace_data, None)
translated_smartart_data = translated_smartart_data[num_texts:]
def translate_pptx(file_obj: BytesIO, file_name: str, source_lang='vn', target_lang='en', slides_per_batch=5):
    """
    Main entry point: translate a PPTX given as a BytesIO and return the result.

    The input is unpacked with ``unzip_office_file``, its slides are translated
    in place, and the directory is zipped back into a new in-memory archive.

    Args:
        file_obj: The source PPTX; it is rewound to position 0 before reading.
        file_name: Filename returned alongside the translated archive.
        source_lang: Source language code for the translation.
        target_lang: Target language code for the translation.
        slides_per_batch: Number of slides translated per request.

    Returns:
        A ``(BytesIO, filename)`` tuple as produced by ``create_pptx_from_dir``.
    """
    file_obj.seek(0)
    # NOTE(review): unzip_office_file presumably returns the path of a freshly
    # created directory, since it is joined as a path and removed below - confirm.
    xml_folder = unzip_office_file(file_obj)
    try:
        translate_and_replace_pptx(xml_folder, source_lang, target_lang, slides_per_batch)
        return create_pptx_from_dir(xml_folder, file_name)
    finally:
        # Remove the unpacked directory even when translation or repacking
        # fails, so a failed request does not leave it behind on disk.
        shutil.rmtree(xml_folder)