Spaces:
Running
Running
File size: 4,959 Bytes
73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 73196e5 ff93898 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import os
import zipfile
import shutil
from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
from powerpoint.xml_handling import *
from io import BytesIO
def create_pptx_from_dir(temp_dir, pptx_filename):
    """
    Pack an unzipped PPTX directory tree back into an in-memory archive.

    Every file found while walking ``temp_dir`` is written into a
    deflate-compressed ZIP under its path relative to ``temp_dir``.

    Returns:
        A ``(buffer, pptx_filename)`` tuple: the BytesIO holding the archive,
        rewound to position 0, and the filename exactly as it was passed in.
    """
    archive_buffer = BytesIO()
    # Lazily yield the full path of every regular file below temp_dir.
    source_paths = (
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(temp_dir)
        for filename in filenames
    )
    with zipfile.ZipFile(archive_buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
        for source_path in source_paths:
            # Archive member names are relative to the package root.
            member_name = os.path.relpath(source_path, temp_dir)
            archive.write(source_path, member_name)
    # Rewind so callers can read the archive from the beginning.
    archive_buffer.seek(0)
    return archive_buffer, pptx_filename
def _translate_text_items(text_items, source_lang, target_lang):
    """Translate the text half of ``(text, rPr)`` pairs, keeping each rPr as is."""
    if not text_items:
        # Nothing to translate: skip the preprocess/translate/postprocess
        # round-trip instead of sending an empty payload to the translator.
        return []
    originals = [text for text, _ in text_items]
    translated_string = translate_text(preprocess_text(originals), source_lang, target_lang)
    translated_texts = postprocess_text(translated_string)
    # When fewer translations come back than were sent, the remaining entries
    # are replaced with an empty string.
    return [
        (translated_texts[position] if position < len(translated_texts) else "", rpr)
        for position, (_, rpr) in enumerate(text_items)
    ]


def translate_and_replace_pptx(xml_folder, source_lang='vn', target_lang='en', slides_per_batch=5):
    """
    Translate the text of every slide in an unzipped PPTX, in place.

    Slides under ``<xml_folder>/ppt/slides`` are processed in numeric order,
    ``slides_per_batch`` at a time. For each batch the slide texts (and the
    texts of any SmartArt data file referenced from a slide's ``.rels`` file)
    are collected, translated together, and written back to the same XML
    files.

    Args:
        xml_folder: Root directory of the unzipped presentation.
        source_lang: Language code passed through to ``translate_text``.
            NOTE(review): 'vn' is not the ISO 639-1 code for Vietnamese
            ('vi'); confirm which codes ``translate_text`` expects.
        target_lang: Language code passed through to ``translate_text``.
        slides_per_batch: Number of slides translated per translation call.

    Returns:
        None. The slide and SmartArt XML files are modified on disk.
    """
    slides_dir = os.path.join(xml_folder, "ppt/slides")
    rels_dir = os.path.join(xml_folder, "ppt/slides/_rels")
    base_path = os.path.join(xml_folder, "ppt")

    # "slide<N>.xml" -> order by N so slide10 sorts after slide2.
    all_slides = sorted(
        (f for f in os.listdir(slides_dir) if f.startswith("slide") and f.endswith(".xml")),
        key=lambda name: int(name[5:-4]),
    )

    for batch_start in range(0, len(all_slides), slides_per_batch):
        batch_slides = all_slides[batch_start:batch_start + slides_per_batch]

        # Each entry pairs the file to rewrite with the (text, rPr) items
        # extracted from it. batch_slides is already in slide order, so the
        # entries are too.
        slide_entries = []
        smartart_entries = []
        for slide_file in batch_slides:
            slide_path = os.path.join(slides_dir, slide_file)
            slide_entries.append((slide_path, extract_text_from_slide(slide_path)))

            rels_file = os.path.join(rels_dir, slide_file + ".rels")
            smartart_data_path = get_smartart_data_file(rels_file, base_path)
            if smartart_data_path:
                # Keep the resolved path so it is not looked up a second time
                # when the translated text is written back.
                smartart_entries.append(
                    (smartart_data_path, extract_text_from_smartart(smartart_data_path))
                )

        # Merge the texts of the whole batch
        combined_slide_items = [item for _, items in slide_entries for item in items]
        combined_smartart_items = [item for _, items in smartart_entries for item in items]

        # Translate
        translated_slide_items = _translate_text_items(combined_slide_items, source_lang, target_lang)
        translated_smartart_items = _translate_text_items(combined_smartart_items, source_lang, target_lang)

        # Hand each file the slice of translations matching the number of
        # texts that were extracted from it.
        offset = 0
        for slide_path, items in slide_entries:
            count = len(items)
            replace_text_in_slide(slide_path, translated_slide_items[offset:offset + count])
            offset += count

        offset = 0
        for smartart_data_path, items in smartart_entries:
            count = len(items)
            replace_text_in_smartart(
                smartart_data_path, translated_smartart_items[offset:offset + count], None
            )
            offset += count
def translate_pptx(file_obj: BytesIO, file_name: str, source_lang='vn', target_lang='en', slides_per_batch=5):
    """
    Main entry point: translate a PPTX held in memory and return the result in memory.

    The input is unzipped with ``unzip_office_file``, its slides are
    translated in place, and the folder is zipped back into a new buffer.
    The extracted folder is removed afterwards, whether or not translation
    succeeded.

    Args:
        file_obj: BytesIO containing the source PPTX; it is rewound before reading.
        file_name: Filename to return alongside the translated buffer.
        source_lang: Source language code forwarded to the translation step.
        target_lang: Target language code forwarded to the translation step.
        slides_per_batch: Number of slides translated per batch.

    Returns:
        A ``(translated_io, translated_filename)`` tuple as produced by
        ``create_pptx_from_dir``.
    """
    file_obj.seek(0)
    xml_folder = unzip_office_file(file_obj)
    try:
        translate_and_replace_pptx(xml_folder, source_lang, target_lang, slides_per_batch)
        translated_io, translated_filename = create_pptx_from_dir(xml_folder, file_name)
    finally:
        # Always delete the extracted folder; without this, any exception
        # raised while translating or re-zipping would leave it on disk.
        shutil.rmtree(xml_folder)
    return translated_io, translated_filename
|