File size: 4,959 Bytes
73196e5
 
 
 
 
 
 
ff93898
73196e5
ff93898
73196e5
ff93898
73196e5
 
 
 
 
 
 
 
ff93898
 
73196e5
ff93898
73196e5
 
 
 
 
 
 
ff93898
 
73196e5
 
 
 
ff93898
73196e5
 
 
 
 
ff93898
73196e5
ff93898
73196e5
 
 
 
 
 
 
ff93898
 
73196e5
 
 
ff93898
 
73196e5
 
 
 
 
 
ff93898
 
73196e5
 
ff93898
73196e5
 
 
 
 
 
ff93898
 
73196e5
 
 
 
 
 
 
 
ff93898
 
73196e5
ff93898
 
 
 
 
 
 
 
 
 
 
73196e5
ff93898
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import zipfile
import shutil
from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
from powerpoint.xml_handling import *
from io import BytesIO

def create_pptx_from_dir(temp_dir, pptx_filename):
    """
    Zip the contents of an extracted PPTX directory into an in-memory archive.

    Every file found under ``temp_dir`` (recursively) is stored with a path
    relative to ``temp_dir``, using DEFLATE compression.

    Args:
        temp_dir: Directory holding the unpacked PPTX parts.
        pptx_filename: Name to associate with the archive; returned unchanged.

    Returns:
        A ``(buffer, pptx_filename)`` tuple where ``buffer`` is a ``BytesIO``
        positioned at offset 0 and containing the zipped data.
    """
    archive_stream = BytesIO()
    archive = zipfile.ZipFile(archive_stream, 'w', zipfile.ZIP_DEFLATED)

    with archive:
        for current_dir, _subdirs, names in os.walk(temp_dir):
            for name in names:
                source = os.path.join(current_dir, name)
                # Store paths relative to the root so the archive has no
                # leading temp-directory components.
                archive.write(source, os.path.relpath(source, temp_dir))

    # Rewind so callers can read the archive from the beginning.
    archive_stream.seek(0)
    return archive_stream, pptx_filename

def _slide_number(file_name):
    """Return the integer N from a ``slide<N>.xml`` file name, or None if it does not match."""
    if not (file_name.startswith("slide") and file_name.endswith(".xml")):
        return None
    try:
        # Strip the "slide" prefix (5 chars) and ".xml" suffix (4 chars).
        return int(file_name[5:-4])
    except ValueError:
        return None


def _translate_text_pairs(text_pairs, source_lang, target_lang):
    """Translate the text half of each ``(text, rPr)`` pair, keeping ``rPr`` attached.

    Returns a list of ``(translated_text, rPr)`` tuples with the same length
    and order as ``text_pairs``.
    """
    if not text_pairs:
        # Nothing to translate: skip the round trip to the translation backend.
        return []

    originals = [text for text, _ in text_pairs]
    translated = postprocess_text(
        translate_text(preprocess_text(originals), source_lang, target_lang)
    )

    result = []
    for position, (original_text, rPr) in enumerate(text_pairs):
        if position < len(translated):
            new_text = translated[position]
        else:
            # Fewer segments came back than were sent. Keep the source text
            # rather than blanking the run, so no slide content is lost.
            new_text = original_text
        result.append((new_text, rPr))
    return result


def translate_and_replace_pptx(xml_folder, source_lang='vn', target_lang='en', slides_per_batch=5):
    """
    Translate the text of every slide in an unpacked PPTX, in place.

    Slides under ``<xml_folder>/ppt/slides`` are processed in numeric order,
    ``slides_per_batch`` at a time. For each batch, the text extracted from the
    slides and from any SmartArt data file referenced by a slide's ``.rels``
    file is translated in one call each, then written back to the XML files.

    Args:
        xml_folder: Root directory of the unpacked PPTX.
        source_lang: Source language code passed to ``translate_text``.
        target_lang: Target language code passed to ``translate_text``.
        slides_per_batch: Number of slides whose text is translated together.

    Raises:
        ValueError: If ``slides_per_batch`` is smaller than 1.
    """
    if slides_per_batch < 1:
        raise ValueError("slides_per_batch must be a positive integer")

    slides_dir = os.path.join(xml_folder, "ppt/slides")
    rels_dir = os.path.join(xml_folder, "ppt/slides/_rels")
    base_path = os.path.join(xml_folder, "ppt")

    # Keep only files that really look like slide<N>.xml, ordered by N.
    numbered_slides = []
    for file_name in os.listdir(slides_dir):
        number = _slide_number(file_name)
        if number is not None:
            numbered_slides.append((number, file_name))
    numbered_slides.sort()
    all_slides = [file_name for _, file_name in numbered_slides]

    for batch_start in range(0, len(all_slides), slides_per_batch):
        batch_slides = all_slides[batch_start:batch_start + slides_per_batch]

        # Per-slide records, kept in slide order: (target file, extracted pairs).
        slide_entries = []
        smartart_entries = []
        for slide_file in batch_slides:
            slide_path = os.path.join(slides_dir, slide_file)
            slide_entries.append((slide_path, extract_text_from_slide(slide_path)))

            rels_file = os.path.join(rels_dir, slide_file + ".rels")
            smartart_data_path = get_smartart_data_file(rels_file, base_path)
            if smartart_data_path:
                smartart_entries.append(
                    (smartart_data_path, extract_text_from_smartart(smartart_data_path))
                )

        # Merge the batch's text so each kind is translated with a single call.
        combined_slide_pairs = [pair for _, pairs in slide_entries for pair in pairs]
        combined_smartart_pairs = [pair for _, pairs in smartart_entries for pair in pairs]

        # Translate.
        translated_slide_data = _translate_text_pairs(combined_slide_pairs, source_lang, target_lang)
        translated_smartart_data = _translate_text_pairs(combined_smartart_pairs, source_lang, target_lang)

        # Hand each slide back exactly as many translated items as it contributed.
        offset = 0
        for slide_path, pairs in slide_entries:
            count = len(pairs)
            replace_text_in_slide(slide_path, translated_slide_data[offset:offset + count])
            offset += count

        offset = 0
        for smartart_data_path, pairs in smartart_entries:
            count = len(pairs)
            replace_text_in_smartart(
                smartart_data_path, translated_smartart_data[offset:offset + count], None
            )
            offset += count

def translate_pptx(file_obj: BytesIO, file_name: str, source_lang='vn', target_lang='en', slides_per_batch=5):
    """
    Main entry point: translate a PPTX given as a BytesIO and return the result.

    The file is unpacked with ``unzip_office_file``, its slide XML is
    translated in place, and the directory is re-zipped into a new buffer.
    The extracted directory is removed afterwards, even when translation or
    re-zipping fails.

    Args:
        file_obj: In-memory PPTX file; it is rewound before being unpacked.
        file_name: Name returned alongside the translated buffer.
        source_lang: Source language code forwarded to the translator.
        target_lang: Target language code forwarded to the translator.
        slides_per_batch: Number of slides translated per batch.

    Returns:
        A ``(translated_io, translated_filename)`` tuple, where
        ``translated_io`` is a BytesIO holding the translated PPTX.
    """
    file_obj.seek(0)
    xml_folder = unzip_office_file(file_obj)

    try:
        translate_and_replace_pptx(xml_folder, source_lang, target_lang, slides_per_batch)
        translated_io, translated_filename = create_pptx_from_dir(xml_folder, file_name)
    finally:
        # Always remove the extracted tree so a failure does not leak it.
        # Cleanup is best-effort: a removal error must not mask the exception
        # (or the successful result) of the work above.
        shutil.rmtree(xml_folder, ignore_errors=True)

    return translated_io, translated_filename