File size: 9,911 Bytes
73196e5
 
 
 
 
 
58fa02f
 
f9becec
73196e5
314e765
73196e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58fa02f
314e765
58fa02f
 
 
 
 
 
314e765
58fa02f
314e765
 
 
 
58fa02f
 
314e765
58fa02f
 
 
 
314e765
 
 
58fa02f
 
314e765
58fa02f
 
 
 
314e765
58fa02f
 
314e765
 
 
 
 
58fa02f
 
314e765
58fa02f
 
314e765
 
73196e5
 
 
 
 
 
 
 
 
 
 
 
 
 
f9becec
73196e5
 
 
f9becec
73196e5
 
 
 
 
 
 
 
 
 
f9becec
 
73196e5
 
 
 
f9becec
73196e5
f9becec
73196e5
f9becec
73196e5
 
f9becec
 
 
 
 
 
73196e5
 
f9becec
 
73196e5
 
 
 
 
 
 
 
 
 
 
f9becec
73196e5
f9becec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73196e5
f9becec
 
73196e5
f9becec
 
73196e5
f9becec
 
 
 
 
 
 
 
 
 
73196e5
 
 
 
 
f9becec
73196e5
 
 
f9becec
73196e5
 
 
f9becec
73196e5
 
 
f9becec
73196e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import os
import zipfile
import google.generativeai as genai
import tempfile
import io
import json
import time
from google.api_core.exceptions import ResourceExhausted
import re

# Read the Gemini API key from the environment rather than hard-coding it:
# a key committed to source control is exposed to anyone who can read the
# file and should be treated as compromised (revoke and rotate it).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))


def unzip_office_file(pptx_file: io.BytesIO):
    """
    Extract a PPTX archive (given as an in-memory BytesIO) into a new
    temporary directory.

    Args:
        pptx_file: File-like object holding the zipped Office document.

    Returns:
        Path of the temporary directory (prefixed ``pptx_extract_``) that
        contains the extracted content. The caller owns this directory and
        is responsible for deleting it.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raise (for example
        ``zipfile.BadZipFile`` for invalid data). In that case the temporary
        directory is removed before the exception propagates.
    """
    import shutil  # local import: only needed for cleanup on failure

    # Create the temporary directory that will receive the extracted files.
    output_dir = tempfile.mkdtemp(prefix="pptx_extract_")

    try:
        with zipfile.ZipFile(pptx_file, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    except Exception:
        # Do not leave a half-filled directory behind when extraction fails.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise

    return output_dir

    
def translate_single_text(text: str, source_lang: str = 'English', target_lang: str = "Vietnamese",
                          max_retries: int = 5, base_delay: float = 5.0) -> str:
    """
    Translate one string with the 'gemini-2.0-flash' model.

    Args:
        text: Text to translate. Empty or whitespace-only input is skipped.
        source_lang: Name of the source language, inserted into the prompt.
        target_lang: Name of the target language, inserted into the prompt.
        max_retries: Number of retries after the first attempt when the API
            raises ``ResourceExhausted`` (so up to ``max_retries + 1`` calls).
        base_delay: Base back-off in seconds; the wait before retry ``n``
            (0-based) is ``base_delay * 2 ** n``.

    Returns:
        The translated text with surrounding whitespace stripped, or ``""``
        when the input is empty, the response has no content, an unexpected
        error occurs, or every attempt hits the quota limit.
    """
    if not text or not text.strip():
        return ""  # Nothing to translate.

    # The prompt and sampling settings do not change between attempts.
    system_prompt = f"""You are a translation engine.
Translate the following text accurately from {source_lang} to {target_lang}.
Provide *only* the translated text as a single string.
Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."""

    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
    full_prompt = system_prompt.strip() + "\n\n" + user_prompt.strip()
    generation_config = {
        'temperature': 0.2,
        'top_p': 1.0,
        'top_k': 1,
    }
    preview = text[:50]

    retries = 0
    while retries <= max_retries:
        try:
            model = genai.GenerativeModel('gemini-2.0-flash')
            response = model.generate_content(
                contents=full_prompt,
                generation_config=generation_config,
            )

            if response.candidates and response.candidates[0].content.parts:
                parts = response.candidates[0].content.parts
                return "".join(part.text for part in parts if hasattr(part, 'text')).strip()

            print(f"[!] Không nhận được nội dung hợp lệ từ API cho văn bản: '{preview}...'")
            return ""

        except ResourceExhausted:
            if retries >= max_retries:
                # Last attempt failed: give up right away instead of sleeping
                # through one more (and the longest) back-off for nothing.
                break
            wait_time = base_delay * (2 ** retries)
            print(f"[429] Quota exceeded khi dịch '{preview}...'. Thử lại sau {wait_time:.1f}s (lần {retries + 1}/{max_retries + 1}).")
            time.sleep(wait_time)
            retries += 1

        except Exception as e:
            # Best-effort translation: any other failure yields an empty result.
            print(f"[!] Lỗi không mong muốn khi dịch '{preview}...': {e}")
            return ""

    print(f"[x] Bỏ qua sau {max_retries + 1} lần thử không thành công cho '{preview}...'.")
    return ""

def preprocess_text(text_list):
    """
    Build an ``{index: string}`` mapping from a list of strings.

    Keys are the integer positions of the items in ``text_list``. An empty
    dict is returned when the argument is not a list or is an empty list.
    """
    # Only a non-empty list produces entries; everything else maps to {}.
    if isinstance(text_list, list) and text_list:
        return dict(enumerate(text_list))
    return {}


def translate_text(text_dict, source_lang='English', target_lang="Vietnamese", max_retries=5, base_delay: float = 5.0):
    """
    Translate all values of an ``{index: text}`` dict in one model request.

    The dict is serialized to a JSON object (keys converted to strings), sent
    to the 'gemini-2.0-flash' model together with translation instructions,
    and the JSON answer is parsed back into a dict with integer keys.

    Args:
        text_dict: Mapping of keys to the texts to translate.
        source_lang: Name of the source language, inserted into the prompt.
        target_lang: Name of the target language, inserted into the prompt.
        max_retries: Maximum number of API attempts. An attempt fails when the
            call raises or when the response text is empty.
        base_delay: Base back-off in seconds; the wait after failed attempt
            ``n`` (0-based) is ``base_delay * 2 ** n``.

    Returns:
        A dict with exactly the keys of ``text_dict``. Keys missing from the
        model's answer (or all keys, when no usable answer was obtained) map
        to ``""``. ``{}`` is returned when ``text_dict`` is not a dict or is
        empty.
    """
    def _dict_to_json_string(d):
        # JSON object keys must be strings; "{}" signals a serialization failure.
        json_compatible = {str(k): v for k, v in d.items()}
        try:
            return json.dumps(json_compatible, ensure_ascii=False, separators=(',', ':'))
        except Exception as e:
            print(f"Internal Error (_dict_to_json_string): {e}")
            return "{}"

    def _json_string_to_dict(s):
        # Parse the model's answer; keys that are not integers are dropped.
        res_dict = {}
        if not s or not isinstance(s, str):
            return {}
        try:
            raw = json.loads(s)
            if not isinstance(raw, dict):
                print(f"LLM response is not a JSON object: {s}")
                return {}
            for k_str, v in raw.items():
                try:
                    res_dict[int(k_str)] = v
                except ValueError:
                    print(f"Non-integer key '{k_str}' in LLM response.")
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
        except Exception as e:
            print(f"General error: {e}")
        return res_dict

    if not isinstance(text_dict, dict):
        print("translate_text_dict expected a dict, got:", type(text_dict))
        return {}
    if not text_dict:
        return {}

    json_input_string = _dict_to_json_string(text_dict)
    if json_input_string == "{}":
        print("Empty or invalid dictionary input.")
        return {key: "" for key in text_dict}

    system_prompt = f"""Translate the string values within the following JSON object .
        Follow these instructions carefully:
        1.  Analyze the entire JSON object to understand the context.
        2.  Translate *only* the string values.
        3.  Keep the original keys *exactly* as they are.
        4.  Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.
        5.  Preserve the original JSON structure perfectly.
        6.  Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
    """

    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string}\n\nTranslated JSON Output:"

    # The prompt and sampling settings are identical for every attempt.
    full_prompt = f"{system_prompt.strip()}\n\n{user_prompt.strip()}"
    generation_config = {
        'temperature': 0.3,
        'top_p': 1,
        'top_k': 1,
    }

    raw_translated_json_string = "{}"
    succeeded = False
    attempt = 0
    while attempt < max_retries:
        try:
            model = genai.GenerativeModel('gemini-2.0-flash')
            response = model.generate_content(
                contents=full_prompt,
                generation_config=generation_config,
            )

            candidate = ""
            if response and response.parts and hasattr(response.parts[0], 'text'):
                candidate = response.parts[0].text.strip()
            elif hasattr(response, 'text'):
                candidate = response.text.strip()

            # Clean markdown wrappers if present
            candidate = re.sub(r"^```(?:json)?|```$", "", candidate).strip()

            if candidate:
                raw_translated_json_string = candidate
                succeeded = True
                break  # Success, exit retry loop

            # An empty answer counts as a failed attempt; otherwise the loop
            # would repeat forever without any delay.
            failure_reason = "API trả về phản hồi rỗng"

        except Exception as e:
            failure_reason = e

        if attempt + 1 < max_retries:
            wait_time = base_delay * (2 ** attempt)
            print(f"[Retry {attempt+1}] Lỗi gọi API: {failure_reason}. Thử lại sau {wait_time:.2f} giây.")
            time.sleep(wait_time)
        else:
            # No attempt left: report the failure but skip the pointless sleep.
            print(f"[Retry {attempt+1}] Lỗi gọi API: {failure_reason}.")
        attempt += 1

    if not succeeded:
        print("❌ Hết số lần thử lại. Trả về JSON rỗng.")
        raw_translated_json_string = "{}"

    print(raw_translated_json_string)
    translated_intermediate_dict = _json_string_to_dict(raw_translated_json_string)

    # Return exactly the requested keys; anything the model omitted becomes "".
    final_translated_dict = {}
    missing_keys = []
    for key in text_dict:
        if key in translated_intermediate_dict:
            final_translated_dict[key] = translated_intermediate_dict[key]
        else:
            final_translated_dict[key] = ""
            missing_keys.append(key)

    if missing_keys:
        print(f"Cảnh báo: Thiếu keys: {sorted(missing_keys)}.")

    extra_keys = set(translated_intermediate_dict.keys()) - set(text_dict.keys())
    if extra_keys:
        print(f"Cảnh báo: Có keys không mong đợi: {sorted(extra_keys)}.")

    return final_translated_dict

# Function 3: Dictionary -> List
def postprocess_text(translated_dict):
    """
    Convert an ``{index: translated_text}`` dict back into a list ordered by
    index.

    Keys are coerced with ``int()``; keys that cannot be converted and
    negative indices are skipped with a warning. The result has
    ``max_index + 1`` entries, and positions without a value are filled
    with ``""``. If two keys coerce to the same index, the one iterated
    last wins.

    Returns:
        The list of texts, or ``[]`` when the argument is not a dict, is
        empty, or contains no usable index.
    """
    if not isinstance(translated_dict, dict):
        print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
        return []
    if not translated_dict:
        return []

    # Place every value at its integer index. Working from an index map
    # (instead of sorting (index, text) pairs and comparing lengths) keeps the
    # result correct for duplicate or negative indices and never compares the
    # text values with each other.
    texts_by_index = {}
    for k, v in translated_dict.items():
        try:
            index = int(k)
        except (ValueError, TypeError, OverflowError):
            print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")
            continue
        if index < 0:
            print(f"Warning: postprocess ignores negative index '{k}', skipping.")
            continue
        texts_by_index[index] = v

    if not texts_by_index:
        print("Warning: No sortable items found in dictionary for postprocessing.")
        return []

    expected_length = max(texts_by_index) + 1
    if len(texts_by_index) != expected_length:
        print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(texts_by_index)}.")

    # Missing positions are filled with empty strings.
    return [texts_by_index.get(i, "") for i in range(expected_length)]