Spaces:
Running
Running
- .env +1 -1
- excel/__pycache__/excel_translate.cpython-310.pyc +0 -0
- excel/excel_translate.py +14 -15
- pages/upload.py +3 -3
- powerpoint/__pycache__/pptx_object.cpython-310.pyc +0 -0
- powerpoint/__pycache__/xml_handling.cpython-310.pyc +0 -0
- powerpoint/pptx_object.py +12 -15
- powerpoint/xml_handling.py +3 -2
- translate/__pycache__/translator.cpython-310.pyc +0 -0
- translate/translator.py +1 -1
.env
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
GEMINI_API_KEY =
|
|
|
|
| 1 |
+
GEMINI_API_KEY = AIzaSyAk1LTwWMZyTfPAKmsn6JzFtI1MpnI7FH8
|
excel/__pycache__/excel_translate.cpython-310.pyc
CHANGED
|
Binary files a/excel/__pycache__/excel_translate.cpython-310.pyc and b/excel/__pycache__/excel_translate.cpython-310.pyc differ
|
|
|
excel/excel_translate.py
CHANGED
|
@@ -10,26 +10,27 @@ import gridfs
|
|
| 10 |
import tempfile
|
| 11 |
import os
|
| 12 |
|
| 13 |
-
def translate_xlsx(file_id: str,
|
| 14 |
# Kết nối MongoDB
|
| 15 |
client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
|
| 16 |
-
db = client[
|
| 17 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
| 18 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
| 19 |
|
| 20 |
# Tải file từ MongoDB
|
| 21 |
-
file_data = fs_input.get(file_id)
|
|
|
|
| 22 |
|
| 23 |
# Lưu file tạm thời
|
| 24 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
|
| 25 |
-
temp_file.write(file_data)
|
| 26 |
temp_file_path = temp_file.name
|
| 27 |
|
| 28 |
# Đọc file Excel bằng openpyxl
|
| 29 |
wb = openpyxl.load_workbook(temp_file_path)
|
| 30 |
|
| 31 |
-
|
| 32 |
-
sheets =
|
| 33 |
|
| 34 |
for ws in sheets:
|
| 35 |
max_row = ws.max_row
|
|
@@ -48,7 +49,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
|
|
| 48 |
cell_map[key] = cell
|
| 49 |
|
| 50 |
# Gọi hàm dịch theo dạng bulk
|
| 51 |
-
translated_dict = translate_text_dict(text_dict, target_lang=target_lang
|
| 52 |
|
| 53 |
# Cập nhật lại các cell với nội dung đã dịch
|
| 54 |
for key, cell in cell_map.items():
|
|
@@ -61,7 +62,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
|
|
| 61 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
|
| 62 |
wb.save(output_file.name)
|
| 63 |
output_file.seek(0)
|
| 64 |
-
translated_file_id = fs_output.put(output_file.read(), filename=
|
| 65 |
|
| 66 |
# Đóng workbook và xóa file tạm
|
| 67 |
wb.close()
|
|
@@ -94,10 +95,10 @@ def read_csv_with_auto_encoding(csv_path):
|
|
| 94 |
return df
|
| 95 |
|
| 96 |
|
| 97 |
-
def translate_csv(file_id, target_lang="vi",
|
| 98 |
# Kết nối MongoDB
|
| 99 |
client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
|
| 100 |
-
db = client[
|
| 101 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
| 102 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
| 103 |
|
|
@@ -113,9 +114,8 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
|
|
| 113 |
|
| 114 |
# If text_columns is not specified, we assume we want to translate everything that looks like text.
|
| 115 |
# Otherwise, only translate the given columns.
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
text_columns = df.select_dtypes(include=["object"]).columns.tolist()
|
| 119 |
|
| 120 |
num_rows = len(df)
|
| 121 |
num_chunks = math.ceil(num_rows / chunk_size)
|
|
@@ -143,8 +143,7 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
|
|
| 143 |
# Now call your LLM translator on this dictionary
|
| 144 |
translated_chunk = translate_text_dict(
|
| 145 |
text_dict=chunk_dict,
|
| 146 |
-
target_lang=target_lang
|
| 147 |
-
gemini_api=gemini_api
|
| 148 |
)
|
| 149 |
|
| 150 |
# 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
|
|
|
|
| 10 |
import tempfile
|
| 11 |
import os
|
| 12 |
|
| 13 |
+
def translate_xlsx(file_id: str, target_lang: str = ""):
|
| 14 |
# Kết nối MongoDB
|
| 15 |
client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
|
| 16 |
+
db = client["excel"]
|
| 17 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
| 18 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
| 19 |
|
| 20 |
# Tải file từ MongoDB
|
| 21 |
+
file_data = fs_input.get(file_id)
|
| 22 |
+
|
| 23 |
|
| 24 |
# Lưu file tạm thời
|
| 25 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
|
| 26 |
+
temp_file.write(file_data.read())
|
| 27 |
temp_file_path = temp_file.name
|
| 28 |
|
| 29 |
# Đọc file Excel bằng openpyxl
|
| 30 |
wb = openpyxl.load_workbook(temp_file_path)
|
| 31 |
|
| 32 |
+
|
| 33 |
+
sheets = wb.worksheets # Chọn tất cả sheets nếu sheet_name không hợp lệ
|
| 34 |
|
| 35 |
for ws in sheets:
|
| 36 |
max_row = ws.max_row
|
|
|
|
| 49 |
cell_map[key] = cell
|
| 50 |
|
| 51 |
# Gọi hàm dịch theo dạng bulk
|
| 52 |
+
translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
| 53 |
|
| 54 |
# Cập nhật lại các cell với nội dung đã dịch
|
| 55 |
for key, cell in cell_map.items():
|
|
|
|
| 62 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
|
| 63 |
wb.save(output_file.name)
|
| 64 |
output_file.seek(0)
|
| 65 |
+
translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
|
| 66 |
|
| 67 |
# Đóng workbook và xóa file tạm
|
| 68 |
wb.close()
|
|
|
|
| 95 |
return df
|
| 96 |
|
| 97 |
|
| 98 |
+
def translate_csv(file_id, target_lang="vi", chunk_size=50):
|
| 99 |
# Kết nối MongoDB
|
| 100 |
client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
|
| 101 |
+
db = client["csv"]
|
| 102 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
| 103 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
| 104 |
|
|
|
|
| 114 |
|
| 115 |
# If text_columns is not specified, we assume we want to translate everything that looks like text.
|
| 116 |
# Otherwise, only translate the given columns.
|
| 117 |
+
|
| 118 |
+
text_columns = df.select_dtypes(include=["object"]).columns.tolist()
|
|
|
|
| 119 |
|
| 120 |
num_rows = len(df)
|
| 121 |
num_chunks = math.ceil(num_rows / chunk_size)
|
|
|
|
| 143 |
# Now call your LLM translator on this dictionary
|
| 144 |
translated_chunk = translate_text_dict(
|
| 145 |
text_dict=chunk_dict,
|
| 146 |
+
target_lang=target_lang
|
|
|
|
| 147 |
)
|
| 148 |
|
| 149 |
# 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
|
pages/upload.py
CHANGED
|
@@ -33,11 +33,11 @@ def process_file(file, file_type):
|
|
| 33 |
translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
| 34 |
progress_bar.progress(60)
|
| 35 |
final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
|
| 36 |
-
final_id = create_translated_ppt("
|
| 37 |
elif file_type == "Excel":
|
| 38 |
-
final_id = translate_xlsx(file_id
|
| 39 |
elif file_type == "CSV":
|
| 40 |
-
final_id = translate_csv(file_id
|
| 41 |
elif file_type == "Word":
|
| 42 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
| 43 |
else:
|
|
|
|
| 33 |
translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
| 34 |
progress_bar.progress(60)
|
| 35 |
final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
|
| 36 |
+
final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
|
| 37 |
elif file_type == "Excel":
|
| 38 |
+
final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
|
| 39 |
elif file_type == "CSV":
|
| 40 |
+
final_id = translate_csv(file_id = file_id, target_lang = target_lang)
|
| 41 |
elif file_type == "Word":
|
| 42 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
| 43 |
else:
|
powerpoint/__pycache__/pptx_object.cpython-310.pyc
CHANGED
|
Binary files a/powerpoint/__pycache__/pptx_object.cpython-310.pyc and b/powerpoint/__pycache__/pptx_object.cpython-310.pyc differ
|
|
|
powerpoint/__pycache__/xml_handling.cpython-310.pyc
CHANGED
|
Binary files a/powerpoint/__pycache__/xml_handling.cpython-310.pyc and b/powerpoint/__pycache__/xml_handling.cpython-310.pyc differ
|
|
|
powerpoint/pptx_object.py
CHANGED
|
@@ -283,7 +283,8 @@ def get_file_from_mongodb(db_name, collection_name, file_id):
|
|
| 283 |
db = client[db_name]
|
| 284 |
fs = GridFS(db, collection_name)
|
| 285 |
file_data = fs.get(file_id)
|
| 286 |
-
return
|
|
|
|
| 287 |
|
| 288 |
|
| 289 |
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
|
@@ -292,18 +293,19 @@ def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
|
| 292 |
db = client[db_name]
|
| 293 |
fs = GridFS(db, collection_name)
|
| 294 |
file_id = fs.put(file_data, filename=file_name)
|
|
|
|
| 295 |
return file_id
|
| 296 |
|
| 297 |
def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
|
| 298 |
"""Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
|
| 299 |
try:
|
| 300 |
# Kết nối MongoDB và tải file
|
| 301 |
-
|
| 302 |
-
|
| 303 |
|
| 304 |
# Load PowerPoint gốc và XML dịch
|
| 305 |
-
prs = Presentation(
|
| 306 |
-
tree = ET.parse(
|
| 307 |
root = tree.getroot()
|
| 308 |
|
| 309 |
# Áp dụng bản dịch
|
|
@@ -335,23 +337,18 @@ def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_co
|
|
| 335 |
except Exception as e:
|
| 336 |
print(f"Error applying shape properties: {str(e)}")
|
| 337 |
|
| 338 |
-
# Lưu PowerPoint vào MongoDB
|
| 339 |
output_io = BytesIO()
|
| 340 |
prs.save(output_io)
|
| 341 |
output_io.seek(0) # Reset vị trí đọc
|
| 342 |
|
| 343 |
-
|
|
|
|
|
|
|
|
|
|
| 344 |
print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
|
| 345 |
|
| 346 |
return file_id
|
| 347 |
except Exception as e:
|
| 348 |
print(f"Error creating translated PowerPoint: {str(e)}")
|
| 349 |
return None
|
| 350 |
-
|
| 351 |
-
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
| 352 |
-
"""Lưu tệp vào MongoDB GridFS"""
|
| 353 |
-
client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
|
| 354 |
-
db = client[db_name]
|
| 355 |
-
fs = GridFS(db, collection_name)
|
| 356 |
-
file_id = fs.put(file_data, filename=file_name)
|
| 357 |
-
return file_id
|
|
|
|
| 283 |
db = client[db_name]
|
| 284 |
fs = GridFS(db, collection_name)
|
| 285 |
file_data = fs.get(file_id)
|
| 286 |
+
return file_data
|
| 287 |
+
# return BytesIO(file_data.read())
|
| 288 |
|
| 289 |
|
| 290 |
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
|
|
|
| 293 |
db = client[db_name]
|
| 294 |
fs = GridFS(db, collection_name)
|
| 295 |
file_id = fs.put(file_data, filename=file_name)
|
| 296 |
+
client.close()
|
| 297 |
return file_id
|
| 298 |
|
| 299 |
def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
|
| 300 |
"""Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
|
| 301 |
try:
|
| 302 |
# Kết nối MongoDB và tải file
|
| 303 |
+
original_ppt= get_file_from_mongodb(db_name, "root_file", original_ppt_id)
|
| 304 |
+
translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
|
| 305 |
|
| 306 |
# Load PowerPoint gốc và XML dịch
|
| 307 |
+
prs = Presentation(BytesIO(original_ppt.read()))
|
| 308 |
+
tree = ET.parse(BytesIO(translated_xml.read()))
|
| 309 |
root = tree.getroot()
|
| 310 |
|
| 311 |
# Áp dụng bản dịch
|
|
|
|
| 337 |
except Exception as e:
|
| 338 |
print(f"Error applying shape properties: {str(e)}")
|
| 339 |
|
| 340 |
+
# Lưu PowerPoint vào MongoDB với tên gốc
|
| 341 |
output_io = BytesIO()
|
| 342 |
prs.save(output_io)
|
| 343 |
output_io.seek(0) # Reset vị trí đọc
|
| 344 |
|
| 345 |
+
# Giữ nguyên tên file gốc, thêm hậu tố "_translated"
|
| 346 |
+
translated_filename = original_ppt.filename.replace(".xml", ".pptx")
|
| 347 |
+
|
| 348 |
+
file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
|
| 349 |
print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
|
| 350 |
|
| 351 |
return file_id
|
| 352 |
except Exception as e:
|
| 353 |
print(f"Error creating translated PowerPoint: {str(e)}")
|
| 354 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
powerpoint/xml_handling.py
CHANGED
|
@@ -124,7 +124,8 @@ def ppt_to_xml_mongodb(ppt_file_id: str, db_name="pptx"):
|
|
| 124 |
|
| 125 |
# Lưu XML vào MongoDB
|
| 126 |
xml_output = BytesIO(xml_str.encode("utf-8"))
|
| 127 |
-
|
|
|
|
| 128 |
|
| 129 |
print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
|
| 130 |
client.close()
|
|
@@ -363,7 +364,7 @@ def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[
|
|
| 363 |
updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
|
| 364 |
|
| 365 |
# Lưu file cập nhật vào MongoDB (final_xml)
|
| 366 |
-
new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}
|
| 367 |
print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
|
| 368 |
|
| 369 |
return new_file_id
|
|
|
|
| 124 |
|
| 125 |
# Lưu XML vào MongoDB
|
| 126 |
xml_output = BytesIO(xml_str.encode("utf-8"))
|
| 127 |
+
file_name = ppt_file.filename.replace(".pptx", ".xml")
|
| 128 |
+
xml_file_id = fs_xml.put(xml_output, filename=file_name)
|
| 129 |
|
| 130 |
print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
|
| 131 |
client.close()
|
|
|
|
| 364 |
updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
|
| 365 |
|
| 366 |
# Lưu file cập nhật vào MongoDB (final_xml)
|
| 367 |
+
new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}")
|
| 368 |
print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
|
| 369 |
|
| 370 |
return new_file_id
|
translate/__pycache__/translator.cpython-310.pyc
CHANGED
|
Binary files a/translate/__pycache__/translator.cpython-310.pyc and b/translate/__pycache__/translator.cpython-310.pyc differ
|
|
|
translate/translator.py
CHANGED
|
@@ -21,7 +21,7 @@ def translate_text_dict(text_dict: Dict[str, List[str]], target_lang: str = "vi"
|
|
| 21 |
Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
|
| 22 |
|
| 23 |
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
| 24 |
-
model = genai.GenerativeModel("gemini-
|
| 25 |
|
| 26 |
response = model.generate_content(prompt) # Use a model appropriate for your needs and API key. gemini-2.0-flash doesn't exist. 1.5-pro is a good general-purpose model.
|
| 27 |
|
|
|
|
| 21 |
Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
|
| 22 |
|
| 23 |
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
| 24 |
+
model = genai.GenerativeModel("gemini-2.0-flash")
|
| 25 |
|
| 26 |
response = model.generate_content(prompt) # Use a model appropriate for your needs and API key. gemini-2.0-flash doesn't exist. 1.5-pro is a good general-purpose model.
|
| 27 |
|