Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

MT_deploy / excel /excel_translate_bis.py

Phong1

Upload excel_translate_bis.py

4537eab verified 4 months ago

raw

history blame

7.76 kB

	import xlwings as xw
	from typing import Dict, List
	from translate.translator import translate_text_dict
	import math
	import chardet
	import io
	import pandas as pd
	import pymongo
	import gridfs
	from io import BytesIO
	import tempfile
	import os
	import openpyxl
	from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing

	def translate_xlsx(file_id: str, target_lang: str = ""):
	# Kết nối MongoDB
	client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
	db = client["excel"]
	fs_input = gridfs.GridFS(db, collection="root_file")
	fs_output = gridfs.GridFS(db, collection="final_file")

	# Tải file từ MongoDB
	file_data = fs_input.get(file_id)

	# Lưu file tạm thời
	with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
	temp_file.write(file_data.read())
	temp_file_path = temp_file.name

	# Đọc file Excel bằng openpyxl
	wb = openpyxl.load_workbook(temp_file_path)
	sheets = wb.worksheets

	# 🔸 Translate Sheet Names
	sheet_name_dict = {ws.title: [ws.title] for ws in sheets}
	translated_sheet_names = translate_text_dict(sheet_name_dict, target_lang=target_lang)
	for ws in sheets:
	new_name = translated_sheet_names.get(ws.title, [ws.title])[0]
	ws.title = new_name[:31] # Excel sheet names must be ≤ 31 characters

	for ws in sheets:
	max_row = ws.max_row
	max_col = ws.max_column

	# Tạo dictionary lưu trữ nội dung cần dịch và mapping từ key đến cell
	text_dict: Dict[str, List[str]] = {}
	cell_map: Dict[str, any] = {}

	for row in range(1, max_row + 1):
	for col in range(1, max_col + 1):
	cell = ws.cell(row=row, column=col)
	if isinstance(cell.value, str):
	key = f"R{row}C{col}"
	text_dict[key] = [cell.value]
	cell_map[key] = cell

	# 🔸 Extract Text Box Content
	textbox_dict = {}
	textbox_map = {}
	if hasattr(ws, "_drawing") and ws._drawing:
	for rel in ws._drawing._rels.values():
	if isinstance(rel.target, SpreadsheetDrawing):
	for shape in rel.target.shape_tree:
	if hasattr(shape, "text") and shape.text:
	key = f"TX-{hash(shape.text)}"
	textbox_dict[key] = [shape.text]
	textbox_map[key] = shape

	# 🔸 Merge all translatable text and translate
	full_dict = {text_dict, textbox_dict}
	translated_dict = translate_text_dict(full_dict, target_lang=target_lang)

	# Cập nhật cell
	for key, cell in cell_map.items():
	if key in translated_dict:
	translated_text = translated_dict[key][0]
	cell.value = translated_text

	# Cập nhật text box
	for key, shape in textbox_map.items():
	if key in translated_dict:
	translated_text = translated_dict[key][0]
	shape.text = translated_text

	# Lưu workbook vào file tạm thời
	with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
	wb.save(output_file.name)
	output_file.seek(0)
	translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)

	# Đóng workbook và xóa file tạm
	wb.close()
	os.remove(temp_file_path)

	print(f"✅ Dịch thành công! File đã lưu vào MongoDB với file_id: {translated_file_id}")
	return translated_file_id


	def read_csv_with_auto_encoding(csv_path):
	# Đọc file dưới dạng nhị phân
	with open(csv_path, "rb") as f:
	raw_data = f.read()
	# Dò tìm encoding
	detect_result = chardet.detect(raw_data)
	encoding = detect_result["encoding"]
	confidence = detect_result["confidence"]

	print(f"Chardet dự đoán file '{csv_path}' có encoding = {encoding} (độ tin cậy = {confidence})")

	# Nếu chardet không phát hiện được, ta đặt fallback = 'utf-8'
	if encoding is None:
	encoding = "utf-8"

	decoded_data = raw_data.decode(encoding, errors='replace')

	# Sử dụng io.StringIO để chuyển đổi chuỗi thành đối tượng file-like
	csv_data = io.StringIO(decoded_data)
	df = pd.read_csv(csv_data)
	return df


	def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
	# Kết nối MongoDB
	client = pymongo.MongoClient("mongodb://localhost:27017")
	db = client[db_name]
	fs_input = gridfs.GridFS(db, collection="root_file")
	fs_output = gridfs.GridFS(db, collection="final_file")

	# Tải file từ MongoDB
	file_data = fs_input.get(file_id).read()

	# Lưu file tạm thời
	with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
	temp_file.write(file_data)
	temp_file_path = temp_file.name

	df = read_csv_with_auto_encoding(temp_file_path)

	# If text_columns is not specified, we assume we want to translate everything that looks like text.
	# Otherwise, only translate the given columns.
	if text_columns is None:
	# Example heuristic: choose all object/string columns
	text_columns = df.select_dtypes(include=["object"]).columns.tolist()

	num_rows = len(df)
	num_chunks = math.ceil(num_rows / chunk_size)

	translated_df = df.copy() # copy to store the final translations

	for chunk_index in range(num_chunks):
	start_idx = chunk_index * chunk_size
	end_idx = min((chunk_index + 1) * chunk_size, num_rows)
	chunk_df = df.iloc[start_idx:end_idx]

	# Build a dictionary structure. For example, row-based:
	# {
	# "0": {"colA": "some text", "colB": "some text"},
	# "1": {"colA": "some text", "colB": "some text"},
	# ...
	# }
	chunk_dict = {}
	for i, row in chunk_df.iterrows():
	row_dict = {}
	for col in text_columns:
	row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
	chunk_dict[str(i)] = row_dict

	# Now call your LLM translator on this dictionary
	translated_chunk = translate_text_dict(
	text_dict=chunk_dict,
	source_lang=source_lang,
	target_lang=target_lang,
	gemini_api=gemini_api
	)

	# 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
	for i_str, row_data in translated_chunk.items():
	i = int(i_str)
	for col, translated_val in row_data.items():
	translated_df.at[i, col] = translated_val

	# Lưu file dịch vào tệp tạm thời
	translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
	translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')

	# Đọc lại file tạm để lưu vào MongoDB
	with open(translated_file_path, "rb") as f:
	translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")

	# Xóa file tạm
	os.remove(temp_file_path)
	os.remove(translated_file_path)

	print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
	return translated_file_id