# File: MT_deploy/excel/excel_translate.py
# Page residue: "mintlee's picture"
# Commit message: add no mongodb
# Revision: ff93898
import io
import pandas as pd
from translate.translator import translate_text_dict
import math
import chardet
def read_csv_with_auto_encoding_from_bytes(csv_bytes) -> pd.DataFrame:
    """Parse CSV content of unknown text encoding into a DataFrame.

    The encoding is guessed with ``chardet``; the data is then decoded with
    ``errors="replace"`` so undecodable bytes never abort the load.

    Args:
        csv_bytes: Either a binary file-like object exposing ``read()`` or
            the raw content as ``bytes`` / ``bytearray`` / ``memoryview``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    # Accept the raw payload as well as a stream: the parameter name
    # suggests bytes, while existing callers pass a readable object.
    if isinstance(csv_bytes, (bytes, bytearray, memoryview)):
        raw_data = bytes(csv_bytes)
    else:
        raw_data = csv_bytes.read()

    # chardet reports ``None`` when it cannot make a guess; use UTF-8 then.
    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"

    try:
        decoded_data = raw_data.decode(encoding, errors="replace")
    except LookupError:
        # The detector may name an encoding Python ships no codec for
        # (e.g. EUC-TW); ``errors="replace"`` does not cover that case, so
        # fall back to UTF-8 instead of failing the whole import.
        decoded_data = raw_data.decode("utf-8", errors="replace")

    return pd.read_csv(io.StringIO(decoded_data))
def translate_csv(
    file_bytes,
    file_name,
    source_lang: str,
    target_lang: str = "vi",
    chunk_size: int = 50,
) -> "tuple[io.BytesIO, str]":
    """Translate every text column of a CSV file, chunk by chunk.

    The CSV is loaded with ``read_csv_with_auto_encoding_from_bytes``. All
    columns of dtype ``object`` are treated as text. Rows are sent to
    ``translate_text_dict`` in groups of ``chunk_size`` as
    ``{row_label: {column: text}}`` with missing cells sent as ``""``; the
    translator is expected to return a mapping of the same shape (inferred
    from how the result is consumed here). Non-text columns are left as is.

    Args:
        file_bytes: CSV content, passed through unchanged to
            ``read_csv_with_auto_encoding_from_bytes``.
        file_name: Name of the file; returned unchanged.
        source_lang: Source language code forwarded to the translator.
        target_lang: Target language code forwarded to the translator.
        chunk_size: Number of rows per translator call; must be positive.

    Returns:
        ``(buffer, file_name)`` where ``buffer`` is a ``BytesIO`` positioned
        at offset 0 holding the translated CSV, encoded as ``utf-8-sig``
        and written without the index column.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # A zero size used to raise ZeroDivisionError and a negative one
    # silently produced zero chunks (i.e. untranslated output).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    df = read_csv_with_auto_encoding_from_bytes(file_bytes)
    text_columns = df.select_dtypes(include=["object"]).columns.tolist()
    translated_df = df.copy()

    # Without text columns every payload would be a dict of empty dicts,
    # so the translator is not called at all in that case.
    if text_columns:
        for start_idx in range(0, len(df), chunk_size):
            chunk_df = df.iloc[start_idx:start_idx + chunk_size]

            # Keys are the stringified row labels so that the translated
            # values can be written back to the matching rows below.
            chunk_dict = {
                str(label): {
                    col: str(row[col]) if pd.notnull(row[col]) else ""
                    for col in text_columns
                }
                for label, row in chunk_df.iterrows()
            }

            translated_chunk = translate_text_dict(
                text_dict=chunk_dict,
                source_lang=source_lang,
                target_lang=target_lang,
            )

            for label_str, row_data in translated_chunk.items():
                # Labels are the default integer index created by read_csv.
                label = int(label_str)
                for col, translated_val in row_data.items():
                    translated_df.at[label, col] = translated_val

    output_buffer = io.BytesIO()
    # utf-8-sig prepends a BOM to the encoded output.
    translated_df.to_csv(output_buffer, index=False, encoding="utf-8-sig")
    output_buffer.seek(0)
    return output_buffer, file_name