import io
import math
from typing import Tuple

import chardet
import pandas as pd

from translate.translator import translate_text_dict


def read_csv_with_auto_encoding_from_bytes(csv_bytes) -> pd.DataFrame:
    """Parse CSV content into a DataFrame, guessing its text encoding.

    Args:
        csv_bytes: Either a binary file-like object exposing ``read()`` or
            the raw ``bytes``/``bytearray`` content of the CSV file.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.

    The encoding is detected with ``chardet``; UTF-8 is used when detection
    yields nothing or names a codec Python does not know. Undecodable bytes
    are replaced rather than raising.
    """
    if isinstance(csv_bytes, (bytes, bytearray)):
        raw_data = bytes(csv_bytes)
    else:
        raw_data = csv_bytes.read()

    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
    try:
        decoded_data = raw_data.decode(encoding, errors="replace")
    except LookupError:
        # chardet can report encodings that have no Python codec; fall back
        # to UTF-8 instead of failing the whole import.
        decoded_data = raw_data.decode("utf-8", errors="replace")

    return pd.read_csv(io.StringIO(decoded_data))


def translate_csv(
    file_bytes,
    file_name,
    source_lang: str,
    target_lang: str = "vi",
    chunk_size: int = 50,
) -> Tuple[io.BytesIO, str]:
    """Translate every text (object-dtype) column of a CSV file.

    Rows are sent to ``translate_text_dict`` in chunks of ``chunk_size``.
    Each chunk is a dict mapping the stringified row label to a
    ``{column: text}`` dict; missing cells are sent as empty strings. The
    translator's result is read back in the same shape and written into a
    copy of the data.

    Args:
        file_bytes: CSV content, as accepted by
            ``read_csv_with_auto_encoding_from_bytes``.
        file_name: Passed through unchanged in the return value
            (presumably the file's name as a str -- confirm with callers).
        source_lang: Source language, forwarded to the translator.
        target_lang: Target language, forwarded to the translator.
        chunk_size: Number of rows per translator call; must be >= 1.

    Returns:
        A ``(buffer, file_name)`` tuple, where ``buffer`` is a ``BytesIO``
        positioned at offset 0 holding the translated CSV encoded as
        UTF-8 with BOM and without the index column.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    df = read_csv_with_auto_encoding_from_bytes(file_bytes)
    text_columns = df.select_dtypes(include=["object"]).columns.tolist()
    translated_df = df.copy()

    num_rows = len(df)
    # With no text columns there is nothing to translate, so skip the
    # translator calls entirely.
    num_chunks = math.ceil(num_rows / chunk_size) if text_columns else 0

    for chunk_index in range(num_chunks):
        start_idx = chunk_index * chunk_size
        end_idx = min(start_idx + chunk_size, num_rows)
        chunk_df = df.iloc[start_idx:end_idx]

        # Row labels are stringified for the translator and converted back
        # with int() below; this relies on the default integer index that
        # read_csv produces.
        chunk_dict = {
            str(i): {
                col: str(row[col]) if pd.notnull(row[col]) else ""
                for col in text_columns
            }
            for i, row in chunk_df.iterrows()
        }

        translated_chunk = translate_text_dict(
            text_dict=chunk_dict,
            source_lang=source_lang,
            target_lang=target_lang,
        )

        for i_str, row_data in translated_chunk.items():
            i = int(i_str)
            for col, translated_val in row_data.items():
                translated_df.at[i, col] = translated_val

    output_buffer = io.BytesIO()
    translated_df.to_csv(output_buffer, index=False, encoding="utf-8-sig")
    output_buffer.seek(0)
    return output_buffer, file_name