# Spaces: Running  (page-header residue from extraction; not part of the module)
import io
import math

import chardet
import pandas as pd

from translate.translator import translate_text_dict
def read_csv_with_auto_encoding_from_bytes(csv_bytes) -> pd.DataFrame:
    """Parse CSV content of unknown encoding into a DataFrame.

    The encoding is guessed with ``chardet`` and the payload is decoded with
    ``errors="replace"`` so undecodable bytes never abort the load.

    Args:
        csv_bytes: Either a bytes-like object (``bytes``, ``bytearray``,
            ``memoryview``) holding the raw CSV, or a file-like object whose
            ``read()`` returns the CSV content. A stream is consumed by
            this call.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` on the decoded text.

    Raises:
        Whatever ``pandas.read_csv`` raises for unparsable or empty content.
    """
    # Accept real bytes as well as streams: the function name promises bytes,
    # but only objects with ``.read()`` used to work.
    if isinstance(csv_bytes, (bytes, bytearray, memoryview)):
        raw_data = bytes(csv_bytes)
    else:
        raw_data = csv_bytes.read()

    if isinstance(raw_data, str):
        # A text-mode stream is already decoded; chardet only accepts bytes.
        decoded_data = raw_data
    else:
        detect_result = chardet.detect(raw_data)
        # chardet reports ``None`` when it cannot guess (e.g. empty input).
        encoding = detect_result["encoding"] or "utf-8"
        try:
            decoded_data = raw_data.decode(encoding, errors="replace")
        except LookupError:
            # chardet may name an encoding this Python build has no codec
            # for; ``errors="replace"`` does not cover that, so fall back to
            # the same default used when detection fails.
            decoded_data = raw_data.decode("utf-8", errors="replace")

    return pd.read_csv(io.StringIO(decoded_data))
def translate_csv(file_bytes, file_name, source_lang: str, target_lang: str = "vi", chunk_size: int = 50) -> "tuple[io.BytesIO, str]":
    """Translate the text columns of a CSV and return the result as a buffer.

    The CSV is loaded with ``read_csv_with_auto_encoding_from_bytes``, its
    object-dtype columns are sent to ``translate_text_dict`` in row chunks,
    and the translated values are written into a copy of the table.

    Args:
        file_bytes: CSV input, passed straight to
            ``read_csv_with_auto_encoding_from_bytes``.
        file_name: Returned unchanged alongside the output buffer
            (presumably the upload's file name; it is not inspected here).
        source_lang: Source language, forwarded to ``translate_text_dict``.
        target_lang: Target language, forwarded to ``translate_text_dict``.
        chunk_size: Maximum number of rows sent per translation call.

    Returns:
        A ``(buffer, file_name)`` tuple. ``buffer`` is an ``io.BytesIO``
        positioned at 0 that holds the translated CSV encoded as
        ``utf-8-sig``, without the index column.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # Zero used to surface as ZeroDivisionError and a negative value
        # silently skipped translation altogether.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    df = read_csv_with_auto_encoding_from_bytes(file_bytes)
    text_columns = df.select_dtypes(include=["object"]).columns.tolist()
    translated_df = df.copy()

    # Nothing to translate without text columns; avoid translator calls that
    # would only carry empty per-row dicts.
    if text_columns:
        num_rows = len(df)
        for start_idx in range(0, num_rows, chunk_size):
            chunk_df = df.iloc[start_idx:start_idx + chunk_size]

            # Payload shape: {"<row label>": {column: text}}. Missing cells
            # are sent as "" so every row keeps the same set of keys.
            chunk_rows = chunk_df[text_columns].to_dict(orient="index")
            chunk_dict = {
                str(row_label): {
                    col: str(value) if pd.notnull(value) else ""
                    for col, value in row_values.items()
                }
                for row_label, row_values in chunk_rows.items()
            }

            # NOTE(review): assumes translate_text_dict returns a mapping with
            # the same nested shape and the same string row keys -- confirm.
            translated_chunk = translate_text_dict(
                text_dict=chunk_dict,
                source_lang=source_lang,
                target_lang=target_lang,
            )

            for row_key, row_data in translated_chunk.items():
                # Keys were str() of the integer row labels; convert back.
                row_label = int(row_key)
                for col, translated_val in row_data.items():
                    translated_df.at[row_label, col] = translated_val

    output_buffer = io.BytesIO()
    # utf-8-sig prepends a BOM to the encoded output.
    translated_df.to_csv(output_buffer, index=False, encoding="utf-8-sig")
    output_buffer.seek(0)
    return output_buffer, file_name