Spaces:
Running
Running
| import io | |
| import pandas as pd | |
| from translate.translator import translate_text_dict | |
| import math | |
| import chardet | |
def read_csv_with_auto_encoding_from_bytes(csv_bytes) -> pd.DataFrame:
    """Parse CSV content of unknown encoding into a DataFrame.

    The encoding is guessed with ``chardet``; the content is then decoded
    to text (undecodable bytes are replaced, not raised) and handed to
    ``pd.read_csv``.

    Args:
        csv_bytes: The raw CSV content, either as a ``bytes``/``bytearray``
            value or as a binary file-like object exposing ``read()``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` on the decoded text.
    """
    # Accept the raw payload directly as well as a stream wrapping it, so the
    # function lives up to its "from_bytes" name.
    if isinstance(csv_bytes, (bytes, bytearray)):
        raw_data = bytes(csv_bytes)
    else:
        raw_data = csv_bytes.read()

    detect_result = chardet.detect(raw_data)
    # chardet reports None when it cannot make a guess (e.g. empty input).
    encoding = detect_result["encoding"] or "utf-8"

    try:
        decoded_data = raw_data.decode(encoding, errors="replace")
    except LookupError:
        # The detector may name an encoding Python ships no codec for;
        # errors="replace" does not cover that case, so fall back to UTF-8.
        decoded_data = raw_data.decode("utf-8", errors="replace")

    return pd.read_csv(io.StringIO(decoded_data))
def translate_csv(file_bytes, file_name, source_lang: str, target_lang: str = "vi", chunk_size: int = 50) -> "tuple[io.BytesIO, str]":
    """Translate the text columns of a CSV file, ``chunk_size`` rows per request.

    The CSV is loaded with ``read_csv_with_auto_encoding_from_bytes``. Every
    column of dtype ``object`` is treated as text; all other columns are
    copied through untouched. Rows are sent to ``translate_text_dict`` in
    batches shaped as ``{"<row label>": {"<column>": "<text>"}}``, and the
    values it returns are written back at the same row label and column.

    Args:
        file_bytes: CSV input, forwarded unchanged to
            ``read_csv_with_auto_encoding_from_bytes``.
        file_name: Name of the file; returned unchanged alongside the output.
        source_lang: Source language, forwarded to ``translate_text_dict``.
        target_lang: Target language, forwarded to ``translate_text_dict``.
        chunk_size: Maximum number of rows per translation request.

    Returns:
        A ``(buffer, file_name)`` tuple, where ``buffer`` is a ``BytesIO``
        rewound to position 0 holding the translated CSV encoded as
        UTF-8 with BOM (``utf-8-sig``), without the index column.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # Fail fast: 0 used to surface as ZeroDivisionError, and a negative value
    # silently produced zero chunks and therefore an untranslated file.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    df = read_csv_with_auto_encoding_from_bytes(file_bytes)
    text_columns = df.select_dtypes(include=["object"]).columns.tolist()
    translated_df = df.copy()

    # With no text columns every request would consist of empty rows, so the
    # translation loop is skipped entirely in that case.
    num_rows = len(df) if text_columns else 0

    for start_idx in range(0, num_rows, chunk_size):
        chunk_df = df.iloc[start_idx:start_idx + chunk_size]

        # Missing cells are sent as empty strings; keys are stringified row
        # labels so they can be mapped back after translation.
        chunk_dict = {
            str(i): {
                col: str(row[col]) if pd.notnull(row[col]) else ""
                for col in text_columns
            }
            for i, row in chunk_df.iterrows()
        }

        translated_chunk = translate_text_dict(
            text_dict=chunk_dict,
            source_lang=source_lang,
            target_lang=target_lang,
        )

        for i_str, row_data in translated_chunk.items():
            i = int(i_str)
            for col, translated_val in row_data.items():
                translated_df.at[i, col] = translated_val

    output_buffer = io.BytesIO()
    translated_df.to_csv(output_buffer, index=False, encoding="utf-8-sig")
    output_buffer.seek(0)
    return output_buffer, file_name