mintlee committed on
Commit
804add3
·
1 Parent(s): 4d84219
excel/excel_translate.py CHANGED
@@ -95,7 +95,7 @@ def read_csv_with_auto_encoding(csv_path):
95
  return df
96
 
97
 
98
- def translate_csv(file_id, target_lang="vi", chunk_size=50):
99
  # Kết nối MongoDB
100
  client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
101
  db = client["csv"]
@@ -143,6 +143,7 @@ def translate_csv(file_id, target_lang="vi", chunk_size=50):
143
  # Now call your LLM translator on this dictionary
144
  translated_chunk = translate_text_dict(
145
  text_dict=chunk_dict,
 
146
  target_lang=target_lang
147
  )
148
 
 
95
  return df
96
 
97
 
98
+ def translate_csv(file_id, source_lang, target_lang="vi", chunk_size=50):
99
  # Kết nối MongoDB
100
  client = pymongo.MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
101
  db = client["csv"]
 
143
  # Now call your LLM translator on this dictionary
144
  translated_chunk = translate_text_dict(
145
  text_dict=chunk_dict,
146
+ source_lang=source_lang,
147
  target_lang=target_lang
148
  )
149
 
home.py CHANGED
@@ -30,3 +30,7 @@ st.subheader("25/03/2025")
30
  st.write("1. Đã hoàn thành file Word, Excel")
31
  st.write("2. Đang tiến hành file PPTX (tiến độ 90% đã có thể dùng thử)")
32
  st.write("3. Sắp tới: file CSV, PDF, JPG")
 
 
 
 
 
30
  st.write("1. Đã hoàn thành file Word, Excel")
31
  st.write("2. Đang tiến hành file PPTX (tiến độ 90% đã có thể dùng thử)")
32
  st.write("3. Sắp tới: file CSV, PDF, JPG")
33
+
34
+ st.subheader("04/04/2025")
35
+ st.write("1. Đã hoàn thành file Word, Excel, PPTX, CSV")
36
+ st.write("2. Sắp tới: file PDF, JPG")
pages/upload.py CHANGED
@@ -3,6 +3,7 @@ import google.generativeai as genai
3
  from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
4
  from powerpoint.pptx import translate_pptx
5
  from excel.xlsx import translate_xlsx
 
6
  from word.word_translate import translate_docx_from_mongodb
7
  import dotenv
8
  import os
@@ -29,10 +30,10 @@ def process_file(file, file_type):
29
  progress_bar.progress(60)
30
  elif file_type == "Excel":
31
  final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
32
- # elif file_type == "CSV":
33
- # final_id = translate_csv(file_id = file_id, target_lang = target_lang)
34
  elif file_type == "Word":
35
- final_id = translate_docx_from_mongodb(file_id, target_lang)
36
  else:
37
  st.error("❌ Loại file không hỗ trợ!")
38
  return
 
3
  from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
4
  from powerpoint.pptx import translate_pptx
5
  from excel.xlsx import translate_xlsx
6
+ from excel.excel_translate import translate_csv
7
  from word.word_translate import translate_docx_from_mongodb
8
  import dotenv
9
  import os
 
30
  progress_bar.progress(60)
31
  elif file_type == "Excel":
32
  final_id = translate_xlsx(file_id = file_id, file_name = file_name, source_lang = source_lang, target_lang = target_lang)
33
+ elif file_type == "CSV":
34
+ final_id = translate_csv(file_id = file_id, source_lang = source_lang, target_lang = target_lang)
35
  elif file_type == "Word":
36
+ final_id = translate_docx_from_mongodb(file_id, source_lang = source_lang, target_lang = target_lang)
37
  else:
38
  st.error("❌ Loại file không hỗ trợ!")
39
  return
translate/translator.py CHANGED
@@ -7,12 +7,12 @@ import os
7
  dotenv.load_dotenv(".env")
8
 
9
 
10
- def translate_text_dict(text_dict: Dict[str, List[str]], target_lang: str = "vi") -> Dict[str, List[str]]:
11
  def translate_batch(batch_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
12
  """Translates a single batch of text."""
13
  prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}.
14
 
15
- Read through the entire dictionary, then translate the texts into {target_lang} so that the meaning is as close to the intended context as possible.
16
 
17
  Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
18
  Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.
 
7
  dotenv.load_dotenv(".env")
8
 
9
 
10
+ def translate_text_dict(text_dict: Dict[str, List[str]], source_lang: str = "vi", target_lang: str = "vi") -> Dict[str, List[str]]:
11
  def translate_batch(batch_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
12
  """Translates a single batch of text."""
13
  prompt = f"""The following python dictionary contains pieces of text that form a whole document: {json.dumps(batch_dict)}.
14
 
15
+ Read through the entire dictionary, then translate the texts from {source_lang} into {target_lang} so that the meaning is as close to the intended context as possible.
16
 
17
  Specialized jargon for which there are no direct translations, or names, titles, etc. should be kept whole if possible.
18
  Look at the entire dictionary as a whole for context so that the translation is as accurate as possible, and to determine if each text should be translated or not.
word/word_translate.py CHANGED
@@ -17,7 +17,7 @@ api_key = os.getenv("GEMINI_API_KEY")
17
  genai.configure(api_key=api_key)
18
  model = genai.GenerativeModel("gemini-2.0-flash")
19
 
20
- def batch_translate(texts, target_lang="Vietnamese"):
21
  """ Translates multiple text segments in a single API call. """
22
  if not texts:
23
  return texts # Skip if empty
@@ -48,7 +48,7 @@ def batch_translate(texts, target_lang="Vietnamese"):
48
  - Return the result of translation according to the format. Do NOT return code for translating.
49
  """
50
  json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
51
- user_prompt = f"Target language: {target_lang}. JSON file: {json_data}"
52
 
53
  model = genai.GenerativeModel('gemini-2.0-flash')
54
  response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
@@ -63,7 +63,7 @@ def batch_translate(texts, target_lang="Vietnamese"):
63
  translated_texts = [i['text'] for i in sorted(response_dict, key = lambda x: x['index'])]
64
  return translated_texts
65
 
66
- def full_translate(texts, target_lang="Vietnamese"):
67
  full_translated_texts = []
68
  batch = []
69
  word_count = 0
@@ -71,13 +71,13 @@ def full_translate(texts, target_lang="Vietnamese"):
71
  for string in texts:
72
  if len(string.split()) + word_count >= 1000:
73
  print('Translating a batch.')
74
- full_translated_texts += batch_translate(batch, target_lang)
75
  batch = []
76
  word_count = 0
77
  batch.append(string)
78
  word_count += len(string.split())
79
 
80
- full_translated_texts += batch_translate(batch, target_lang)
81
  return full_translated_texts
82
 
83
  def merge_runs(runs):
@@ -169,7 +169,7 @@ def translate_tables(doc, translated_texts):
169
  cell, i = translate_paragraphs(cell, translated_texts, i)
170
  return doc
171
 
172
- def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
173
  # Kết nối MongoDB
174
  client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
175
  db = client["word"]
@@ -183,10 +183,10 @@ def translate_docx_from_mongodb(file_id, target_lang="Vietnamese"):
183
 
184
  # Lấy nội dung và dịch
185
  para_texts = get_text_elements_para(doc)
186
- translated_para = full_translate(para_texts, target_lang)
187
 
188
  table_texts = get_text_elements_table(doc)
189
- translated_tables = full_translate(table_texts, target_lang)
190
 
191
  # Cập nhật nội dung dịch vào document
192
  doc, _ = translate_paragraphs(doc, translated_para)
 
17
  genai.configure(api_key=api_key)
18
  model = genai.GenerativeModel("gemini-2.0-flash")
19
 
20
+ def batch_translate(texts, source_lang = "English", target_lang="Vietnamese"):
21
  """ Translates multiple text segments in a single API call. """
22
  if not texts:
23
  return texts # Skip if empty
 
48
  - Return the result of translation according to the format. Do NOT return code for translating.
49
  """
50
  json_data = json.dumps([{"index": i, "text": t} for i, t in enumerate(texts)])
51
+ user_prompt = f"Source languag: {source_lang}. Target language: {target_lang}. JSON file: {json_data}"
52
 
53
  model = genai.GenerativeModel('gemini-2.0-flash')
54
  response = model.generate_content(contents = system_prompt.strip() + "\n" + user_prompt.strip(), generation_config={
 
63
  translated_texts = [i['text'] for i in sorted(response_dict, key = lambda x: x['index'])]
64
  return translated_texts
65
 
66
+ def full_translate(texts, source_lang, target_lang="Vietnamese"):
67
  full_translated_texts = []
68
  batch = []
69
  word_count = 0
 
71
  for string in texts:
72
  if len(string.split()) + word_count >= 1000:
73
  print('Translating a batch.')
74
+ full_translated_texts += batch_translate(batch, source_lang, target_lang)
75
  batch = []
76
  word_count = 0
77
  batch.append(string)
78
  word_count += len(string.split())
79
 
80
+ full_translated_texts += batch_translate(batch, source_lang, target_lang)
81
  return full_translated_texts
82
 
83
  def merge_runs(runs):
 
169
  cell, i = translate_paragraphs(cell, translated_texts, i)
170
  return doc
171
 
172
+ def translate_docx_from_mongodb(file_id, source_lang, target_lang="Vietnamese"):
173
  # Kết nối MongoDB
174
  client = MongoClient("mongodb+srv://admin:[email protected]/?retryWrites=true&w=majority&appName=Cluster0")
175
  db = client["word"]
 
183
 
184
  # Lấy nội dung và dịch
185
  para_texts = get_text_elements_para(doc)
186
+ translated_para = full_translate(para_texts, source_lang, target_lang)
187
 
188
  table_texts = get_text_elements_table(doc)
189
+ translated_tables = full_translate(table_texts, source_lang, target_lang)
190
 
191
  # Cập nhật nội dung dịch vào document
192
  doc, _ = translate_paragraphs(doc, translated_para)