Nattapong Tapachoom
committed on
Commit
·
cf9e0e9
1
Parent(s):
35c0263
Refactor dataset loading and preview functionality; add Hugging Face dataset loading feature with status updates
Browse files
app.py
CHANGED
@@ -1,17 +1,3 @@
|
|
1 |
-
# วิธีใช้งาน:
|
2 |
-
# 1. หากต้องการโหลดโมเดล private หรือโมเดลที่ต้องใช้ token ให้รันใน terminal:
|
3 |
-
# huggingface-cli login
|
4 |
-
# แล้ว login ด้วยบัญชี Hugging Face
|
5 |
-
# 2. หรือ เพิ่ม argument token ใน from_pretrained เช่น:
|
6 |
-
# สรุปการทำงาน:
|
7 |
-
# - ถ้า login ด้วย huggingface-cli login จะใช้ token จากเครื่องอัตโนมัติ โหลดโมเดล public/private ได้เลย
|
8 |
-
# - ถ้าไม่ login ให้กรอก token ใน argument ของ from_pretrained ทุกครั้งที่โหลดโมเดล
|
9 |
-
# - ตัวอย่าง:
|
10 |
-
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", token="hf_xxx")
|
11 |
-
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B", token="hf_xxx")
|
12 |
-
# - ถ้าโมเดล public ไม่ต้อง login หรือใส่ token ก็โหลดได้ทันที
|
13 |
-
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", token="YOUR_TOKEN")
|
14 |
-
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B", token="YOUR_TOKEN")
|
15 |
import gradio as gr
|
16 |
import os
|
17 |
import json
|
@@ -539,7 +525,7 @@ Format as JSON:
|
|
539 |
# Parse JSON
|
540 |
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
541 |
if json_match:
|
542 |
-
parsed_data = json.loads(json_match.group())
|
543 |
|
544 |
translated_sample = DataSample(
|
545 |
id=f"{sample.id}_{target_lang}",
|
@@ -588,7 +574,7 @@ Make sure the correct answer ({sample.answer}) is included as one of the options
|
|
588 |
# Parse JSON array
|
589 |
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
590 |
if json_match:
|
591 |
-
options = json.loads(json_match.group())
|
592 |
if len(options) == 4:
|
593 |
sample.options = options
|
594 |
|
@@ -714,41 +700,6 @@ def preview_data(source_type, path_or_name, file_upload):
|
|
714 |
else:
|
715 |
return gr.update(visible=False), f"ไม่รองรับไฟล์ประเภท {ext}"
|
716 |
|
717 |
-
elif source_type == "hf":
|
718 |
-
if not path_or_name:
|
719 |
-
return gr.update(visible=False), "กรุณาใส่ชื่อ dataset จาก Hugging Face"
|
720 |
-
|
721 |
-
# Preview HF dataset
|
722 |
-
try:
|
723 |
-
ds = load_dataset(path_or_name)
|
724 |
-
available_splits = list(ds.keys())
|
725 |
-
split_name = available_splits[0]
|
726 |
-
data = ds[split_name]
|
727 |
-
|
728 |
-
# แปลงตัวอย่างเป็น DataFrame
|
729 |
-
sample_data = []
|
730 |
-
for i, item in enumerate(data):
|
731 |
-
if i >= 5: # แสดงแค่ 5 รายการแรก
|
732 |
-
break
|
733 |
-
sample_data.append(dict(item))
|
734 |
-
|
735 |
-
if sample_data:
|
736 |
-
df = pd.DataFrame(sample_data)
|
737 |
-
preview_html = f"""
|
738 |
-
<div style="margin: 10px 0;">
|
739 |
-
<h4>🤗 Hugging Face Dataset: {path_or_name}</h4>
|
740 |
-
<p><strong>Split:</strong> {split_name} | <strong>จำนวนรายการ:</strong> {len(data)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
741 |
-
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
742 |
-
{df.to_html(classes='table table-striped', escape=False)}
|
743 |
-
</div>
|
744 |
-
"""
|
745 |
-
return gr.update(visible=True, value=preview_html), ""
|
746 |
-
else:
|
747 |
-
return gr.update(visible=False), "Dataset ว่างเปล่า"
|
748 |
-
|
749 |
-
except Exception as e:
|
750 |
-
return gr.update(visible=False), f"ไม่สามารถโหลด HF dataset: {str(e)}"
|
751 |
-
|
752 |
return gr.update(visible=False), "กรุณาเลือกประเภทข้อมูล"
|
753 |
|
754 |
except Exception as e:
|
@@ -777,6 +728,7 @@ def export_dataset(samples: List[DataSample], format_type="csv", output_path="ou
|
|
777 |
return filename
|
778 |
elif format_type == "hf_dataset":
|
779 |
# Export Hugging Face Dataset แบบมาตรฐาน (Arrow directory)
|
|
|
780 |
data_dict = {key: [] for key in samples[0].model_dump().keys()}
|
781 |
for sample in samples:
|
782 |
sample_dict = sample.model_dump()
|
@@ -785,7 +737,10 @@ def export_dataset(samples: List[DataSample], format_type="csv", output_path="ou
|
|
785 |
dataset = Dataset.from_dict(data_dict)
|
786 |
hf_dir = f"{output_path}_hf_{timestamp}"
|
787 |
dataset.save_to_disk(hf_dir)
|
788 |
-
|
|
|
|
|
|
|
789 |
|
790 |
elif format_type == "parquet":
|
791 |
# Export เป็น Parquet format
|
@@ -848,7 +803,7 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
848 |
else:
|
849 |
model_name = "deepseek-chat" # default for other providers
|
850 |
if llm_provider_type == "huggingface":
|
851 |
-
with gr.Progress(track_tqdm=True
|
852 |
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
|
853 |
else:
|
854 |
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
|
@@ -871,19 +826,26 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
871 |
progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
|
872 |
max_translate_samples = min(10, len(samples)) # จำกัดการแปลไม่เกิน 10 samples
|
873 |
if llm_provider_type == "huggingface":
|
874 |
-
with gr.Progress(track_tqdm=True
|
875 |
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
876 |
else:
|
877 |
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
878 |
samples.extend(translated)
|
879 |
progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
|
880 |
-
|
881 |
# 6. Add multiple choice
|
882 |
if add_multiple_choice:
|
883 |
progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
|
884 |
max_mc_samples = min(10, len(samples)) # จำกัดการสร้าง multiple choice ไม่เกิน 10 samples
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
885 |
if llm_provider_type == "huggingface":
|
886 |
-
with gr.Progress(track_tqdm=True
|
887 |
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
888 |
else:
|
889 |
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
@@ -956,9 +918,9 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
956 |
with gr.Tab("📂 Dataset Input"):
|
957 |
with gr.Row():
|
958 |
source_type = gr.Radio(
|
959 |
-
["local"
|
960 |
label="ประเภทแหล่งข้อมูล",
|
961 |
-
info="local =
|
962 |
value="local"
|
963 |
)
|
964 |
|
@@ -966,7 +928,7 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
966 |
with gr.Column(scale=3): path_or_name = gr.Textbox(
|
967 |
label="Path หรือ Dataset Name",
|
968 |
placeholder="เช่น data.csv, data.parquet, output_hf_xxxx/ หรือ microsoft/DialoGPT-medium",
|
969 |
-
info="
|
970 |
)
|
971 |
with gr.Column(scale=1): file_upload = gr.File(
|
972 |
label="หรือเลือกไฟล์",
|
@@ -1138,12 +1100,31 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
1138 |
inputs=[max_samples_to_process, n_generate],
|
1139 |
outputs=[total_new_samples]
|
1140 |
)
|
1141 |
-
|
1142 |
n_generate.change(
|
1143 |
update_total_samples_calculation,
|
1144 |
inputs=[max_samples_to_process, n_generate],
|
1145 |
outputs=[total_new_samples]
|
1146 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1147 |
|
1148 |
with gr.Tab("🤗 Hugging Face Model Download"):
|
1149 |
hf_model_name = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import json
|
|
|
525 |
# Parse JSON
|
526 |
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
527 |
if json_match:
|
528 |
+
parsed_data = json.loads(re.sub(r'[\x00-\x1F\x7F]', ' ', json_match.group()))
|
529 |
|
530 |
translated_sample = DataSample(
|
531 |
id=f"{sample.id}_{target_lang}",
|
|
|
574 |
# Parse JSON array
|
575 |
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
576 |
if json_match:
|
577 |
+
options = json.loads(re.sub(r'[\x00-\x1F\x7F]', ' ', json_match.group()))
|
578 |
if len(options) == 4:
|
579 |
sample.options = options
|
580 |
|
|
|
700 |
else:
|
701 |
return gr.update(visible=False), f"ไม่รองรับไฟล์ประเภท {ext}"
|
702 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
703 |
return gr.update(visible=False), "กรุณาเลือกประเภทข้อมูล"
|
704 |
|
705 |
except Exception as e:
|
|
|
728 |
return filename
|
729 |
elif format_type == "hf_dataset":
|
730 |
# Export Hugging Face Dataset แบบมาตรฐาน (Arrow directory)
|
731 |
+
import shutil
|
732 |
data_dict = {key: [] for key in samples[0].model_dump().keys()}
|
733 |
for sample in samples:
|
734 |
sample_dict = sample.model_dump()
|
|
|
737 |
dataset = Dataset.from_dict(data_dict)
|
738 |
hf_dir = f"{output_path}_hf_{timestamp}"
|
739 |
dataset.save_to_disk(hf_dir)
|
740 |
+
# Zip the directory for Gradio download
|
741 |
+
zip_path = f"{hf_dir}.zip"
|
742 |
+
shutil.make_archive(hf_dir, 'zip', hf_dir)
|
743 |
+
return zip_path
|
744 |
|
745 |
elif format_type == "parquet":
|
746 |
# Export เป็น Parquet format
|
|
|
803 |
else:
|
804 |
model_name = "deepseek-chat" # default for other providers
|
805 |
if llm_provider_type == "huggingface":
|
806 |
+
with gr.Progress(track_tqdm=True):
|
807 |
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
|
808 |
else:
|
809 |
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
|
|
|
826 |
progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
|
827 |
max_translate_samples = min(10, len(samples)) # จำกัดการแปลไม่เกิน 10 samples
|
828 |
if llm_provider_type == "huggingface":
|
829 |
+
with gr.Progress(track_tqdm=True):
|
830 |
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
831 |
else:
|
832 |
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
833 |
samples.extend(translated)
|
834 |
progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
|
835 |
+
|
836 |
# 6. Add multiple choice
|
837 |
if add_multiple_choice:
|
838 |
progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
|
839 |
max_mc_samples = min(10, len(samples)) # จำกัดการสร้าง multiple choice ไม่เกิน 10 samples
|
840 |
+
if llm_provider_type == "ollama":
|
841 |
+
model_name = ollama_model
|
842 |
+
elif llm_provider_type == "deepseek":
|
843 |
+
model_name = deepseek_model
|
844 |
+
else:
|
845 |
+
model_name = "deepseek-chat" # fallback/default
|
846 |
+
|
847 |
if llm_provider_type == "huggingface":
|
848 |
+
with gr.Progress(track_tqdm=True):
|
849 |
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
850 |
else:
|
851 |
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
|
|
918 |
with gr.Tab("📂 Dataset Input"):
|
919 |
with gr.Row():
|
920 |
source_type = gr.Radio(
|
921 |
+
["local"],
|
922 |
label="ประเภทแหล่งข้อมูล",
|
923 |
+
info="local = ไฟล์ในเครื่องหรือ HF dataset directory ที่โหลดมา",
|
924 |
value="local"
|
925 |
)
|
926 |
|
|
|
928 |
with gr.Column(scale=3): path_or_name = gr.Textbox(
|
929 |
label="Path หรือ Dataset Name",
|
930 |
placeholder="เช่น data.csv, data.parquet, output_hf_xxxx/ หรือ microsoft/DialoGPT-medium",
|
931 |
+
info="ใส่ path ไฟล์ (.csv, .jsonl, .json, .parquet) หรือ HF dataset directory ที่โหลดมา"
|
932 |
)
|
933 |
with gr.Column(scale=1): file_upload = gr.File(
|
934 |
label="หรือเลือกไฟล์",
|
|
|
1100 |
inputs=[max_samples_to_process, n_generate],
|
1101 |
outputs=[total_new_samples]
|
1102 |
)
|
|
|
1103 |
n_generate.change(
|
1104 |
update_total_samples_calculation,
|
1105 |
inputs=[max_samples_to_process, n_generate],
|
1106 |
outputs=[total_new_samples]
|
1107 |
)
|
1108 |
+
|
1109 |
+
# ปุ่มโหลด Dataset จาก Hugging Face
|
1110 |
+
hf_dataset_name = gr.Textbox(
|
1111 |
+
label="ชื่อ Dataset จาก Hugging Face",
|
1112 |
+
placeholder="เช่น squad หรือ username/dataset-name"
|
1113 |
+
)
|
1114 |
+
hf_dataset_btn = gr.Button("โหลด Dataset จาก Hugging Face", variant="primary")
|
1115 |
+
hf_dataset_status = gr.Textbox(label="สถานะการโหลด", interactive=False)
|
1116 |
+
def download_hf_dataset(dataset_name):
    """Load a dataset from the Hugging Face Hub and report the outcome.

    Args:
        dataset_name: Hub identifier, e.g. "squad" or "username/dataset-name".

    Returns:
        A human-readable status string for the Gradio status textbox,
        prefixed with ✅ on success or ❌ on failure.
    """
    # Local import keeps the heavy dependency out of module import time;
    # this runs only when the button is clicked.
    from datasets import load_dataset
    # Guard empty input early so the user gets a clear message instead of
    # a raw library error.
    if not dataset_name or not dataset_name.strip():
        return "❌ กรุณาใส่ชื่อ dataset จาก Hugging Face"
    try:
        ds = load_dataset(dataset_name)
        # Summarize what was actually loaded (splits and row counts) so the
        # result is not silently discarded after the download.
        splits = ", ".join(f"{name} ({len(split)} rows)" for name, split in ds.items())
        return f"✅ โหลด Dataset {dataset_name} สำเร็จ (splits: {splits})"
    except Exception as e:  # UI boundary: surface any failure as a status message
        return f"❌ โหลด Dataset {dataset_name} ไม่สำเร็จ: {e}"
|
1123 |
+
hf_dataset_btn.click(
|
1124 |
+
fn=download_hf_dataset,
|
1125 |
+
inputs=[hf_dataset_name],
|
1126 |
+
outputs=[hf_dataset_status]
|
1127 |
+
)
|
1128 |
|
1129 |
with gr.Tab("🤗 Hugging Face Model Download"):
|
1130 |
hf_model_name = gr.Textbox(
|