Spaces:

ZombitX64
/

DekData

Running

App Files Files Community

Nattapong Tapachoom committed 11 days ago

Commit

cf9e0e9

1 Parent(s): 35c0263

Refactor dataset loading and preview functionality; add Hugging Face dataset loading feature with status updates

Browse files

Files changed (1) hide show

app.py +41 -60

app.py CHANGED Viewed

@@ -1,17 +1,3 @@
-# วิธีใช้งาน:
-# 1. หากต้องการโหลดโมเดล private หรือโมเดลที่ต้องใช้ token ให้รันใน terminal:
-#    huggingface-cli login
-#    แล้ว login ด้วยบัญชี Hugging Face
-# 2. หรือ เพิ่ม argument token ใน from_pretrained เช่น:
-# สรุปการทำงาน:
-# - ถ้า login ด้วย huggingface-cli login จะใช้ token จากเครื่องอัตโนมัติ โหลดโมเดล public/private ได้เลย
-# - ถ้าไม่ login ให้กรอก token ใน argument ของ from_pretrained ทุกครั้งที่โหลดโมเดล
-# - ตัวอย่าง:
-#   tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", token="hf_xxx")
-#   model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B", token="hf_xxx")
-# - ถ้าโมเดล public ไม่ต้อง login หรือใส่ token ก็โหลดได้ทันที
-#    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", token="YOUR_TOKEN")
-#    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B", token="YOUR_TOKEN")
 import gradio as gr
 import os
 import json
@@ -539,7 +525,7 @@ Format as JSON:
             # Parse JSON
             json_match = re.search(r'\{.*\}', response, re.DOTALL)
             if json_match:
-                parsed_data = json.loads(json_match.group())
                 translated_sample = DataSample(
                     id=f"{sample.id}_{target_lang}",
@@ -588,7 +574,7 @@ Make sure the correct answer ({sample.answer}) is included as one of the options
             # Parse JSON array
             json_match = re.search(r'\[.*\]', response, re.DOTALL)
             if json_match:
-                options = json.loads(json_match.group())
                 if len(options) == 4:
                     sample.options = options
@@ -714,41 +700,6 @@ def preview_data(source_type, path_or_name, file_upload):
             else:
                 return gr.update(visible=False), f"ไม่รองรับไฟล์ประเภท {ext}"
-        elif source_type == "hf":
-            if not path_or_name:
-                return gr.update(visible=False), "กรุณาใส่ชื่อ dataset จาก Hugging Face"
-            # Preview HF dataset
-            try:
-                ds = load_dataset(path_or_name)
-                available_splits = list(ds.keys())
-                split_name = available_splits[0]
-                data = ds[split_name]
-                # แปลงตัวอย่างเป็น DataFrame
-                sample_data = []
-                for i, item in enumerate(data):
-                    if i >= 5:  # แสดงแค่ 5 รายการแรก
-                        break
-                    sample_data.append(dict(item))
-                if sample_data:
-                    df = pd.DataFrame(sample_data)
-                    preview_html = f"""
-                    <div style="margin: 10px 0;">
-                        <h4>🤗 Hugging Face Dataset: {path_or_name}</h4>
-                        <p><strong>Split:</strong> {split_name} | <strong>จำนวนรายการ:</strong> {len(data)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
-                        <h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
-                        {df.to_html(classes='table table-striped', escape=False)}
-                    </div>
-                    """
-                    return gr.update(visible=True, value=preview_html), ""
-                else:
-                    return gr.update(visible=False), "Dataset ว่างเปล่า"
-            except Exception as e:
-                return gr.update(visible=False), f"ไม่สามารถโหลด HF dataset: {str(e)}"
         return gr.update(visible=False), "กรุณาเลือกประเภทข้อมูล"
     except Exception as e:
@@ -777,6 +728,7 @@ def export_dataset(samples: List[DataSample], format_type="csv", output_path="ou
         return filename
     elif format_type == "hf_dataset":
         # Export Hugging Face Dataset แบบมาตรฐาน (Arrow directory)
         data_dict = {key: [] for key in samples[0].model_dump().keys()}
         for sample in samples:
             sample_dict = sample.model_dump()
@@ -785,7 +737,10 @@ def export_dataset(samples: List[DataSample], format_type="csv", output_path="ou
         dataset = Dataset.from_dict(data_dict)
         hf_dir = f"{output_path}_hf_{timestamp}"
         dataset.save_to_disk(hf_dir)
-        return hf_dir
     elif format_type == "parquet":
         # Export เป็น Parquet format
@@ -848,7 +803,7 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
             else:
                 model_name = "deepseek-chat"  # default for other providers
             if llm_provider_type == "huggingface":
-                with gr.Progress(track_tqdm=True, desc="กำลัง generate ด้วย Hugging Face..."):
                     new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
             else:
                 new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
@@ -871,19 +826,26 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
             progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
             max_translate_samples = min(10, len(samples))  # จำกัดการแปลไม่เกิน 10 samples
             if llm_provider_type == "huggingface":
-                with gr.Progress(track_tqdm=True, desc="กำลังแปลด้วย Hugging Face..."):
                     translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
             else:
                 translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
             samples.extend(translated)
             progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
         # 6. Add multiple choice
         if add_multiple_choice:
             progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
             max_mc_samples = min(10, len(samples))  # จำกัดการสร้าง multiple choice ไม่เกิน 10 samples
             if llm_provider_type == "huggingface":
-                with gr.Progress(track_tqdm=True, desc="กำลังเพิ่มตัวเลือกด้วย Hugging Face..."):
                     samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
             else:
                 samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
@@ -956,9 +918,9 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
     with gr.Tab("📂 Dataset Input"):
         with gr.Row():
             source_type = gr.Radio(
-                ["local", "hf"],
                 label="ประเภทแหล่งข้อมูล",
-                info="local = ไฟล์ในเครื่อง, hf = Hugging Face dataset",
                 value="local"
             )
@@ -966,7 +928,7 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
             with gr.Column(scale=3):                path_or_name = gr.Textbox(
                     label="Path หรือ Dataset Name",
                     placeholder="เช่น data.csv, data.parquet, output_hf_xxxx/ หรือ microsoft/DialoGPT-medium",
-                    info="สำหรับ local: ใส่ path ไฟล์ (.csv, .jsonl, .json, .parquet) หรือ HF dataset directory / สำหรับ HF: ใส่ชื่อ dataset"
                 )
             with gr.Column(scale=1):                file_upload = gr.File(
                     label="หรือเลือกไฟล์",
@@ -1138,12 +1100,31 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
             inputs=[max_samples_to_process, n_generate],
             outputs=[total_new_samples]
         )
         n_generate.change(
             update_total_samples_calculation,
             inputs=[max_samples_to_process, n_generate],
             outputs=[total_new_samples]
         )
     with gr.Tab("🤗 Hugging Face Model Download"):
         hf_model_name = gr.Textbox(

 import gradio as gr
 import os
 import json
             # Parse JSON
             json_match = re.search(r'\{.*\}', response, re.DOTALL)
             if json_match:
+                parsed_data = json.loads(re.sub(r'[\x00-\x1F\x7F]', ' ', json_match.group()))
                 translated_sample = DataSample(
                     id=f"{sample.id}_{target_lang}",
             # Parse JSON array
             json_match = re.search(r'\[.*\]', response, re.DOTALL)
             if json_match:
+                options = json.loads(re.sub(r'[\x00-\x1F\x7F]', ' ', json_match.group()))
                 if len(options) == 4:
                     sample.options = options
             else:
                 return gr.update(visible=False), f"ไม่รองรับไฟล์ประเภท {ext}"
         return gr.update(visible=False), "กรุณาเลือกประเภทข้อมูล"
     except Exception as e:
         return filename
     elif format_type == "hf_dataset":
         # Export Hugging Face Dataset แบบมาตรฐาน (Arrow directory)
+        import shutil
         data_dict = {key: [] for key in samples[0].model_dump().keys()}
         for sample in samples:
             sample_dict = sample.model_dump()
         dataset = Dataset.from_dict(data_dict)
         hf_dir = f"{output_path}_hf_{timestamp}"
         dataset.save_to_disk(hf_dir)
+        # Zip the directory for Gradio download
+        zip_path = f"{hf_dir}.zip"
+        shutil.make_archive(hf_dir, 'zip', hf_dir)
+        return zip_path
     elif format_type == "parquet":
         # Export เป็น Parquet format
             else:
                 model_name = "deepseek-chat"  # default for other providers
             if llm_provider_type == "huggingface":
+                with gr.Progress(track_tqdm=True):
                     new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
             else:
                 new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
             progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
             max_translate_samples = min(10, len(samples))  # จำกัดการแปลไม่เกิน 10 samples
             if llm_provider_type == "huggingface":
+                with gr.Progress(track_tqdm=True):
                     translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
             else:
                 translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
             samples.extend(translated)
             progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
         # 6. Add multiple choice
         if add_multiple_choice:
             progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
             max_mc_samples = min(10, len(samples))  # จำกัดการสร้าง multiple choice ไม่เกิน 10 samples
+            if llm_provider_type == "ollama":
+                model_name = ollama_model
+            elif llm_provider_type == "deepseek":
+                model_name = deepseek_model
+            else:
+                model_name = "deepseek-chat"  # fallback/default
             if llm_provider_type == "huggingface":
+                with gr.Progress(track_tqdm=True):
                     samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
             else:
                 samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
     with gr.Tab("📂 Dataset Input"):
         with gr.Row():
             source_type = gr.Radio(
+                ["local"],
                 label="ประเภทแหล่งข้อมูล",
+                info="local = ไฟล์ในเครื่องหรือ HF dataset directory ที่โหลดมา",
                 value="local"
             )
             with gr.Column(scale=3):                path_or_name = gr.Textbox(
                     label="Path หรือ Dataset Name",
                     placeholder="เช่น data.csv, data.parquet, output_hf_xxxx/ หรือ microsoft/DialoGPT-medium",
+                    info="ใส่ path ไฟล์ (.csv, .jsonl, .json, .parquet) หรือ HF dataset directory ที่โหลดมา"
                 )
             with gr.Column(scale=1):                file_upload = gr.File(
                     label="หรือเลือกไฟล์",
             inputs=[max_samples_to_process, n_generate],
             outputs=[total_new_samples]
         )
         n_generate.change(
             update_total_samples_calculation,
             inputs=[max_samples_to_process, n_generate],
             outputs=[total_new_samples]
         )
+        # ปุ่มโหลด Dataset จาก Hugging Face
+        hf_dataset_name = gr.Textbox(
+            label="ชื่อ Dataset จาก Hugging Face",
+            placeholder="เช่น squad หรือ username/dataset-name"
+        )
+        hf_dataset_btn = gr.Button("โหลด Dataset จาก Hugging Face", variant="primary")
+        hf_dataset_status = gr.Textbox(label="สถานะการโหลด", interactive=False)
+    def download_hf_dataset(dataset_name):
+        """Load a Hugging Face dataset and report the outcome as a status string.
+
+        The dataset object returned by ``load_dataset`` is discarded; this
+        handler only produces the message shown in the status textbox.
+
+        Args:
+            dataset_name: Dataset identifier typed by the user; surrounding
+                whitespace is ignored and ``None`` is treated as empty.
+
+        Returns:
+            A prompt asking for a dataset name when the input is blank,
+            otherwise a message starting with "✅" on success or "❌"
+            (including the exception text) on failure.
+        """
+        dataset_name = (dataset_name or "").strip()
+        if not dataset_name:
+            # Skip the load attempt entirely for blank input instead of
+            # surfacing an error message with an empty dataset name in it.
+            return "กรุณาใส่ชื่อ dataset จาก Hugging Face"
+        try:
+            # Imported inside the try so a missing `datasets` package is
+            # reported in the status box rather than raised out of the handler.
+            from datasets import load_dataset
+            load_dataset(dataset_name)
+            return f"✅ โหลด Dataset {dataset_name} สำเร็จ"
+        except Exception as e:
+            return f"❌ โหลด Dataset {dataset_name} ไม่สำเร็จ: {e}"
+    hf_dataset_btn.click(
+        fn=download_hf_dataset,
+        inputs=[hf_dataset_name],
+        outputs=[hf_dataset_status]
+        )
     with gr.Tab("🤗 Hugging Face Model Download"):
         hf_model_name = gr.Textbox(