Nattapong Tapachoom
commited on
Commit
·
ac711d6
1
Parent(s):
c484890
Add .gitignore file and update requirements.txt with additional dependencies
Browse files- .gitignore +33 -0
- app.py +418 -80
- requirements.txt +3 -1
.gitignore
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*.pyo
|
5 |
+
*.pyd
|
6 |
+
*.pyc
|
7 |
+
*.ipynb_checkpoints
|
8 |
+
|
9 |
+
# VSCode
|
10 |
+
.vscode/
|
11 |
+
.history/
|
12 |
+
|
13 |
+
# Data & Output
|
14 |
+
*.csv
|
15 |
+
*.jsonl
|
16 |
+
*.hf_dataset
|
17 |
+
output_*/
|
18 |
+
*.arrow
|
19 |
+
*.parquet
|
20 |
+
|
21 |
+
# Environment
|
22 |
+
.env
|
23 |
+
.venv/
|
24 |
+
env/
|
25 |
+
venv/
|
26 |
+
*.egg-info/
|
27 |
+
|
28 |
+
# OS
|
29 |
+
.DS_Store
|
30 |
+
Thumbs.db
|
31 |
+
|
32 |
+
# Others
|
33 |
+
*.log
|
app.py
CHANGED
@@ -48,24 +48,50 @@ def load_data(source_type, path_or_name):
|
|
48 |
with open(path_or_name, 'r', encoding="utf-8") as f:
|
49 |
raw_data = json.load(f)
|
50 |
data = raw_data if isinstance(raw_data, list) else [raw_data]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
else:
|
52 |
raise ValueError(f"ไม่รองรับไฟล์ประเภท {ext}")
|
53 |
|
54 |
# แปลงเป็น DataSample objects
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
samples = []
|
56 |
for i, item in enumerate(data):
|
57 |
try:
|
58 |
-
|
59 |
-
|
60 |
-
item['id'] = str(uuid.uuid4())
|
61 |
-
if 'source' not in item:
|
62 |
-
item['source'] = f"local_{os.path.basename(path_or_name)}"
|
63 |
-
if 'difficulty' not in item:
|
64 |
-
item['difficulty'] = "medium"
|
65 |
-
if 'language' not in item:
|
66 |
-
item['language'] = "th"
|
67 |
-
|
68 |
-
samples.append(DataSample(**item))
|
69 |
except ValidationError as e:
|
70 |
print(f"Warning: รายการที่ {i+1} ข้อมูลไม่ถูกต้อง: {e}")
|
71 |
continue
|
@@ -116,6 +142,18 @@ def load_data(source_type, path_or_name):
|
|
116 |
raise Exception(f"ข้อผิดพลาดในการโหลดข้อมูล: {e}")
|
117 |
|
118 |
# 3. LLM API Integration (รองรับหลาย provider)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
class LLMProvider:
|
120 |
def __init__(self, provider="ollama", api_key=None, base_url="http://localhost:11434"):
|
121 |
self.provider = provider
|
@@ -126,12 +164,10 @@ class LLMProvider:
|
|
126 |
try:
|
127 |
if self.provider == "ollama":
|
128 |
return self._generate_ollama(prompt, model, temperature, max_tokens)
|
129 |
-
elif self.provider == "openai":
|
130 |
-
return self._generate_openai(prompt, model, temperature, max_tokens)
|
131 |
-
elif self.provider == "huggingface":
|
132 |
-
return self._generate_huggingface(prompt, model, temperature, max_tokens)
|
133 |
elif self.provider == "deepseek":
|
134 |
return self._generate_deepseek(prompt, model, temperature, max_tokens)
|
|
|
|
|
135 |
elif self.provider == "hf_local":
|
136 |
return self._generate_hf_local(prompt, model, temperature, max_tokens)
|
137 |
else:
|
@@ -155,20 +191,7 @@ class LLMProvider:
|
|
155 |
response.raise_for_status()
|
156 |
return response.json()["response"]
|
157 |
|
158 |
-
def
|
159 |
-
import openai
|
160 |
-
if self.api_key:
|
161 |
-
openai.api_key = self.api_key
|
162 |
-
|
163 |
-
response = openai.ChatCompletion.create(
|
164 |
-
model=model,
|
165 |
-
messages=[{"role": "user", "content": prompt}],
|
166 |
-
temperature=temperature,
|
167 |
-
max_tokens=max_tokens
|
168 |
-
)
|
169 |
-
return response.choices[0].message.content
|
170 |
-
|
171 |
-
def _generate_deepseek(self, prompt, model, temperature, max_tokens):
|
172 |
url = "https://api.deepseek.com/v1/chat/completions"
|
173 |
headers = {
|
174 |
"Authorization": f"Bearer {self.api_key}",
|
@@ -185,7 +208,6 @@ class LLMProvider:
|
|
185 |
response = requests.post(url, headers=headers, json=payload)
|
186 |
response.raise_for_status()
|
187 |
result = response.json()
|
188 |
-
# DeepSeek API returns: {"choices":[{"message":{"role":"assistant","content":"..."}}], ...}
|
189 |
return result["choices"][0]["message"]["content"]
|
190 |
|
191 |
def _generate_hf_local(self, prompt, model, temperature, max_tokens):
|
@@ -235,13 +257,17 @@ class LLMProvider:
|
|
235 |
|
236 |
# 4. Dataset Generation & Augmentation
|
237 |
def generate_new_samples(samples: List[DataSample], llm_provider: LLMProvider,
|
238 |
-
generation_type="augment", n_generate=1, custom_prompt=""):
|
239 |
"""
|
240 |
generation_type: 'augment', 'roleplay', 'topic_conditioning', 'self_critique'
|
|
|
241 |
"""
|
242 |
generated_samples = []
|
243 |
|
244 |
-
|
|
|
|
|
|
|
245 |
for _ in range(n_generate):
|
246 |
try:
|
247 |
if generation_type == "augment":
|
@@ -326,10 +352,10 @@ Format as JSON:
|
|
326 |
}}"""
|
327 |
|
328 |
else: # custom prompt
|
329 |
-
prompt = custom_prompt.format(**sample.
|
330 |
|
331 |
# Generate ด้วย LLM
|
332 |
-
response = llm_provider.generate(prompt)
|
333 |
|
334 |
# Parse JSON response
|
335 |
try:
|
@@ -426,11 +452,11 @@ def difficulty_assessment(samples: List[DataSample]) -> List[DataSample]:
|
|
426 |
|
427 |
return samples
|
428 |
|
429 |
-
def translate_to_multilingual(samples: List[DataSample], llm_provider: LLMProvider, target_lang="en") -> List[DataSample]:
|
430 |
"""Translate samples to target language"""
|
431 |
translated = []
|
432 |
|
433 |
-
for sample in samples[:
|
434 |
if sample.language == target_lang:
|
435 |
continue
|
436 |
|
@@ -453,7 +479,7 @@ Format as JSON:
|
|
453 |
"rationale": "translated rationale"
|
454 |
}}"""
|
455 |
|
456 |
-
response = llm_provider.generate(prompt)
|
457 |
|
458 |
# Parse JSON
|
459 |
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
@@ -480,9 +506,9 @@ Format as JSON:
|
|
480 |
|
481 |
return translated
|
482 |
|
483 |
-
def add_multiple_choice_options(samples: List[DataSample], llm_provider: LLMProvider) -> List[DataSample]:
|
484 |
"""Add multiple choice options to samples"""
|
485 |
-
for sample in samples[:
|
486 |
if sample.options: # มี options อยู่แล้ว
|
487 |
continue
|
488 |
|
@@ -502,7 +528,7 @@ Format as JSON array:
|
|
502 |
Make sure the correct answer ({sample.answer}) is included as one of the options.
|
503 |
"""
|
504 |
|
505 |
-
response = llm_provider.generate(prompt)
|
506 |
|
507 |
# Parse JSON array
|
508 |
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
@@ -518,28 +544,188 @@ Make sure the correct answer ({sample.answer}) is included as one of the options
|
|
518 |
return samples
|
519 |
|
520 |
# 6. Export & Visualization
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
521 |
def export_dataset(samples: List[DataSample], format_type="csv", output_path="output"):
|
522 |
"""Export dataset ในรูปแบบต่างๆ"""
|
523 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
524 |
|
525 |
if format_type == "csv":
|
526 |
-
df = pd.DataFrame([s.
|
527 |
filename = f"{output_path}_{timestamp}.csv"
|
528 |
df.to_csv(filename, index=False, encoding="utf-8-sig")
|
529 |
-
return filename
|
530 |
-
|
531 |
elif format_type == "jsonl":
|
532 |
filename = f"{output_path}_{timestamp}.jsonl"
|
533 |
with open(filename, 'w', encoding="utf-8") as f:
|
534 |
for sample in samples:
|
535 |
-
f.write(json.dumps(sample.
|
536 |
return filename
|
537 |
|
538 |
elif format_type == "hf_dataset":
|
539 |
# Create Hugging Face Dataset
|
540 |
-
data_dict = {key: [] for key in samples[0].
|
541 |
for sample in samples:
|
542 |
-
sample_dict = sample.
|
543 |
for key, value in sample_dict.items():
|
544 |
data_dict[key].append(value)
|
545 |
|
@@ -548,6 +734,26 @@ def export_dataset(samples: List[DataSample], format_type="csv", output_path="ou
|
|
548 |
dataset.save_to_disk(dirname)
|
549 |
return dirname
|
550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
else:
|
552 |
raise ValueError(f"ไม่รองรับรูปแบบ: {format_type}")
|
553 |
|
@@ -556,7 +762,7 @@ def get_dataset_stats(samples: List[DataSample]) -> Dict[str, Any]:
|
|
556 |
if not samples:
|
557 |
return {"total": 0}
|
558 |
|
559 |
-
df = pd.DataFrame([s.
|
560 |
|
561 |
stats = {
|
562 |
"total": len(samples),
|
@@ -575,7 +781,7 @@ def get_dataset_stats(samples: List[DataSample]) -> Dict[str, Any]:
|
|
575 |
|
576 |
# 7. Main Workflow Function
|
577 |
def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_url,
|
578 |
-
generation_type, n_generate, custom_prompt, target_language,
|
579 |
add_multiple_choice, export_format):
|
580 |
try:
|
581 |
progress_text = "เริ่มต้น workflow...\n"
|
@@ -591,16 +797,22 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
591 |
provider=llm_provider_type,
|
592 |
api_key=api_key if api_key else None,
|
593 |
base_url=base_url if base_url else "http://localhost:11434"
|
594 |
-
)
|
595 |
-
|
596 |
-
# 3. Generate new samples
|
597 |
if n_generate > 0:
|
598 |
-
progress_text += f"✨ กำลัง generate {n_generate} samples ใหม่ ({generation_type})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
599 |
if llm_provider_type == "huggingface":
|
600 |
with gr.Progress(track_tqdm=True, desc="กำลัง generate ด้วย Hugging Face..."):
|
601 |
-
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt)
|
602 |
else:
|
603 |
-
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt)
|
604 |
samples.extend(new_samples)
|
605 |
progress_text += f"✅ Generate สำเร็จ {len(new_samples)} samples ใหม่\n"
|
606 |
|
@@ -615,26 +827,27 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
615 |
|
616 |
samples = difficulty_assessment(samples)
|
617 |
progress_text += f" - ประเมิน difficulty เสร็จสิ้น\n"
|
618 |
-
|
619 |
-
# 5. Translation
|
620 |
if target_language and target_language != "none":
|
621 |
progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
|
|
|
622 |
if llm_provider_type == "huggingface":
|
623 |
with gr.Progress(track_tqdm=True, desc="กำลังแปลด้วย Hugging Face..."):
|
624 |
-
translated = translate_to_multilingual(samples, llm_provider, target_language)
|
625 |
else:
|
626 |
-
translated = translate_to_multilingual(samples, llm_provider, target_language)
|
627 |
samples.extend(translated)
|
628 |
progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
|
629 |
|
630 |
# 6. Add multiple choice
|
631 |
if add_multiple_choice:
|
632 |
progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
|
|
|
633 |
if llm_provider_type == "huggingface":
|
634 |
with gr.Progress(track_tqdm=True, desc="กำลังเพิ่มตัวเลือกด้วย Hugging Face..."):
|
635 |
-
samples = add_multiple_choice_options(samples, llm_provider)
|
636 |
else:
|
637 |
-
samples = add_multiple_choice_options(samples, llm_provider)
|
638 |
progress_text += "✅ เพิ่ม multiple choice เสร็จสิ้น\n"
|
639 |
|
640 |
# 7. Export
|
@@ -651,7 +864,7 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
651 |
progress_text += f" - Languages: {stats['languages']}\n"
|
652 |
progress_text += f" - มี Multiple Choice: {stats['with_options']}\n"
|
653 |
|
654 |
-
return progress_text, pd.DataFrame([s.
|
655 |
|
656 |
except Exception as e:
|
657 |
error_text = f"❌ เกิดข้อผิดพลาด: {str(e)}"
|
@@ -661,25 +874,41 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
661 |
with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo:
|
662 |
gr.Markdown("# 🤖 ระบบ Generate Dataset จากโมเดล AI")
|
663 |
gr.Markdown("ระบบสำหรับสร้าง, ขยาย, และประมวลผล dataset ด้วย AI models")
|
664 |
-
|
665 |
with gr.Tab("📂 Dataset Input"):
|
666 |
with gr.Row():
|
667 |
source_type = gr.Radio(
|
668 |
-
["local", "hf"],
|
669 |
label="ประเภทแหล่งข้อมูล",
|
670 |
info="local = ไฟล์ในเครื่อง, hf = Hugging Face dataset",
|
671 |
value="local"
|
672 |
)
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
677 |
)
|
678 |
|
679 |
with gr.Tab("🤖 LLM Settings"):
|
680 |
with gr.Row():
|
681 |
llm_provider_type = gr.Dropdown(
|
682 |
-
["ollama", "
|
683 |
label="LLM Provider",
|
684 |
value="ollama",
|
685 |
info="เลือกผู้ให้บริการ LLM"
|
@@ -687,13 +916,74 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
687 |
api_key = gr.Textbox(
|
688 |
label="API Key (ถ้าจำเป็น)",
|
689 |
type="password",
|
690 |
-
placeholder="สำหรับ
|
691 |
)
|
|
|
692 |
base_url = gr.Textbox(
|
693 |
label="Base URL",
|
694 |
value="http://localhost:11434",
|
695 |
info="สำหรับ Ollama หรือ local LLM server"
|
696 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
697 |
|
698 |
with gr.Tab("✨ Generation Settings"):
|
699 |
with gr.Row():
|
@@ -708,6 +998,18 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
708 |
label="จำนวนรอบ Generate",
|
709 |
info="จำนวน samples ใหม่ที่จะสร้างต่อ original sample"
|
710 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
711 |
|
712 |
custom_prompt = gr.Textbox(
|
713 |
label="Custom Prompt (ถ้าเลือก custom)",
|
@@ -715,15 +1017,31 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
715 |
lines=3,
|
716 |
visible=False
|
717 |
)
|
718 |
-
|
719 |
def update_custom_prompt_visibility(gen_type):
|
720 |
return gr.update(visible=(gen_type == "custom"))
|
721 |
|
|
|
|
|
|
|
|
|
722 |
generation_type.change(
|
723 |
update_custom_prompt_visibility,
|
724 |
inputs=[generation_type],
|
725 |
outputs=[custom_prompt]
|
726 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
727 |
|
728 |
with gr.Tab("🔧 Post-processing"):
|
729 |
with gr.Row():
|
@@ -739,9 +1057,8 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
739 |
info="สร้างตัวเลือกผิดสำหรับทำ multiple choice"
|
740 |
)
|
741 |
|
742 |
-
with gr.Tab("💾 Export Settings"):
|
743 |
-
|
744 |
-
["csv", "jsonl", "hf_dataset"],
|
745 |
label="รูปแบบ Export",
|
746 |
value="csv",
|
747 |
info="รูปแบบไฟล์ที่ต้องการ export"
|
@@ -763,23 +1080,33 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
763 |
preview_output = gr.HTML(
|
764 |
label="ตัวอย่างข้อมูล (10 รายการแรก)"
|
765 |
)
|
766 |
-
|
767 |
-
# Event handlers
|
768 |
-
run_btn.click(
|
769 |
fn=main_workflow,
|
770 |
inputs=[
|
771 |
source_type, path_or_name, llm_provider_type, api_key, base_url,
|
772 |
-
generation_type, n_generate, custom_prompt, target_language,
|
773 |
add_multiple_choice, export_format
|
774 |
],
|
775 |
outputs=[progress_output, preview_output]
|
776 |
)
|
777 |
-
|
778 |
clear_btn.click(
|
779 |
lambda: ("", ""),
|
780 |
outputs=[progress_output, preview_output]
|
781 |
)
|
782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
783 |
# ตัวอย่าง dataset schema
|
784 |
with gr.Tab("📋 ตัวอย่าง Dataset Schema"):
|
785 |
gr.Markdown("""
|
@@ -803,11 +1130,22 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
803 |
id,context,question,answer,rationale,category,difficulty,source,language
|
804 |
1,"นักเรียนคนหนึ่งเห็นเพื่อนทำโกง","ควรรายงานครูหรือไม่","ควรรายงาน","เพื่อความยุติธรรม","การศึกษา","medium","manual","th"
|
805 |
```
|
806 |
-
|
807 |
-
## ตัวอย่างไฟล์ JSONL:
|
808 |
```json
|
809 |
{"id": "1", "context": "นักเรียนคนหนึ่งเห็นเพื่อนทำโกง", "question": "ควรรายงานครูหรือไม่", "answer": "ควรรายงาน", "rationale": "เพื่อความยุติธรรม", "category": "การศึกษา", "difficulty": "medium", "source": "manual", "language": "th"}
|
810 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
811 |
""")
|
812 |
|
813 |
demo.launch()
|
|
|
48 |
with open(path_or_name, 'r', encoding="utf-8") as f:
|
49 |
raw_data = json.load(f)
|
50 |
data = raw_data if isinstance(raw_data, list) else [raw_data]
|
51 |
+
elif ext == ".parquet":
|
52 |
+
df = pd.read_parquet(path_or_name)
|
53 |
+
data = df.to_dict(orient="records")
|
54 |
+
elif os.path.isdir(path_or_name):
|
55 |
+
# โหลด HF Dataset ที่ save ไว้
|
56 |
+
try:
|
57 |
+
dataset = Dataset.load_from_disk(path_or_name)
|
58 |
+
data = [dict(item) for item in dataset]
|
59 |
+
except Exception as e:
|
60 |
+
raise ValueError(f"ไม่สามารถโหลด HF dataset จาก {path_or_name}: {e}")
|
61 |
else:
|
62 |
raise ValueError(f"ไม่รองรับไฟล์ประเภท {ext}")
|
63 |
|
64 |
# แปลงเป็น DataSample objects
|
65 |
+
def map_fields_to_datasample(item):
|
66 |
+
# Auto mapping: พยายาม map field ที่ขาดหาย
|
67 |
+
mapped = dict(item)
|
68 |
+
if 'context' not in mapped:
|
69 |
+
mapped['context'] = mapped.get('subject', '') or mapped.get('title', '') or ''
|
70 |
+
if 'category' not in mapped:
|
71 |
+
mapped['category'] = str(mapped.get('grade', '')) or mapped.get('category', '') or ''
|
72 |
+
if 'question' not in mapped:
|
73 |
+
mapped['question'] = mapped.get('question', '') or ''
|
74 |
+
if 'answer' not in mapped:
|
75 |
+
mapped['answer'] = mapped.get('answer', '') or ''
|
76 |
+
if 'rationale' not in mapped:
|
77 |
+
mapped['rationale'] = mapped.get('rationale', '') or ''
|
78 |
+
if 'options' not in mapped:
|
79 |
+
mapped['options'] = mapped.get('options', None)
|
80 |
+
if 'id' not in mapped:
|
81 |
+
mapped['id'] = str(uuid.uuid4())
|
82 |
+
if 'source' not in mapped:
|
83 |
+
mapped['source'] = f"local_{os.path.basename(path_or_name)}"
|
84 |
+
if 'difficulty' not in mapped:
|
85 |
+
mapped['difficulty'] = "medium"
|
86 |
+
if 'language' not in mapped:
|
87 |
+
mapped['language'] = "th"
|
88 |
+
return mapped
|
89 |
+
|
90 |
samples = []
|
91 |
for i, item in enumerate(data):
|
92 |
try:
|
93 |
+
mapped_item = map_fields_to_datasample(item)
|
94 |
+
samples.append(DataSample(**mapped_item))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
except ValidationError as e:
|
96 |
print(f"Warning: รายการที่ {i+1} ข้อมูลไม่ถูกต้อง: {e}")
|
97 |
continue
|
|
|
142 |
raise Exception(f"ข้อผิดพลาดในการโหลดข้อมูล: {e}")
|
143 |
|
144 |
# 3. LLM API Integration (รองรับหลาย provider)
|
145 |
+
def get_ollama_models(base_url="http://localhost:11434"):
|
146 |
+
"""ดึงรายชื่อ models จาก Ollama"""
|
147 |
+
try:
|
148 |
+
response = requests.get(f"{base_url}/api/tags")
|
149 |
+
response.raise_for_status()
|
150 |
+
data = response.json()
|
151 |
+
models = [model["name"] for model in data.get("models", [])]
|
152 |
+
return models if models else ["llama3.2"] # fallback
|
153 |
+
except Exception as e:
|
154 |
+
print(f"Warning: ไม่สามารถดึงรายชื่อ models จาก Ollama: {e}")
|
155 |
+
return ["llama3.2", "llama3.1", "gemma2", "qwen2.5"] # default models
|
156 |
+
|
157 |
class LLMProvider:
|
158 |
def __init__(self, provider="ollama", api_key=None, base_url="http://localhost:11434"):
|
159 |
self.provider = provider
|
|
|
164 |
try:
|
165 |
if self.provider == "ollama":
|
166 |
return self._generate_ollama(prompt, model, temperature, max_tokens)
|
|
|
|
|
|
|
|
|
167 |
elif self.provider == "deepseek":
|
168 |
return self._generate_deepseek(prompt, model, temperature, max_tokens)
|
169 |
+
elif self.provider == "huggingface":
|
170 |
+
return self._generate_huggingface(prompt, model, temperature, max_tokens)
|
171 |
elif self.provider == "hf_local":
|
172 |
return self._generate_hf_local(prompt, model, temperature, max_tokens)
|
173 |
else:
|
|
|
191 |
response.raise_for_status()
|
192 |
return response.json()["response"]
|
193 |
|
194 |
+
def _generate_deepseek(self, prompt, model="deepseek-chat", temperature=0.7, max_tokens=1000):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
url = "https://api.deepseek.com/v1/chat/completions"
|
196 |
headers = {
|
197 |
"Authorization": f"Bearer {self.api_key}",
|
|
|
208 |
response = requests.post(url, headers=headers, json=payload)
|
209 |
response.raise_for_status()
|
210 |
result = response.json()
|
|
|
211 |
return result["choices"][0]["message"]["content"]
|
212 |
|
213 |
def _generate_hf_local(self, prompt, model, temperature, max_tokens):
|
|
|
257 |
|
258 |
# 4. Dataset Generation & Augmentation
|
259 |
def generate_new_samples(samples: List[DataSample], llm_provider: LLMProvider,
|
260 |
+
generation_type="augment", n_generate=1, custom_prompt="", model="llama3.2", max_samples_to_process=5):
|
261 |
"""
|
262 |
generation_type: 'augment', 'roleplay', 'topic_conditioning', 'self_critique'
|
263 |
+
max_samples_to_process: จำนวน samples เดิมที่จะใช้ในการ generate
|
264 |
"""
|
265 |
generated_samples = []
|
266 |
|
267 |
+
# จำกัดจำนวน samples ตามที่ผู้ใช้เลือก
|
268 |
+
samples_to_use = samples[:max_samples_to_process]
|
269 |
+
|
270 |
+
for sample in samples_to_use:
|
271 |
for _ in range(n_generate):
|
272 |
try:
|
273 |
if generation_type == "augment":
|
|
|
352 |
}}"""
|
353 |
|
354 |
else: # custom prompt
|
355 |
+
prompt = custom_prompt.format(**sample.model_dump())
|
356 |
|
357 |
# Generate ด้วย LLM
|
358 |
+
response = llm_provider.generate(prompt, model=model)
|
359 |
|
360 |
# Parse JSON response
|
361 |
try:
|
|
|
452 |
|
453 |
return samples
|
454 |
|
455 |
+
def translate_to_multilingual(samples: List[DataSample], llm_provider: LLMProvider, target_lang="en", model="llama3.2", max_samples=3) -> List[DataSample]:
|
456 |
"""Translate samples to target language"""
|
457 |
translated = []
|
458 |
|
459 |
+
for sample in samples[:max_samples]: # จำกัดตามที่ระบุ
|
460 |
if sample.language == target_lang:
|
461 |
continue
|
462 |
|
|
|
479 |
"rationale": "translated rationale"
|
480 |
}}"""
|
481 |
|
482 |
+
response = llm_provider.generate(prompt, model=model)
|
483 |
|
484 |
# Parse JSON
|
485 |
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
|
|
506 |
|
507 |
return translated
|
508 |
|
509 |
+
def add_multiple_choice_options(samples: List[DataSample], llm_provider: LLMProvider, model="llama3.2", max_samples=3) -> List[DataSample]:
|
510 |
"""Add multiple choice options to samples"""
|
511 |
+
for sample in samples[:max_samples]: # จำกัดตามที่ระบุ
|
512 |
if sample.options: # มี options อยู่แล้ว
|
513 |
continue
|
514 |
|
|
|
528 |
Make sure the correct answer ({sample.answer}) is included as one of the options.
|
529 |
"""
|
530 |
|
531 |
+
response = llm_provider.generate(prompt, model=model)
|
532 |
|
533 |
# Parse JSON array
|
534 |
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
|
|
544 |
return samples
|
545 |
|
546 |
# 6. Export & Visualization
|
547 |
+
def preview_data(source_type, path_or_name, file_upload):
|
548 |
+
"""Preview dataset before processing"""
|
549 |
+
try:
|
550 |
+
# ใช้ไฟล์ที่อัปโหลดถ้ามี หรือใช้ path ที่กรอก
|
551 |
+
file_path = file_upload.name if file_upload else path_or_name
|
552 |
+
|
553 |
+
if source_type == "local":
|
554 |
+
if not file_path:
|
555 |
+
return gr.update(visible=False), "กรุณาเลือกไฟล์หรือใส่ path"
|
556 |
+
|
557 |
+
if not os.path.exists(file_path):
|
558 |
+
return gr.update(visible=False), f"ไม่พบไฟล์: {file_path}"
|
559 |
+
|
560 |
+
ext = os.path.splitext(file_path)[-1].lower()
|
561 |
+
|
562 |
+
if ext == ".csv":
|
563 |
+
df = pd.read_csv(file_path, encoding="utf-8")
|
564 |
+
preview_html = f"""
|
565 |
+
<div style="margin: 10px 0;">
|
566 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
567 |
+
<p><strong>จำนวนแถว:</strong> {len(df)} | <strong>จำนวนคอลัมน์:</strong> {len(df.columns)}</p>
|
568 |
+
<p><strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
569 |
+
<h5>ตัวอย่างข้อมูล (5 แถวแรก):</h5>
|
570 |
+
{df.head().to_html(classes='table table-striped', escape=False)}
|
571 |
+
</div>
|
572 |
+
"""
|
573 |
+
return gr.update(visible=True, value=preview_html), ""
|
574 |
+
|
575 |
+
elif ext == ".jsonl":
|
576 |
+
data = []
|
577 |
+
with open(file_path, 'r', encoding="utf-8") as f:
|
578 |
+
for i, line in enumerate(f):
|
579 |
+
if i >= 5: # แสดงแค่ 5 บรรทัดแรก
|
580 |
+
break
|
581 |
+
try:
|
582 |
+
data.append(json.loads(line.strip()))
|
583 |
+
except json.JSONDecodeError:
|
584 |
+
continue
|
585 |
+
|
586 |
+
if data:
|
587 |
+
df = pd.DataFrame(data)
|
588 |
+
total_lines = sum(1 for _ in open(file_path, 'r', encoding="utf-8"))
|
589 |
+
preview_html = f"""
|
590 |
+
<div style="margin: 10px 0;">
|
591 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
592 |
+
<p><strong>จำนวนบรรทัด:</strong> {total_lines} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
593 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
594 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
595 |
+
</div>
|
596 |
+
"""
|
597 |
+
return gr.update(visible=True, value=preview_html), ""
|
598 |
+
else:
|
599 |
+
return gr.update(visible=False), "ไม่สามารถอ่านข้อมูลจากไฟล์ JSONL"
|
600 |
+
|
601 |
+
elif ext == ".json":
|
602 |
+
with open(file_path, 'r', encoding="utf-8") as f:
|
603 |
+
data = json.load(f)
|
604 |
+
|
605 |
+
if isinstance(data, list):
|
606 |
+
df = pd.DataFrame(data[:5]) # แสดงแค่ 5 รายการแรก
|
607 |
+
preview_html = f"""
|
608 |
+
<div style="margin: 10px 0;">
|
609 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
610 |
+
<p><strong>จำนวนรายการ:</strong> {len(data)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
611 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
612 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
613 |
+
</div>
|
614 |
+
"""
|
615 |
+
else:
|
616 |
+
# Single object
|
617 |
+
df = pd.DataFrame([data])
|
618 |
+
preview_html = f"""
|
619 |
+
<div style="margin: 10px 0;">
|
620 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
621 |
+
<p><strong>ประเภท:</strong> Object เดียว | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
622 |
+
<h5>ข้อมูล:</h5>
|
623 |
+
{df.to_html(classes='table table-striped', escape=False)} </div>
|
624 |
+
"""
|
625 |
+
return gr.update(visible=True, value=preview_html), ""
|
626 |
+
elif ext == ".parquet":
|
627 |
+
df = pd.read_parquet(file_path)
|
628 |
+
preview_html = f"""
|
629 |
+
<div style="margin: 10px 0;">
|
630 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
631 |
+
<p><strong>จำนวนแถว:</strong> {len(df)} | <strong>จำนวนคอลัมน์:</strong> {len(df.columns)}</p>
|
632 |
+
<p><strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
633 |
+
<h5>ตัวอย่างข้อมูล (5 แถวแรก):</h5>
|
634 |
+
{df.head().to_html(classes='table table-striped', escape=False)}
|
635 |
+
</div>
|
636 |
+
"""
|
637 |
+
return gr.update(visible=True, value=preview_html), ""
|
638 |
+
elif os.path.isdir(file_path):
|
639 |
+
# ตรวจสอบว่าเป็น HF dataset directory หรือไม่
|
640 |
+
if os.path.exists(os.path.join(file_path, "dataset_info.json")):
|
641 |
+
try:
|
642 |
+
dataset = Dataset.load_from_disk(file_path)
|
643 |
+
sample_data = [dict(item) for i, item in enumerate(dataset) if i < 5]
|
644 |
+
df = pd.DataFrame(sample_data)
|
645 |
+
|
646 |
+
preview_html = f"""
|
647 |
+
<div style="margin: 10px 0;">
|
648 |
+
<h4>📁 HF Dataset Directory: {os.path.basename(file_path)}</h4>
|
649 |
+
<p><strong>จำนวนรายการ:</strong> {len(dataset)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
650 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
651 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
652 |
+
</div>
|
653 |
+
"""
|
654 |
+
return gr.update(visible=True, value=preview_html), ""
|
655 |
+
except Exception as e:
|
656 |
+
return gr.update(visible=False), f"ไม่สามารถโหลด HF dataset: {str(e)}"
|
657 |
+
else:
|
658 |
+
return gr.update(visible=False), f"ไม่ใช่ HF dataset directory ที่ถูกต้อง"
|
659 |
+
else:
|
660 |
+
return gr.update(visible=False), f"ไม่รองรับไฟล์ประเภท {ext}"
|
661 |
+
|
662 |
+
elif source_type == "hf":
|
663 |
+
if not path_or_name:
|
664 |
+
return gr.update(visible=False), "กรุณาใส่ชื่อ dataset จาก Hugging Face"
|
665 |
+
|
666 |
+
# Preview HF dataset
|
667 |
+
try:
|
668 |
+
ds = load_dataset(path_or_name)
|
669 |
+
available_splits = list(ds.keys())
|
670 |
+
split_name = available_splits[0]
|
671 |
+
data = ds[split_name]
|
672 |
+
|
673 |
+
# แปลงตัวอย่างเป็น DataFrame
|
674 |
+
sample_data = []
|
675 |
+
for i, item in enumerate(data):
|
676 |
+
if i >= 5: # แสดงแค่ 5 รายการแรก
|
677 |
+
break
|
678 |
+
sample_data.append(dict(item))
|
679 |
+
|
680 |
+
if sample_data:
|
681 |
+
df = pd.DataFrame(sample_data)
|
682 |
+
preview_html = f"""
|
683 |
+
<div style="margin: 10px 0;">
|
684 |
+
<h4>🤗 Hugging Face Dataset: {path_or_name}</h4>
|
685 |
+
<p><strong>Split:</strong> {split_name} | <strong>จำนวนรายการ:</strong> {len(data)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
686 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
687 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
688 |
+
</div>
|
689 |
+
"""
|
690 |
+
return gr.update(visible=True, value=preview_html), ""
|
691 |
+
else:
|
692 |
+
return gr.update(visible=False), "Dataset ว่างเปล่า"
|
693 |
+
|
694 |
+
except Exception as e:
|
695 |
+
return gr.update(visible=False), f"ไม่สามารถโหลด HF dataset: {str(e)}"
|
696 |
+
|
697 |
+
return gr.update(visible=False), "กรุณาเลือกประเภทข้อมูล"
|
698 |
+
|
699 |
+
except Exception as e:
|
700 |
+
return gr.update(visible=False), f"เกิดข้อผิดพลาด: {str(e)}"
|
701 |
+
|
702 |
+
def update_path_from_file(file_upload):
|
703 |
+
"""อัปเดต path เมื่อมีการเลือกไฟล์"""
|
704 |
+
if file_upload:
|
705 |
+
return file_upload.name
|
706 |
+
return ""
|
707 |
+
|
708 |
def export_dataset(samples: List[DataSample], format_type="csv", output_path="output"):
|
709 |
"""Export dataset ในรูปแบบต่างๆ"""
|
710 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
711 |
|
712 |
if format_type == "csv":
|
713 |
+
df = pd.DataFrame([s.model_dump() for s in samples])
|
714 |
filename = f"{output_path}_{timestamp}.csv"
|
715 |
df.to_csv(filename, index=False, encoding="utf-8-sig")
|
716 |
+
return filename
|
|
|
717 |
elif format_type == "jsonl":
|
718 |
filename = f"{output_path}_{timestamp}.jsonl"
|
719 |
with open(filename, 'w', encoding="utf-8") as f:
|
720 |
for sample in samples:
|
721 |
+
f.write(json.dumps(sample.model_dump(), ensure_ascii=False) + "\n")
|
722 |
return filename
|
723 |
|
724 |
elif format_type == "hf_dataset":
|
725 |
# Create Hugging Face Dataset
|
726 |
+
data_dict = {key: [] for key in samples[0].model_dump().keys()}
|
727 |
for sample in samples:
|
728 |
+
sample_dict = sample.model_dump()
|
729 |
for key, value in sample_dict.items():
|
730 |
data_dict[key].append(value)
|
731 |
|
|
|
734 |
dataset.save_to_disk(dirname)
|
735 |
return dirname
|
736 |
|
737 |
+
elif format_type == "parquet":
|
738 |
+
# Export เป็น Parquet format
|
739 |
+
df = pd.DataFrame([s.model_dump() for s in samples])
|
740 |
+
filename = f"{output_path}_{timestamp}.parquet"
|
741 |
+
df.to_parquet(filename, index=False, engine='pyarrow')
|
742 |
+
return filename
|
743 |
+
|
744 |
+
elif format_type == "hf_dataset_parquet":
|
745 |
+
# Create Hugging Face Dataset และ save เป็น Parquet
|
746 |
+
data_dict = {key: [] for key in samples[0].model_dump().keys()}
|
747 |
+
for sample in samples:
|
748 |
+
sample_dict = sample.model_dump()
|
749 |
+
for key, value in sample_dict.items():
|
750 |
+
data_dict[key].append(value)
|
751 |
+
|
752 |
+
dataset = Dataset.from_dict(data_dict)
|
753 |
+
filename = f"{output_path}_{timestamp}.parquet"
|
754 |
+
dataset.to_parquet(filename)
|
755 |
+
return filename
|
756 |
+
|
757 |
else:
|
758 |
raise ValueError(f"ไม่รองรับรูปแบบ: {format_type}")
|
759 |
|
|
|
762 |
if not samples:
|
763 |
return {"total": 0}
|
764 |
|
765 |
+
df = pd.DataFrame([s.model_dump() for s in samples])
|
766 |
|
767 |
stats = {
|
768 |
"total": len(samples),
|
|
|
781 |
|
782 |
# 7. Main Workflow Function
|
783 |
def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_url,
|
784 |
+
ollama_model, deepseek_model, generation_type, n_generate, max_samples_to_process, custom_prompt, target_language,
|
785 |
add_multiple_choice, export_format):
|
786 |
try:
|
787 |
progress_text = "เริ่มต้น workflow...\n"
|
|
|
797 |
provider=llm_provider_type,
|
798 |
api_key=api_key if api_key else None,
|
799 |
base_url=base_url if base_url else "http://localhost:11434"
|
800 |
+
) # 3. Generate new samples
|
|
|
|
|
801 |
if n_generate > 0:
|
802 |
+
progress_text += f"✨ กำลัง generate {n_generate} samples ใหม่ ({generation_type}) จาก {min(max_samples_to_process, len(samples))} samples เดิม...\n"
|
803 |
+
# เลือกโมเดลที่เหมาะสม
|
804 |
+
if llm_provider_type == "ollama":
|
805 |
+
model_name = ollama_model
|
806 |
+
elif llm_provider_type == "deepseek":
|
807 |
+
model_name = deepseek_model
|
808 |
+
else:
|
809 |
+
model_name = "deepseek-chat" # default for other providers
|
810 |
+
|
811 |
if llm_provider_type == "huggingface":
|
812 |
with gr.Progress(track_tqdm=True, desc="กำลัง generate ด้วย Hugging Face..."):
|
813 |
+
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process)
|
814 |
else:
|
815 |
+
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process)
|
816 |
samples.extend(new_samples)
|
817 |
progress_text += f"✅ Generate สำเร็จ {len(new_samples)} samples ใหม่\n"
|
818 |
|
|
|
827 |
|
828 |
samples = difficulty_assessment(samples)
|
829 |
progress_text += f" - ประเมิน difficulty เสร็จสิ้น\n"
|
830 |
+
# 5. Translation
|
|
|
831 |
if target_language and target_language != "none":
|
832 |
progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
|
833 |
+
max_translate_samples = min(10, len(samples)) # จำกัดการแปลไม่เกิน 10 samples
|
834 |
if llm_provider_type == "huggingface":
|
835 |
with gr.Progress(track_tqdm=True, desc="กำลังแปลด้วย Hugging Face..."):
|
836 |
+
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
837 |
else:
|
838 |
+
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
839 |
samples.extend(translated)
|
840 |
progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
|
841 |
|
842 |
# 6. Add multiple choice
|
843 |
if add_multiple_choice:
|
844 |
progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
|
845 |
+
max_mc_samples = min(10, len(samples)) # จำกัดการสร้าง multiple choice ไม่เกิน 10 samples
|
846 |
if llm_provider_type == "huggingface":
|
847 |
with gr.Progress(track_tqdm=True, desc="กำลังเพิ่มตัวเลือกด้วย Hugging Face..."):
|
848 |
+
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
849 |
else:
|
850 |
+
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
851 |
progress_text += "✅ เพิ่ม multiple choice เสร็จสิ้น\n"
|
852 |
|
853 |
# 7. Export
|
|
|
864 |
progress_text += f" - Languages: {stats['languages']}\n"
|
865 |
progress_text += f" - มี Multiple Choice: {stats['with_options']}\n"
|
866 |
|
867 |
+
return progress_text, pd.DataFrame([s.model_dump() for s in samples]).head(10).to_html()
|
868 |
|
869 |
except Exception as e:
|
870 |
error_text = f"❌ เกิดข้อผิดพลาด: {str(e)}"
|
|
|
874 |
with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo:
|
875 |
gr.Markdown("# 🤖 ระบบ Generate Dataset จากโมเดล AI")
|
876 |
gr.Markdown("ระบบสำหรับสร้าง, ขยาย, และประมวลผล dataset ด้วย AI models")
|
|
|
877 |
with gr.Tab("📂 Dataset Input"):
|
878 |
with gr.Row():
|
879 |
source_type = gr.Radio(
|
880 |
+
["local", "hf"],
|
881 |
label="ประเภทแหล่งข้อมูล",
|
882 |
info="local = ไฟล์ในเครื่อง, hf = Hugging Face dataset",
|
883 |
value="local"
|
884 |
)
|
885 |
+
|
886 |
+
with gr.Row():
|
887 |
+
with gr.Column(scale=3): path_or_name = gr.Textbox(
|
888 |
+
label="Path หรือ Dataset Name",
|
889 |
+
placeholder="เช่น data.csv, data.parquet, output_hf_xxxx/ หรือ microsoft/DialoGPT-medium",
|
890 |
+
info="สำหรับ local: ใส่ path ไฟล์ (.csv, .jsonl, .json, .parquet) หรือ HF dataset directory / สำหรับ HF: ใส่ชื่อ dataset"
|
891 |
+
)
|
892 |
+
with gr.Column(scale=1): file_upload = gr.File(
|
893 |
+
label="หรือเลือกไฟล์",
|
894 |
+
file_types=[".csv", ".jsonl", ".json", ".parquet"],
|
895 |
+
visible=True
|
896 |
+
)
|
897 |
+
|
898 |
+
# Preview section
|
899 |
+
with gr.Row():
|
900 |
+
preview_btn = gr.Button("🔍 ดูตัวอย่างข้อมูล", variant="secondary")
|
901 |
+
|
902 |
+
with gr.Row():
|
903 |
+
data_preview = gr.HTML(
|
904 |
+
label="ตัวอย่างข้อมูล",
|
905 |
+
visible=False
|
906 |
)
|
907 |
|
908 |
with gr.Tab("🤖 LLM Settings"):
|
909 |
with gr.Row():
|
910 |
llm_provider_type = gr.Dropdown(
|
911 |
+
["ollama", "deepseek", "huggingface", "hf_local"],
|
912 |
label="LLM Provider",
|
913 |
value="ollama",
|
914 |
info="เลือกผู้ให้บริการ LLM"
|
|
|
916 |
api_key = gr.Textbox(
|
917 |
label="API Key (ถ้าจำเป็น)",
|
918 |
type="password",
|
919 |
+
placeholder="สำหรับ DeepSeek หรือ HuggingFace"
|
920 |
)
|
921 |
+
with gr.Row():
|
922 |
base_url = gr.Textbox(
|
923 |
label="Base URL",
|
924 |
value="http://localhost:11434",
|
925 |
info="สำหรับ Ollama หรือ local LLM server"
|
926 |
)
|
927 |
+
|
928 |
+
with gr.Row():
|
929 |
+
# Get available models and set appropriate default
|
930 |
+
available_models = get_ollama_models()
|
931 |
+
default_model = available_models[0] if available_models else "llama3.2"
|
932 |
+
|
933 |
+
ollama_model = gr.Dropdown(
|
934 |
+
choices=available_models,
|
935 |
+
label="Ollama Model",
|
936 |
+
value=default_model,
|
937 |
+
visible=True,
|
938 |
+
allow_custom_value=True,
|
939 |
+
info="เลือก model จาก Ollama"
|
940 |
+
)
|
941 |
+
|
942 |
+
deepseek_model = gr.Dropdown(
|
943 |
+
choices=["deepseek-chat", "deepseek-reasoner"],
|
944 |
+
label="DeepSeek Model",
|
945 |
+
value="deepseek-chat",
|
946 |
+
visible=False,
|
947 |
+
info="deepseek-chat = DeepSeek-V3-0324, deepseek-reasoner = DeepSeek-R1-0528"
|
948 |
+
)
|
949 |
+
|
950 |
+
refresh_models_btn = gr.Button(
|
951 |
+
"🔄 รีเฟรช Models",
|
952 |
+
size="sm",
|
953 |
+
visible=True
|
954 |
+
)
|
955 |
+
# ฟังก์ชันสำหรับรีเฟรช models
|
956 |
+
def refresh_ollama_models(base_url_val):
    """Re-query the Ollama server and refresh the model dropdown.

    Returns a Gradio update with the server's model list selected on its
    first entry; falls back to a single "llama3.2" choice when the server
    reports no models or the request fails.
    """
    try:
        models = get_ollama_models(base_url_val)
        if not models:
            # Server reachable but empty — keep the UI usable with a default.
            return gr.update(choices=["llama3.2"], value="llama3.2")
        return gr.update(choices=models, value=models[0])
    except Exception as err:
        # Network/server failure: log and fall back so the dropdown stays valid.
        print(f"Error refreshing models: {err}")
        return gr.update(choices=["llama3.2"], value="llama3.2")
|
966 |
+
# ฟังก์ชันสำหรับแสดง/ซ่อน model dropdown ตามผู้ให้บริการ
|
967 |
+
def update_model_visibility(provider):
    """Show or hide provider-specific widgets when the LLM provider changes.

    Returns updates for (ollama_model, deepseek_model, refresh_models_btn):
    the Ollama dropdown and its refresh button are visible only for
    "ollama"; the DeepSeek dropdown only for "deepseek".
    """
    flags = [
        provider == "ollama",    # ollama_model dropdown
        provider == "deepseek",  # deepseek_model dropdown
        provider == "ollama",    # refresh button accompanies the Ollama dropdown
    ]
    return tuple(gr.update(visible=flag) for flag in flags)
|
975 |
+
|
976 |
+
# Event handlers
|
977 |
+
refresh_models_btn.click(
|
978 |
+
fn=refresh_ollama_models,
|
979 |
+
inputs=[base_url],
|
980 |
+
outputs=[ollama_model]
|
981 |
+
)
|
982 |
+
llm_provider_type.change(
|
983 |
+
fn=update_model_visibility,
|
984 |
+
inputs=[llm_provider_type],
|
985 |
+
outputs=[ollama_model, deepseek_model, refresh_models_btn]
|
986 |
+
)
|
987 |
|
988 |
with gr.Tab("✨ Generation Settings"):
|
989 |
with gr.Row():
|
|
|
998 |
label="จำนวนรอบ Generate",
|
999 |
info="จำนวน samples ใหม่ที่จะสร้างต่อ original sample"
|
1000 |
)
|
1001 |
+
with gr.Row():
|
1002 |
+
max_samples_to_process = gr.Slider(
|
1003 |
+
1, 50, value=5, step=1,
|
1004 |
+
label="จำนวน Samples เดิมที่จะใช้ Generate",
|
1005 |
+
info="เลือกจำนวน samples จากข้อมูลเดิมที่จะใช้สร้างข้อมูลใหม่"
|
1006 |
+
)
|
1007 |
+
total_new_samples = gr.Number(
|
1008 |
+
label="รวมจำนวน Samples ใหม่ที่คาดว่าจะได้",
|
1009 |
+
value=5,
|
1010 |
+
interactive=False,
|
1011 |
+
info="คำนวณจาก: จำนวน samples เดิม × จำนวนรอบ generate"
|
1012 |
+
)
|
1013 |
|
1014 |
custom_prompt = gr.Textbox(
|
1015 |
label="Custom Prompt (ถ้าเลือก custom)",
|
|
|
1017 |
lines=3,
|
1018 |
visible=False
|
1019 |
)
|
|
|
1020 |
def update_custom_prompt_visibility(gen_type):
    """Reveal the custom-prompt textbox only when generation type is "custom"."""
    is_custom = gen_type == "custom"
    return gr.update(visible=is_custom)
|
1022 |
|
1023 |
+
def update_total_samples_calculation(max_samples, n_gen):
    """Recompute the expected count of newly generated samples.

    Expected total = (original samples used) x (generation rounds per
    sample); pushed into the read-only "total" number widget.
    """
    return gr.update(value=max_samples * n_gen)
|
1026 |
+
|
1027 |
generation_type.change(
|
1028 |
update_custom_prompt_visibility,
|
1029 |
inputs=[generation_type],
|
1030 |
outputs=[custom_prompt]
|
1031 |
)
|
1032 |
+
|
1033 |
+
# อัปเดตการคำนวณจำนวน samples ใหม่
|
1034 |
+
max_samples_to_process.change(
|
1035 |
+
update_total_samples_calculation,
|
1036 |
+
inputs=[max_samples_to_process, n_generate],
|
1037 |
+
outputs=[total_new_samples]
|
1038 |
+
)
|
1039 |
+
|
1040 |
+
n_generate.change(
|
1041 |
+
update_total_samples_calculation,
|
1042 |
+
inputs=[max_samples_to_process, n_generate],
|
1043 |
+
outputs=[total_new_samples]
|
1044 |
+
)
|
1045 |
|
1046 |
with gr.Tab("🔧 Post-processing"):
|
1047 |
with gr.Row():
|
|
|
1057 |
info="สร้างตัวเลือกผิดสำหรับทำ multiple choice"
|
1058 |
)
|
1059 |
|
1060 |
+
with gr.Tab("💾 Export Settings"): export_format = gr.Dropdown(
|
1061 |
+
["csv", "jsonl", "hf_dataset", "parquet", "hf_dataset_parquet"],
|
|
|
1062 |
label="รูปแบบ Export",
|
1063 |
value="csv",
|
1064 |
info="รูปแบบไฟล์ที่ต้องการ export"
|
|
|
1080 |
preview_output = gr.HTML(
|
1081 |
label="ตัวอย่างข้อมูล (10 รายการแรก)"
|
1082 |
)
|
1083 |
+
# Event handlers run_btn.click(
|
|
|
|
|
1084 |
fn=main_workflow,
|
1085 |
inputs=[
|
1086 |
source_type, path_or_name, llm_provider_type, api_key, base_url,
|
1087 |
+
ollama_model, deepseek_model, generation_type, n_generate, max_samples_to_process, custom_prompt, target_language,
|
1088 |
add_multiple_choice, export_format
|
1089 |
],
|
1090 |
outputs=[progress_output, preview_output]
|
1091 |
)
|
|
|
1092 |
clear_btn.click(
|
1093 |
lambda: ("", ""),
|
1094 |
outputs=[progress_output, preview_output]
|
1095 |
)
|
1096 |
|
1097 |
+
# Preview event handlers
|
1098 |
+
preview_btn.click(
|
1099 |
+
fn=preview_data,
|
1100 |
+
inputs=[source_type, path_or_name, file_upload],
|
1101 |
+
outputs=[data_preview, progress_output]
|
1102 |
+
)
|
1103 |
+
|
1104 |
+
file_upload.upload(
|
1105 |
+
fn=update_path_from_file,
|
1106 |
+
inputs=[file_upload],
|
1107 |
+
outputs=[path_or_name]
|
1108 |
+
)
|
1109 |
+
|
1110 |
# ตัวอย่าง dataset schema
|
1111 |
with gr.Tab("📋 ตัวอย่าง Dataset Schema"):
|
1112 |
gr.Markdown("""
|
|
|
1130 |
id,context,question,answer,rationale,category,difficulty,source,language
|
1131 |
1,"นักเรียนคนหนึ่งเห็นเพื่อนทำโกง","ควรรายงานครูหรือไม่","ควรรายงาน","เพื่อความยุติธรรม","การศึกษา","medium","manual","th"
|
1132 |
```
|
1133 |
+
## ตัวอย่างไฟล์ JSONL:
|
|
|
1134 |
```json
|
1135 |
{"id": "1", "context": "นักเรียนคนหนึ่งเห็นเพื่อนทำโกง", "question": "ควรรายงานครูหรือไม่", "answer": "ควรรายงาน", "rationale": "เพื่อความยุติธรรม", "category": "การศึกษา", "difficulty": "medium", "source": "manual", "language": "th"}
|
1136 |
```
|
1137 |
+
|
1138 |
+
## รูปแบบ Export ที่รองรับ:
|
1139 |
+
- **CSV**: ไฟล์ Excel/Spreadsheet ทั่วไป
|
1140 |
+
- **JSONL**: JSON Lines สำหรับ machine learning
|
1141 |
+
- **Parquet**: รูปแบบคอลัมน์ที่มีประสิทธิภาพสูง
|
1142 |
+
- **HF Dataset**: Hugging Face Dataset directory (Arrow format)
|
1143 |
+
- **HF Dataset Parquet**: Hugging Face Dataset เป็น Parquet
|
1144 |
+
|
1145 |
+
## การโหลด Dataset ที่สร้างแล้ว:
|
1146 |
+
- สามารถโหลด output ที่สร้างแล้วกลับมาใช้ได้
|
1147 |
+
- รองรับ `.csv`, `.jsonl`, `.json`, `.parquet` และ HF dataset directories
|
1148 |
+
- ใส่ path ของไฟล์หรือ directory ใน "Path หรือ Dataset Name"
|
1149 |
""")
|
1150 |
|
1151 |
demo.launch()
|
requirements.txt
CHANGED
@@ -3,5 +3,7 @@ pandas>=1.5.0
|
|
3 |
datasets>=2.0.0
|
4 |
pydantic>=2.0.0
|
5 |
requests>=2.28.0
|
6 |
-
|
|
|
7 |
huggingface-hub>=0.16.0
|
|
|
|
3 |
datasets>=2.0.0
|
4 |
pydantic>=2.0.0
|
5 |
requests>=2.28.0
|
6 |
+
transformers>=4.20.0
|
7 |
+
torch>=1.12.0
|
8 |
huggingface-hub>=0.16.0
|
9 |
+
pyarrow>=10.0.0
|