Spaces:

ZombitX64
/

DekData

Running

App Files Files Community

Nattapong Tapachoom committed 11 days ago

Commit

18f1382

1 Parent(s): 861a5b2

Add

Browse files

Files changed (3) hide show

app.py +747 -3
requirements.txt +7 -0
sample_data.csv +6 -0

app.py CHANGED Viewed

@@ -1,7 +1,751 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()

 import gradio as gr
+import os
+import json
+import uuid
+import re
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, ValidationError
+from datasets import load_dataset, Dataset, DatasetDict
+import pandas as pd
+import requests
+from datetime import datetime
+import hashlib
+# 1. Dataset schema: the pydantic model for one question/answer record used by the loading, generation and export code in this file.
+class DataSample(BaseModel):
+    id: str  # unique identifier of the sample
+    context: str  # scenario / background text
+    question: str  # question asked about the context
+    options: Optional[List[str]] = None  # multiple-choice options; None when the sample has none
+    answer: str  # the correct answer
+    rationale: str  # explanation supporting the answer
+    category: str  # topic category
+    difficulty: str  # difficulty level (easy/medium/hard)
+    source: str  # where the sample came from
+    language: str  # language code (th/en/zh/ja)
+# 2. Load dataset (local file หรือ Hugging Face)
+def load_data(source_type, path_or_name):
+    try:
+        if source_type == "local":
+            if not os.path.exists(path_or_name):
+                raise FileNotFoundError(f"ไฟล์ {path_or_name} ไม่พบ")
+            ext = os.path.splitext(path_or_name)[-1].lower()
+            if ext == ".jsonl":
+                data = []
+                with open(path_or_name, 'r', encoding="utf-8") as f:
+                    for line_num, line in enumerate(f, 1):
+                        try:
+                            data.append(json.loads(line.strip()))
+                        except json.JSONDecodeError as e:
+                            print(f"Warning: บรรทัด {line_num} มีข้อผิดพลาด JSON: {e}")
+                            continue
+            elif ext == ".csv":
+                df = pd.read_csv(path_or_name, encoding="utf-8")
+                data = df.to_dict(orient="records")
+            elif ext == ".json":
+                with open(path_or_name, 'r', encoding="utf-8") as f:
+                    raw_data = json.load(f)
+                    data = raw_data if isinstance(raw_data, list) else [raw_data]
+            else:
+                raise ValueError(f"ไม่รองรับไฟล์ประเภท {ext}")
+            # แปลงเป็น DataSample objects
+            samples = []
+            for i, item in enumerate(data):
+                try:
+                    # เติมค่า default ถ้าไม่มี
+                    if 'id' not in item:
+                        item['id'] = str(uuid.uuid4())
+                    if 'source' not in item:
+                        item['source'] = f"local_{os.path.basename(path_or_name)}"
+                    if 'difficulty' not in item:
+                        item['difficulty'] = "medium"
+                    if 'language' not in item:
+                        item['language'] = "th"
+                    samples.append(DataSample(**item))
+                except ValidationError as e:
+                    print(f"Warning: รายการที่ {i+1} ข้อมูลไม่ถูกต้อง: {e}")
+                    continue
+            return samples
+        elif source_type == "hf":
+            try:
+                ds = load_dataset(path_or_name)
+                # หา split ที่มีข้อมูล
+                available_splits = list(ds.keys())
+                if not available_splits:
+                    raise ValueError("ไม่พบข้อมูลใน dataset")
+                # ใช้ split แรกที่มีข้อมูล
+                split_name = available_splits[0]
+                data = ds[split_name]
+                samples = []
+                for i, item in enumerate(data):
+                    try:
+                        # แปลง HF format เป็น DataSample
+                        sample_dict = dict(item)
+                        # เติมค่า default
+                        if 'id' not in sample_dict:
+                            sample_dict['id'] = f"hf_{i}"
+                        if 'source' not in sample_dict:
+                            sample_dict['source'] = f"hf_{path_or_name}"
+                        if 'difficulty' not in sample_dict:
+                            sample_dict['difficulty'] = "medium"
+                        if 'language' not in sample_dict:
+                            sample_dict['language'] = "en"
+                        samples.append(DataSample(**sample_dict))
+                    except ValidationError as e:
+                        print(f"Warning: รายการที่ {i+1} จาก HF ข้อมูลไม่ถูกต้อง: {e}")
+                        continue
+                return samples
+            except Exception as e:
+                raise ValueError(f"ไม่สามารถโหลด HF dataset '{path_or_name}': {e}")
+        else:
+            raise ValueError("source_type ต้องเป็น 'local' หรือ 'hf'")
+    except Exception as e:
+        raise Exception(f"ข้อผิดพลาดในการโหลดข้อมูล: {e}")
+# 3. LLM API Integration (รองรับหลาย provider)
+class LLMProvider:
+    def __init__(self, provider="ollama", api_key=None, base_url="http://localhost:11434"):
+        self.provider = provider
+        self.api_key = api_key
+        self.base_url = base_url
+    def generate(self, prompt, model="llama3.2", temperature=0.7, max_tokens=1000):
+        try:
+            if self.provider == "ollama":
+                return self._generate_ollama(prompt, model, temperature, max_tokens)
+            elif self.provider == "openai":
+                return self._generate_openai(prompt, model, temperature, max_tokens)
+            elif self.provider == "huggingface":
+                return self._generate_huggingface(prompt, model, temperature, max_tokens)
+            else:
+                raise ValueError(f"ไม่รองรับ provider: {self.provider}")
+        except Exception as e:
+            return f"Error generating response: {e}"
+    def _generate_ollama(self, prompt, model, temperature, max_tokens):
+        response = requests.post(
+            f"{self.base_url}/api/generate",
+            json={
+                "model": model,
+                "prompt": prompt,
+                "stream": False,
+                "options": {
+                    "temperature": temperature,
+                    "num_predict": max_tokens
+                }
+            }
+        )
+        response.raise_for_status()
+        return response.json()["response"]
+    def _generate_openai(self, prompt, model, temperature, max_tokens):
+        import openai
+        if self.api_key:
+            openai.api_key = self.api_key
+        response = openai.ChatCompletion.create(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=temperature,
+            max_tokens=max_tokens
+        )
+        return response.choices[0].message.content
+    def _generate_huggingface(self, prompt, model, temperature, max_tokens):
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        response = requests.post(
+            f"https://api-inference.huggingface.co/models/{model}",
+            headers=headers,
+            json={
+                "inputs": prompt,
+                "parameters": {
+                    "temperature": temperature,
+                    "max_new_tokens": max_tokens
+                }
+            }
+        )
+        response.raise_for_status()
+        result = response.json()
+        if isinstance(result, list) and len(result) > 0:
+            return result[0].get("generated_text", "").replace(prompt, "").strip()
+        return str(result)
+# 4. Dataset Generation & Augmentation
+def generate_new_samples(samples: List[DataSample], llm_provider: LLMProvider,
+                        generation_type="augment", n_generate=1, custom_prompt=""):
+    """
+    generation_type: 'augment', 'roleplay', 'topic_conditioning', 'self_critique'
+    """
+    generated_samples = []
+    for sample in samples[:5]:  # จำกัดแค่ 5 samples แรกเพื่อทดสอบ
+        for _ in range(n_generate):
+            try:
+                if generation_type == "augment":
+                    prompt = f"""
+Based on this context and question, create a similar but different scenario:
+Context: {sample.context}
+Question: {sample.question}
+Answer: {sample.answer}
+Rationale: {sample.rationale}
+Generate a new scenario in the same category ({sample.category}) with:
+- Different context but similar moral/logical challenge
+- Appropriate question
+- Clear answer
+- Detailed rationale
+Format as JSON:
+{{
+    "context": "new context here",
+    "question": "new question here",
+    "answer": "new answer here",
+    "rationale": "detailed reasoning here"
+}}"""
+                elif generation_type == "roleplay":
+                    roles = ["ครูใหญ่", "หมอ", "นักบวช", "นักจิตวิทยา", "ผู้ปกครอง"]
+                    role = roles[len(generated_samples) % len(roles)]
+                    prompt = f"""
+คุณคือ{role} กำลังให้คำแนะนำเกี่ยวกับสถานการณ์นี้:
+Context: {sample.context}
+Question: {sample.question}
+ในฐานะ{role} จงสร้างคำตอบและเหตุผลที่เหมาะสมจากมุมมองของบทบาทนี้
+Format as JSON:
+{{
+    "context": "{sample.context}",
+    "question": "{sample.question}",
+    "answer": "คำตอบในฐานะ{role}",
+    "rationale": "เหตุผลจากมุมมอง{role}"
+}}"""
+                elif generation_type == "topic_conditioning":
+                    topics = ["ปัญหาวัยรุ่น", "ความยากจน", "เทคโนโลยี", "สิ่งแวดล้อม", "คร���บครัว"]
+                    topic = topics[len(generated_samples) % len(topics)]
+                    prompt = f"""
+สร้างสถานการณ์ใหม่ในหัวข้อ "{topic}" ที่มีความซับซ้อนทางจริยธรรมคล้ายกับ:
+Original context: {sample.context}
+Category: {sample.category}
+สร้างสถานการณ์ใหม่ที่เกี่ยวข้องกับ{topic}:
+Format as JSON:
+{{
+    "context": "สถานการณ์เกี่ยวกับ{topic}",
+    "question": "คำถามที่เหมาะสม",
+    "answer": "คำตอบที่ดีที่สุด",
+    "rationale": "เหตุผลโดยละเอียด"
+}}"""
+                elif generation_type == "self_critique":
+                    prompt = f"""
+Analyze and improve this moral reasoning scenario:
+Context: {sample.context}
+Question: {sample.question}
+Answer: {sample.answer}
+Rationale: {sample.rationale}
+1. First, critique the reasoning - what could be improved?
+2. Then provide an enhanced version with better rationale
+Format as JSON:
+{{
+    "context": "{sample.context}",
+    "question": "{sample.question}",
+    "answer": "improved answer",
+    "rationale": "enhanced rationale with deeper analysis"
+}}"""
+                else:  # custom prompt
+                    prompt = custom_prompt.format(**sample.dict())
+                # Generate ด้วย LLM
+                response = llm_provider.generate(prompt)
+                # Parse JSON response
+                try:
+                    # ลองหา JSON ใน response
+                    json_match = re.search(r'\{.*\}', response, re.DOTALL)
+                    if json_match:
+                        json_str = json_match.group()
+                        parsed_data = json.loads(json_str)
+                        # สร้าง DataSample ใหม่
+                        new_sample = DataSample(
+                            id=str(uuid.uuid4()),
+                            context=parsed_data.get("context", sample.context),
+                            question=parsed_data.get("question", sample.question),
+                            answer=parsed_data.get("answer", sample.answer),
+                            rationale=parsed_data.get("rationale", sample.rationale),
+                            category=sample.category,
+                            difficulty=sample.difficulty,
+                            source=f"generated_{generation_type}",
+                            language=sample.language,
+                            options=sample.options
+                        )
+                        generated_samples.append(new_sample)
+                except (json.JSONDecodeError, KeyError) as e:
+                    print(f"Warning: ไม่สามารถ parse JSON response: {e}")
+                    continue
+            except Exception as e:
+                print(f"Warning: ไม่สามารถ generate sample: {e}")
+                continue
+    return generated_samples
+# 5. Post-processing & Filtering
+def remove_duplicates(samples: List[DataSample]) -> List[DataSample]:
+    """Remove duplicate samples based on context and question"""
+    seen = set()
+    unique = []
+    for s in samples:
+        # สร้าง hash จาก context + question
+        content_hash = hashlib.md5(f"{s.context.lower().strip()}{s.question.lower().strip()}".encode()).hexdigest()
+        if content_hash not in seen:
+            unique.append(s)
+            seen.add(content_hash)
+    return unique
+def syntax_check(samples: List[DataSample]) -> List[DataSample]:
+    """Check for basic syntax issues and filter out problematic samples"""
+    valid_samples = []
+    for s in samples:
+        # Check ว่ามีเนื้อหาครบถ้วน
+        if (len(s.context.strip()) < 10 or
+            len(s.question.strip()) < 5 or
+            len(s.answer.strip()) < 3 or
+            len(s.rationale.strip()) < 10):
+            continue
+        # Check ว่าไม่มี placeholder text
+        placeholder_texts = ["[ใส่ข้อความ]", "TODO", "xxx", "example", "sample"]
+        has_placeholder = any(placeholder in s.context.lower() or
+                            placeholder in s.question.lower() or
+                            placeholder in s.answer.lower() or
+                            placeholder in s.rationale.lower()
+                            for placeholder in placeholder_texts)
+        if has_placeholder:
+            continue
+        valid_samples.append(s)
+    return valid_samples
+def difficulty_assessment(samples: List[DataSample]) -> List[DataSample]:
+    """Assess and update difficulty based on heuristics"""
+    for sample in samples:
+        # Heuristic based on token count and complexity
+        total_tokens = len(sample.context.split()) + len(sample.question.split()) + len(sample.rationale.split())
+        # Count complexity indicators
+        complexity_indicators = [
+            "ถ้า", "แต่", "อย่างไรก็ตาม", "ในขณะที่", "แม้ว่า",
+            "เนื่องจาก", "ดังนั้น", "เพราะว่า", "หากว่า", "เว้นแต่"
+        ]
+        complexity_count = sum(1 for indicator in complexity_indicators
+                             if indicator in sample.context or indicator in sample.rationale)
+        # Assess difficulty
+        if total_tokens < 50 and complexity_count < 2:
+            sample.difficulty = "easy"
+        elif total_tokens > 150 or complexity_count > 4:
+            sample.difficulty = "hard"
+        else:
+            sample.difficulty = "medium"
+    return samples
+def translate_to_multilingual(samples: List[DataSample], llm_provider: LLMProvider, target_lang="en") -> List[DataSample]:
+    """Translate samples to target language"""
+    translated = []
+    for sample in samples[:3]:  # จำกัดเพื่อทดสอบ
+        if sample.language == target_lang:
+            continue
+        try:
+            prompt = f"""
+Translate this moral reasoning scenario to {target_lang}:
+Context: {sample.context}
+Question: {sample.question}
+Answer: {sample.answer}
+Rationale: {sample.rationale}
+Maintain the moral and cultural context appropriately.
+Format as JSON:
+{{
+    "context": "translated context",
+    "question": "translated question",
+    "answer": "translated answer",
+    "rationale": "translated rationale"
+}}"""
+            response = llm_provider.generate(prompt)
+            # Parse JSON
+            json_match = re.search(r'\{.*\}', response, re.DOTALL)
+            if json_match:
+                parsed_data = json.loads(json_match.group())
+                translated_sample = DataSample(
+                    id=f"{sample.id}_{target_lang}",
+                    context=parsed_data["context"],
+                    question=parsed_data["question"],
+                    answer=parsed_data["answer"],
+                    rationale=parsed_data["rationale"],
+                    category=sample.category,
+                    difficulty=sample.difficulty,
+                    source=f"{sample.source}_translated",
+                    language=target_lang,
+                    options=sample.options
+                )
+                translated.append(translated_sample)
+        except Exception as e:
+            print(f"Warning: ไม่สามารถแปลภาษา sample {sample.id}: {e}")
+            continue
+    return translated
+def add_multiple_choice_options(samples: List[DataSample], llm_provider: LLMProvider) -> List[DataSample]:
+    """Add multiple choice options to samples"""
+    for sample in samples[:3]:  # จำกัดเพื่อทดสอบ
+        if sample.options:  # มี options อยู่แล้ว
+            continue
+        try:
+            prompt = f"""
+Create 4 multiple choice options for this scenario, with one correct answer:
+Context: {sample.context}
+Question: {sample.question}
+Correct Answer: {sample.answer}
+Generate 3 plausible but incorrect options and include the correct answer.
+Format as JSON array:
+["option A", "option B", "option C", "option D"]
+Make sure the correct answer ({sample.answer}) is included as one of the options.
+"""
+            response = llm_provider.generate(prompt)
+            # Parse JSON array
+            json_match = re.search(r'\[.*\]', response, re.DOTALL)
+            if json_match:
+                options = json.loads(json_match.group())
+                if len(options) == 4:
+                    sample.options = options
+        except Exception as e:
+            print(f"Warning: ไม่สามารถสร้าง multiple choice สำหรับ {sample.id}: {e}")
+            continue
+    return samples
+# 6. Export & Visualization
+def export_dataset(samples: List[DataSample], format_type="csv", output_path="output"):
+    """Export dataset ในรูปแบบต่างๆ"""
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    if format_type == "csv":
+        df = pd.DataFrame([s.dict() for s in samples])
+        filename = f"{output_path}_{timestamp}.csv"
+        df.to_csv(filename, index=False, encoding="utf-8-sig")
+        return filename
+    elif format_type == "jsonl":
+        filename = f"{output_path}_{timestamp}.jsonl"
+        with open(filename, 'w', encoding="utf-8") as f:
+            for sample in samples:
+                f.write(json.dumps(sample.dict(), ensure_ascii=False) + "\n")
+        return filename
+    elif format_type == "hf_dataset":
+        # Create Hugging Face Dataset
+        data_dict = {key: [] for key in samples[0].dict().keys()}
+        for sample in samples:
+            sample_dict = sample.dict()
+            for key, value in sample_dict.items():
+                data_dict[key].append(value)
+        dataset = Dataset.from_dict(data_dict)
+        dirname = f"{output_path}_hf_{timestamp}"
+        dataset.save_to_disk(dirname)
+        return dirname
+    else:
+        raise ValueError(f"ไม่รองรับรูปแบบ: {format_type}")
+def get_dataset_stats(samples: List[DataSample]) -> Dict[str, Any]:
+    """สถิติของ dataset"""
+    if not samples:
+        return {"total": 0}
+    df = pd.DataFrame([s.dict() for s in samples])
+    stats = {
+        "total": len(samples),
+        "categories": df["category"].value_counts().to_dict(),
+        "difficulties": df["difficulty"].value_counts().to_dict(),
+        "languages": df["language"].value_counts().to_dict(),
+        "sources": df["source"].value_counts().to_dict(),
+        "avg_context_length": df["context"].str.len().mean(),
+        "avg_question_length": df["question"].str.len().mean(),
+        "avg_answer_length": df["answer"].str.len().mean(),
+        "avg_rationale_length": df["rationale"].str.len().mean(),
+        "with_options": sum(1 for s in samples if s.options is not None)
+    }
+    return stats
+# 7. Main Workflow Function
+def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_url,
+                 generation_type, n_generate, custom_prompt, target_language,
+                 add_multiple_choice, export_format):
+    try:
+        progress_text = "เริ่มต้น workflow...\n"
+        # 1. Load dataset
+        progress_text += "📂 กำลังโหลด dataset...\n"
+        samples = load_data(source_type, path_or_name)
+        progress_text += f"✅ โหลดสำเร็จ {len(samples)} samples\n"
+        # 2. Setup LLM
+        progress_text += f"🤖 กำลังตั้งค่า LLM ({llm_provider_type})...\n"
+        llm_provider = LLMProvider(
+            provider=llm_provider_type,
+            api_key=api_key if api_key else None,
+            base_url=base_url if base_url else "http://localhost:11434"
+        )
+        # 3. Generate new samples
+        if n_generate > 0:
+            progress_text += f"✨ กำลัง generate {n_generate} samples ใหม่ ({generation_type})...\n"
+            new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt)
+            samples.extend(new_samples)
+            progress_text += f"✅ Generate สำเร็จ {len(new_samples)} samples ใหม่\n"
+        # 4. Post-processing
+        progress_text += "🔧 กำลัง post-process...\n"
+        original_count = len(samples)
+        samples = remove_duplicates(samples)
+        progress_text += f"  - ลบ duplicate: {original_count} -> {len(samples)}\n"
+        samples = syntax_check(samples)
+        progress_text += f"  - syntax check: {len(samples)} samples ผ่าน\n"
+        samples = difficulty_assessment(samples)
+        progress_text += f"  - ประเมิน difficulty เสร็จสิ้น\n"
+        # 5. Translation
+        if target_language and target_language != "none":
+            progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
+            translated = translate_to_multilingual(samples, llm_provider, target_language)
+            samples.extend(translated)
+            progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
+        # 6. Add multiple choice
+        if add_multiple_choice:
+            progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
+            samples = add_multiple_choice_options(samples, llm_provider)
+            progress_text += "✅ เพิ่ม multiple choice เสร็จสิ้น\n"
+        # 7. Export
+        progress_text += f"💾 กำลัง export เป็น {export_format}...\n"
+        output_file = export_dataset(samples, export_format)
+        progress_text += f"✅ Export สำเร็จ: {output_file}\n"
+        # 8. Stats
+        stats = get_dataset_stats(samples)
+        progress_text += "\n📊 สถิติ Dataset:\n"
+        progress_text += f"  - จำนวนทั้งหมด: {stats['total']}\n"
+        progress_text += f"  - Categories: {stats['categories']}\n"
+        progress_text += f"  - Difficulties: {stats['difficulties']}\n"
+        progress_text += f"  - Languages: {stats['languages']}\n"
+        progress_text += f"  - มี Multiple Choice: {stats['with_options']}\n"
+        return progress_text, pd.DataFrame([s.dict() for s in samples]).head(10).to_html()
+    except Exception as e:
+        error_text = f"❌ เกิดข้อผิดพลาด: {str(e)}"
+        return error_text, ""
+# 8. Gradio interface: builds the Blocks UI (inputs grouped into tabs) and wires the two buttons to main_workflow and to a reset of the outputs.
+with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🤖 ระบบ Generate Dataset จากโมเดล AI")
+    gr.Markdown("ระบบสำหรับสร้าง, ขยาย, และประมวลผล dataset ด้วย AI models")
+    with gr.Tab("📂 Dataset Input"):
+        with gr.Row():
+            source_type = gr.Radio(
+                ["local", "hf"],  # the two source_type values load_data accepts
+                label="ประเภทแหล่งข้อมูล",
+                info="local = ไฟล์ในเครื่อง, hf = Hugging Face dataset",
+                value="local"
+            )
+            path_or_name = gr.Textbox(
+                label="Path หรือ Dataset Name",
+                placeholder="เช่น data.csv หรือ microsoft/DialoGPT-medium",
+                info="สำหรับ local: ใส่ path ไฟล์ (.csv, .jsonl, .json) / สำหรับ HF: ใส่ชื่อ dataset"
+            )
+    with gr.Tab("🤖 LLM Settings"):
+        with gr.Row():
+            llm_provider_type = gr.Dropdown(
+                ["ollama", "openai", "huggingface"],  # the providers LLMProvider.generate dispatches on
+                label="LLM Provider",
+                value="ollama",
+                info="เลือกผู้ให้บริการ LLM"
+            )
+            api_key = gr.Textbox(
+                label="API Key (ถ้าจำเป็น)",
+                type="password",
+                placeholder="สำหรับ OpenAI หรือ HuggingFace"
+            )
+            base_url = gr.Textbox(
+                label="Base URL",
+                value="http://localhost:11434",
+                info="สำหรับ Ollama หรือ local LLM server"
+            )
+    with gr.Tab("✨ Generation Settings"):
+        with gr.Row():
+            generation_type = gr.Dropdown(
+                ["augment", "roleplay", "topic_conditioning", "self_critique", "custom"],
+                label="ประเภทการ Generate",
+                value="augment",
+                info="วิธีการสร้างข้อมูลใหม่"
+            )
+            n_generate = gr.Slider(
+                1, 5, value=1, step=1,  # range 1-5 in steps of 1, starting at 1
+                label="จำนวนรอบ Generate",
+                info="จำนวน samples ใหม่ที่จะสร้างต่อ original sample"
+            )
+        custom_prompt = gr.Textbox(
+            label="Custom Prompt (ถ้าเลือก custom)",
+            placeholder="ใช้ {context}, {question}, {answer} เป็น placeholder",
+            lines=3,
+            visible=False  # hidden initially; the change handler below shows it for "custom"
+        )
+        def update_custom_prompt_visibility(gen_type):  # show the custom prompt box only for the "custom" type
+            return gr.update(visible=(gen_type == "custom"))
+        generation_type.change(
+            update_custom_prompt_visibility,
+            inputs=[generation_type],
+            outputs=[custom_prompt]
+        )
+    with gr.Tab("🔧 Post-processing"):
+        with gr.Row():
+            target_language = gr.Dropdown(
+                ["none", "en", "th", "zh", "ja"],  # main_workflow skips translation for "none"
+                label="แปลภาษา",
+                value="none",
+                info="แปลเป็นภาษาเป้าหมาย (none = ไม่แปล)"
+            )
+            add_multiple_choice = gr.Checkbox(
+                label="เพิ่ม Multiple Choice Options",
+                value=False,
+                info="สร้างตัวเลือกผิดสำหรับทำ multiple choice"
+            )
+    with gr.Tab("💾 Export Settings"):
+        export_format = gr.Dropdown(
+            ["csv", "jsonl", "hf_dataset"],  # the format_type values export_dataset supports
+            label="รูปแบบ Export",
+            value="csv",
+            info="รูปแบบไฟล์ที่ต้องการ export"
+        )
+    with gr.Row():
+        run_btn = gr.Button("🚀 เริ่มต้น Workflow", variant="primary", size="lg")
+        clear_btn = gr.Button("🗑️ ล้างข้อมูล", variant="secondary")
+    with gr.Tab("📊 ผลลัพธ์"):
+        progress_output = gr.Textbox(
+            label="สถานะ",
+            lines=15,
+            max_lines=20,
+            interactive=False,
+            show_copy_button=True
+        )
+        preview_output = gr.HTML(
+            label="ตัวอย่างข้อมูล (10 รายการแรก)"
+        )
+    # Event handlers: the run button passes the inputs in main_workflow's parameter order and shows its two return values.
+    run_btn.click(
+        fn=main_workflow,
+        inputs=[
+            source_type, path_or_name, llm_provider_type, api_key, base_url,
+            generation_type, n_generate, custom_prompt, target_language,
+            add_multiple_choice, export_format
+        ],
+        outputs=[progress_output, preview_output]
+    )
+    clear_btn.click(
+        lambda: ("", ""),  # reset both outputs to empty strings
+        outputs=[progress_output, preview_output]
+    )
+    # Example dataset schema
+    with gr.Tab("📋 ตัวอย่าง Dataset Schema"):
+        gr.Markdown("""
+        ## Schema ของ Dataset
+        | Field | ประเภท | อธิบาย |
+        |-------|--------|--------|
+        | id | string | รหัสเฉพาะของ sample |
+        | context | string | บริบท/สถานการณ์ |
+        | question | string | คำถาม |
+        | options | list | ตัวเลือก (สำหรับ multiple choice) |
+        | answer | string | คำตอบที่ถูกต้อง |
+        | rationale | string | เหตุผล/คำอธิบาย |
+        | category | string | หมวดหมู่ |
+        | difficulty | string | ระดับความยาก (easy/medium/hard) |
+        | source | string | แหล่งที่มาของข้อมูล |
+        | language | string | ภาษา (th/en/zh/ja) |
+        ## ตัวอย่างไฟล์ CSV:
+        ```csv
+        id,context,question,answer,rationale,category,difficulty,source,language
+        1,"นักเรียนคนหนึ่งเห็นเพื่อนทำโกง","ควรรายงานครูหรือไม่","ควรรายงาน","เพื่อความยุติธรรม","การศึกษา","medium","manual","th"
+        ```
+        ## ตัวอย่างไฟล์ JSONL:
+        ```json
+        {"id": "1", "context": "นักเรียนคนหนึ่งเห็นเพื่อนทำโกง", "question": "ควรรายงานครูหรือไม่", "answer": "ควรรายงาน", "rationale": "เพื่อความยุติธรรม", "category": "การศึกษา", "difficulty": "medium", "source": "manual", "language": "th"}
+        ```
+        """)
 demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio>=4.0.0
+pandas>=1.5.0
+datasets>=2.0.0
+pydantic>=2.0.0
+requests>=2.28.0
+openai>=1.0.0
+huggingface-hub>=0.16.0

sample_data.csv ADDED Viewed

	@@ -0,0 +1,6 @@

+id,context,question,answer,rationale,category,difficulty,source,language
+1,"นักเรียนมัธยมคนหนึ่งเห็นเพื่อนสนิทกำลังลอกการบ้านจากเพื่อนคนอื่น ก่อนที่จะส่งครู","ควรแจ้งครูเรื่องการลอกการบ้านหรือไม่","ควรคุยกับเพื่อนก่อน แล้วค่อยพิจารณาแจ้งครูถ้าไม่หยุด","การคุยกับเพื่อนก่อนจะช่วยให้เขามีโอกาสแก้ไขตัวเอง และรักษาความสัมพันธ์มิตรภาพไว้ได้","การศึกษา","medium","manual","th"
+2,"พนักงานคนหนึ่งพบว่าหัวหน้างานมีการทุจริตโดยการเบิกเงินเท็จ","ควรรายงานการทุจริตนี้หรือไม่","ควรรายงานผ่านช่องทางที่เหมาะสม","การทุจริตส่งผลเสียต่อองค์กรและสังคม การรายงานเป็นหน้าที่ของพลเมืองดี","การทำงาน","hard","manual","th"
+3,"ครอบครัวหนึ่งมีปัญหาทางการเงิน ลูกคิดจะหยุดเรียนเพื่อไปทำงานช่วยครอบครัว","ลูกควรหยุดเรียนเพื่อทำงานหรือไม่","ไม่ควร ควรหาทางออกอื่น เช่น ขอทุนการศึกษา","การศึกษาเป็นรากฐานสำคัญของอนาคต ควรหาวิธีแก้ปัญหาการเงินโดยไม่ต้องเสียโอกาสทางการศึกษา","ครอบครัว","medium","manual","th"
+4,"นักท่องเที่ยวเห็นคนท้องถิ่นทิ้งขยะลงในแม่น้ำ","ควรไปตักเตือนหรือไม่","ควรตักเตือนอย่างสุภาพและให้ความรู้","การรักษาสิ่งแวดล้อมเป็นหน้าที่ของทุกคน การให้ความรู้อย่างสุภาพจะมีประสิทธิภาพมากกว่าการตำหนิ","สิ่งแวดล้อม","easy","manual","th"
+5,"หมอพบว่าผู้ป่วยมีโรคร้ายแรง แต่ผู้ป่วยไม่อยากให้ครอบครัวรู้","ควรบอกความจริงกับครอบครัวหรือไม่","ควรเคารพความประสงค์ของผู้ป่วย แต่แนะนำให้เล่าเอง","การเคารพสิทธิส่วนบุคคลของผู้ป่วยเป็นสิ่งสำคัญ แต่ควรให้คำปรึกษาเพื่อประโยชน์ในการรักษา","การแพทย์","hard","manual","th"