# Hugging Face Space: ZombitX64/DekData (status: Running)
# File size: 61,038 bytes

import gradio as gr
import os
import json
import uuid
import re
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, ValidationError
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import requests
from datetime import datetime
import hashlib
# Hugging Face: ฟังก์ชันดาวน์โหลดโมเดลตามชื่อ
def download_hf_model(model_name, output_dir=None, hf_token=None):
    """
    Fetch a Hugging Face causal-LM checkpoint and its tokenizer by name.

    ``output_dir`` (when truthy) is forwarded as ``cache_dir`` and
    ``hf_token`` (when truthy) as ``token`` to both ``from_pretrained``
    calls.  The function never raises: it returns a status string describing
    success, or the caught exception on failure.
    """
    try:
        # Imported lazily so the rest of the module works without transformers.
        from transformers import AutoModelForCausalLM, AutoTokenizer

        optional_args = (("cache_dir", output_dir), ("token", hf_token))
        load_options = {name: value for name, value in optional_args if value}

        # Tokenizer first, then the model weights (same order as before).
        for loader in (AutoTokenizer, AutoModelForCausalLM):
            loader.from_pretrained(model_name, **load_options)

        destination = output_dir if output_dir else '[default cache]'
        return f"✅ ดาวน์โหลดโมเดล {model_name} สำเร็จที่ {destination}\n\nหากโมเดลเป็น private หรือ restricted กรุณาใส่ Hugging Face token ให้ถูกต้องด้วย"
    except Exception as e:
        return f"❌ ดาวน์โหลดโมเดล {model_name} ไม่สำเร็จ: {e}"
# Ollama: ดึงรายชื่อโมเดล
def get_ollama_models(base_url="http://localhost:11434"):
    """List the model names reported by an Ollama server's ``/api/tags`` endpoint.

    Returns an empty list (after printing a message) when the request or the
    response parsing fails for any reason.

    NOTE: this file defines ``get_ollama_models`` a second time further down;
    that later definition is the one bound to the name once the module has
    been fully executed.
    """
    try:
        reply = requests.get(f"{base_url}/api/tags", timeout=5)
        reply.raise_for_status()
        payload = reply.json()
        names = []
        for entry in payload.get("models", []):
            names.append(entry["name"])
        return names
    except Exception as e:
        print(f"ไม่สามารถดึงรายชื่อ Ollama models: {e}")
        return []

# 1. Dataset Schema
class DataSample(BaseModel):
    """One question/answer record of the dataset, validated by pydantic.

    Field order matters: ``model_dump()`` output (and therefore exported
    column order) follows the declaration order below.
    """
    # Unique identifier; the loaders fill in a UUID or "hf_<index>" when absent.
    id: str
    # Scenario / background text the question refers to.
    context: str
    # The question asked about the context.
    question: str
    # Optional list of answer choices; defaults to None when no choices exist.
    options: Optional[List[str]] = None
    # The expected answer.
    answer: str
    # Explanation of why the answer is correct.
    rationale: str
    # Category label; generated and translated samples inherit it unchanged.
    category: str
    # Difficulty label; difficulty_assessment() writes "easy"/"medium"/"hard".
    difficulty: str
    # Provenance tag, e.g. "local_<file>", "hf_<name>", "generated_<type>".
    source: str
    # Language code such as "th" or "en".
    language: str

# 2. Load dataset (local file หรือ Hugging Face)
def load_data(source_type, path_or_name):
    """Load a dataset and convert its records into ``DataSample`` objects.

    Args:
        source_type: ``"local"`` to read from disk, or ``"hf"`` to fetch a
            dataset by name through ``datasets.load_dataset``.
        path_or_name: For ``"local"``: path to a ``.jsonl``, ``.csv``,
            ``.json`` or ``.parquet`` file, or a directory written by
            ``Dataset.save_to_disk``.  For ``"hf"``: the dataset name.

    Returns:
        A list of ``DataSample``.  Records that cannot be converted are
        skipped with a printed warning; missing fields are filled with
        defaults first (local sources get more defaults than HF ones).

    Raises:
        Exception: on any failure.  The message is prefixed with a Thai
            "error while loading data" text and the original exception is
            attached as ``__cause__``.
    """

    def read_local_records(path):
        """Return the raw records stored at a local file or directory."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"ไฟล์ {path} ไม่พบ")

        ext = os.path.splitext(path)[-1].lower()
        if ext == ".jsonl":
            records = []
            with open(path, 'r', encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        # Blank lines (e.g. a trailing newline) are not errors.
                        continue
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        print(f"Warning: บรรทัด {line_num} มีข้อผิดพลาด JSON: {e}")
            return records
        if ext == ".csv":
            return pd.read_csv(path, encoding="utf-8").to_dict(orient="records")
        if ext == ".json":
            with open(path, 'r', encoding="utf-8") as f:
                raw_data = json.load(f)
            # A single JSON object is treated as a one-record dataset.
            return raw_data if isinstance(raw_data, list) else [raw_data]
        if ext == ".parquet":
            return pd.read_parquet(path).to_dict(orient="records")
        if os.path.isdir(path):
            # Dataset directory previously written with save_to_disk().
            try:
                return [dict(item) for item in Dataset.load_from_disk(path)]
            except Exception as e:
                raise ValueError(f"ไม่สามารถโหลด HF dataset จาก {path}: {e}") from e
        raise ValueError(f"ไม่รองรับไฟล์ประเภท {ext}")

    def fill_local_defaults(item, index):
        """Copy a local record and fill in every field DataSample requires."""
        mapped = dict(item)
        if 'context' not in mapped:
            mapped['context'] = mapped.get('subject', '') or mapped.get('title', '') or ''
        if 'category' not in mapped:
            mapped['category'] = str(mapped.get('grade', ''))
        for key in ('question', 'answer', 'rationale'):
            mapped.setdefault(key, '')
        mapped.setdefault('options', None)
        # Empty CSV/parquet cells arrive as float NaN, which is not a valid
        # value for the optional list field; treat it as "no options".
        options = mapped['options']
        if isinstance(options, float) and options != options:
            mapped['options'] = None
        if 'id' not in mapped:
            mapped['id'] = str(uuid.uuid4())
        mapped.setdefault('source', f"local_{os.path.basename(path_or_name)}")
        mapped.setdefault('difficulty', "medium")
        mapped.setdefault('language', "th")
        return mapped

    def fill_hf_defaults(item, index):
        """Copy a Hugging Face record and fill in the bookkeeping fields."""
        sample_dict = dict(item)
        sample_dict.setdefault('id', f"hf_{index}")
        sample_dict.setdefault('source', f"hf_{path_or_name}")
        sample_dict.setdefault('difficulty', "medium")
        sample_dict.setdefault('language', "en")
        return sample_dict

    def build_samples(records, fill_defaults, warning_label):
        """Validate records one by one, skipping (and reporting) bad ones."""
        samples = []
        for i, item in enumerate(records):
            try:
                samples.append(DataSample(**fill_defaults(item, i)))
            except (ValidationError, TypeError, ValueError) as e:
                # TypeError/ValueError cover records that are not mappings
                # (e.g. a JSON list of strings); one bad record must not
                # abort the whole load.
                print(f"Warning: รายการที่ {i+1}{warning_label} ข้อมูลไม่ถูกต้อง: {e}")
        return samples

    try:
        if source_type == "local":
            return build_samples(read_local_records(path_or_name), fill_local_defaults, "")

        elif source_type == "hf":
            try:
                ds = load_dataset(path_or_name)
                available_splits = list(ds.keys())
                if not available_splits:
                    raise ValueError("ไม่พบข้อมูลใน dataset")

                # Only the first available split is used.
                return build_samples(ds[available_splits[0]], fill_hf_defaults, " จาก HF")

            except Exception as e:
                raise ValueError(f"ไม่สามารถโหลด HF dataset '{path_or_name}': {e}") from e
        else:
            raise ValueError("source_type ต้องเป็น 'local' หรือ 'hf'")

    except Exception as e:
        # Keep the original exception reachable for debugging.
        raise Exception(f"ข้อผิดพลาดในการโหลดข้อมูล: {e}") from e

# 3. LLM API Integration (รองรับหลาย provider)
def get_ollama_models(base_url="http://localhost:11434", timeout=5):
    """Return the model names available on an Ollama server.

    Args:
        base_url: Root URL of the Ollama HTTP API.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout an unresponsive server would block the caller forever.

    Returns:
        The names listed by ``GET {base_url}/api/tags``.  If the server
        reports no models, ``["llama3.2"]`` is returned; if the request or
        parsing fails, a warning is printed and a fixed list of default
        model names is returned instead.
    """
    try:
        response = requests.get(f"{base_url}/api/tags", timeout=timeout)
        response.raise_for_status()
        data = response.json()
        models = [model["name"] for model in data.get("models", [])]
        return models if models else ["llama3.2"]  # fallback
    except Exception as e:
        print(f"Warning: ไม่สามารถดึงรายชื่อ models จาก Ollama: {e}")
        return ["llama3.2", "llama3.1", "gemma2", "qwen2.5"]  # default models

class LLMProvider:
    """Client that sends a prompt to one of several text-generation backends.

    Supported ``provider`` values are ``"ollama"``, ``"deepseek"``,
    ``"huggingface"`` (hosted inference API) and ``"hf_local"`` (a
    transformers model loaded in-process).
    """

    # Default (connect, read) timeout in seconds for the HTTP backends.  The
    # read timeout is generous because generation can be slow, but a dead
    # server should not block the caller forever.
    timeout = (10, 600)

    def __init__(self, provider="ollama", api_key=None, base_url="http://localhost:11434", timeout=None):
        """Store the backend configuration.

        Args:
            provider: Backend name (see class docstring).
            api_key: Bearer token for the DeepSeek / Hugging Face APIs.
            base_url: Root URL of the Ollama server.
            timeout: Optional override for the HTTP timeout (anything
                ``requests`` accepts); ``None`` keeps the class default.
        """
        self.provider = provider
        self.api_key = api_key
        self.base_url = base_url
        if timeout is not None:
            self.timeout = timeout

    def generate(self, prompt, model="llama3.2", temperature=0.7, max_tokens=1000):
        """Generate text for ``prompt`` with the configured backend.

        Never raises: any failure (including an unsupported provider) is
        returned as the string ``"Error generating response: <error>"``, so
        callers must inspect the text they get back.
        """
        backends = {
            "ollama": self._generate_ollama,
            "deepseek": self._generate_deepseek,
            "huggingface": self._generate_huggingface,
            "hf_local": self._generate_hf_local,
        }
        try:
            backend = backends.get(self.provider)
            if backend is None:
                raise ValueError(f"ไม่รองรับ provider: {self.provider}")
            return backend(prompt, model, temperature, max_tokens)
        except Exception as e:
            return f"Error generating response: {e}"

    def _generate_ollama(self, prompt, model, temperature, max_tokens):
        """Call Ollama's non-streaming ``/api/generate`` endpoint."""
        response = requests.post(
            f"{self.base_url}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "num_predict": max_tokens
                }
            },
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()["response"]

    def _generate_deepseek(self, prompt, model="deepseek-chat", temperature=0.7, max_tokens=1000):
        """Call the DeepSeek chat-completions API with a single user message."""
        url = "https://api.deepseek.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "max_tokens": max_tokens
        }
        response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
        response.raise_for_status()
        result = response.json()
        return result["choices"][0]["message"]["content"]

    def _generate_hf_local(self, prompt, model, temperature, max_tokens):
        """Generate with a transformers causal LM loaded in this process.

        The tokenizer and model are loaded on first use and cached on the
        instance; they are reloaded only when a different ``model`` name is
        requested.
        """
        import torch

        if not hasattr(self, "_hf_local_model") or self._hf_local_model_name != model:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            self._hf_local_model_name = model
            self._hf_local_tokenizer = AutoTokenizer.from_pretrained(model)
            self._hf_local_model = AutoModelForCausalLM.from_pretrained(model)
            self._hf_local_model.eval()
        tokenizer = self._hf_local_tokenizer
        loaded_model = self._hf_local_model

        # Sampling needs a strictly positive temperature; fall back to greedy
        # decoding for temperature <= 0 instead of letting generate() fail.
        generation_args = {"max_new_tokens": max_tokens}
        if temperature and temperature > 0:
            generation_args.update(do_sample=True, temperature=temperature)
        else:
            generation_args["do_sample"] = False

        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = loaded_model.generate(**inputs, **generation_args)

        # A causal LM returns prompt + continuation; drop the prompt by token
        # count, which is reliable even when decoding does not reproduce the
        # prompt text character for character.
        prompt_length = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_length:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    def _generate_huggingface(self, prompt, model, temperature, max_tokens):
        """Call the hosted Hugging Face inference API for ``model``."""
        headers = {"Authorization": f"Bearer {self.api_key}"}
        response = requests.post(
            f"https://api-inference.huggingface.co/models/{model}",
            headers=headers,
            json={
                "inputs": prompt,
                "parameters": {
                    "temperature": temperature,
                    "max_new_tokens": max_tokens
                }
            },
            timeout=self.timeout,
        )
        response.raise_for_status()
        result = response.json()
        if isinstance(result, list) and len(result) > 0:
            # The API echoes the prompt inside generated_text; remove it.
            return result[0].get("generated_text", "").replace(prompt, "").strip()
        return str(result)

# 4. Dataset Generation & Augmentation
def generate_new_samples(samples: List[DataSample], llm_provider: LLMProvider, 
                        generation_type="augment", n_generate=1, custom_prompt="", model="llama3.2", max_samples_to_process=5, generation_language="auto"):
    """Ask the LLM to create new samples derived from existing ones.

    Args:
        samples: Seed samples; only the first ``max_samples_to_process`` are used.
        llm_provider: Object whose ``generate(prompt, model=...)`` returns text.
        generation_type: ``'augment'``, ``'roleplay'``, ``'topic_conditioning'``
            or ``'self_critique'``; any other value uses ``custom_prompt``.
        n_generate: Number of generation attempts per seed sample.
        custom_prompt: ``str.format`` template filled with the seed sample's
            fields; used only for unrecognised ``generation_type`` values.
        model: Model name forwarded to the provider.
        max_samples_to_process: How many seed samples to use.
        generation_language: ``"auto"`` (use each seed's language) or one of
            ``"th"``, ``"en"``, ``"zh"``, ``"ja"``.

    Returns:
        The list of newly created ``DataSample`` objects.  Attempts whose
        response contains no parseable JSON object, or that fail for any
        other reason, are skipped with a printed warning.
    """
    # UI widgets may deliver whole numbers as floats; range() and slicing
    # both require ints.
    n_generate = int(n_generate)
    max_samples_to_process = int(max_samples_to_process)

    language_instructions = {
        "th": "Please respond in Thai language. ",
        "en": "Please respond in English. ",
        "zh": "Please respond in Chinese. ",
        "ja": "Please respond in Japanese. ",
    }

    generated_samples = []

    for sample in samples[:max_samples_to_process]:
        for _ in range(n_generate):
            try:
                target_lang = sample.language if generation_language == "auto" else generation_language
                # Unknown language codes simply get no explicit instruction.
                language_instruction = language_instructions.get(target_lang, "")

                if generation_type == "augment":
                    prompt = f"""
{language_instruction}Based on this context and question, create a similar but different scenario:

Context: {sample.context}
Question: {sample.question}
Answer: {sample.answer}
Rationale: {sample.rationale}

Generate a new scenario in the same category ({sample.category}) with:
- Different context but similar moral/logical challenge
- Appropriate question
- Clear answer
- Detailed rationale

Format as JSON:
{{
    "context": "new context here",
    "question": "new question here", 
    "answer": "new answer here",
    "rationale": "detailed reasoning here"
}}"""
                elif generation_type == "roleplay":
                    # Rotate through the roles as samples are produced.
                    roles = ["ครูใหญ่", "หมอ", "นักบวช", "นักจิตวิทยา", "ผู้ปกครอง"]
                    role = roles[len(generated_samples) % len(roles)]
                    prompt = f"""
{language_instruction}คุณคือ{role} กำลังให้คำแนะนำเกี่ยวกับสถานการณ์นี้:

Context: {sample.context}
Question: {sample.question}

ในฐานะ{role} จงสร้างคำตอบและเหตุผลที่เหมาะสมจากมุมมองของบทบาทนี้

Format as JSON:
{{
    "context": "{sample.context}",
    "question": "{sample.question}",
    "answer": "คำตอบในฐานะ{role}",
    "rationale": "เหตุผลจากมุมมอง{role}"
}}"""

                elif generation_type == "topic_conditioning":
                    # Rotate through the topics as samples are produced.
                    topics = ["ปัญหาวัยรุ่น", "ความยากจน", "เทคโนโลยี", "สิ่งแวดล้อม", "ครอบครัว"]
                    topic = topics[len(generated_samples) % len(topics)]
                    prompt = f"""
{language_instruction}สร้างสถานการณ์ใหม่ในหัวข้อ "{topic}" ที่มีความซับซ้อนทางจริยธรรมคล้ายกับ:

Original context: {sample.context}
Category: {sample.category}

สร้างสถานการณ์ใหม่ที่เกี่ยวข้องกับ{topic}:

Format as JSON:
{{
    "context": "สถานการณ์เกี่ยวกับ{topic}",
    "question": "คำถามที่เหมาะสม",
    "answer": "คำตอบที่ดีที่สุด",
    "rationale": "เหตุผลโดยละเอียด"
}}"""

                elif generation_type == "self_critique":
                    prompt = f"""
{language_instruction}Analyze and improve this moral reasoning scenario:

Context: {sample.context}
Question: {sample.question}
Answer: {sample.answer}
Rationale: {sample.rationale}

1. First, critique the reasoning - what could be improved?
2. Then provide an enhanced version with better rationale

Format as JSON:
{{
    "context": "{sample.context}",
    "question": "{sample.question}",
    "answer": "improved answer",
    "rationale": "enhanced rationale with deeper analysis"
}}"""

                else:  # custom prompt
                    prompt = custom_prompt.format(**sample.model_dump())

                response = llm_provider.generate(prompt, model=model)

                # Take the outermost {...} span of the response as the JSON payload.
                json_match = re.search(r'\{.*\}', response, re.DOTALL)
                if not json_match:
                    # Includes provider error strings, which carry no JSON.
                    print(f"Warning: ไม่พบ JSON ใน response: {response[:200]}")
                    continue

                try:
                    # Raw control characters (e.g. newlines inside string
                    # values) are invalid JSON; replace them with spaces, as
                    # translate_to_multilingual does.
                    parsed_data = json.loads(re.sub(r'[\x00-\x1F\x7F]', ' ', json_match.group()))
                except json.JSONDecodeError as e:
                    print(f"Warning: ไม่สามารถ parse JSON response: {e}")
                    continue

                # Fields missing from the response fall back to the seed's values.
                new_sample = DataSample(
                    id=str(uuid.uuid4()),
                    context=parsed_data.get("context", sample.context),
                    question=parsed_data.get("question", sample.question),
                    answer=parsed_data.get("answer", sample.answer),
                    rationale=parsed_data.get("rationale", sample.rationale),
                    category=sample.category,
                    difficulty=sample.difficulty,
                    source=f"generated_{generation_type}",
                    language=target_lang,
                    options=sample.options
                )
                generated_samples.append(new_sample)

            except Exception as e:
                print(f"Warning: ไม่สามารถ generate sample: {e}")
                continue

    return generated_samples

# 5. Post-processing & Filtering
def remove_duplicates(samples: List[DataSample]) -> List[DataSample]:
    """Drop samples whose context and question were already seen.

    Two samples are duplicates when their ``context`` and their ``question``
    both match after lower-casing and stripping surrounding whitespace.  The
    first occurrence is kept and the input order is preserved.
    """
    seen = set()
    unique = []
    for s in samples:
        # A tuple keeps the two fields separate; concatenating them would make
        # ("ab", "c") and ("a", "bc") look identical.
        key = (s.context.lower().strip(), s.question.lower().strip())
        if key not in seen:
            seen.add(key)
            unique.append(s)
    return unique

def syntax_check(samples: List[DataSample]) -> List[DataSample]:
    """Keep only samples that look complete and free of placeholder text.

    A sample is dropped when, after stripping whitespace, its context or
    rationale is shorter than 10 characters, its question shorter than 5, or
    its answer shorter than 3; or when any of those four fields contains one
    of the placeholder markers (case-insensitive substring match).
    """
    # Markers are lower-cased because they are matched against lower-cased
    # text; a mixed-case marker such as "TODO" could otherwise never match.
    placeholder_texts = [p.lower() for p in ("[ใส่ข้อความ]", "TODO", "xxx", "example", "sample")]
    minimum_lengths = (10, 5, 3, 10)  # context, question, answer, rationale

    valid_samples = []
    for s in samples:
        fields = (s.context, s.question, s.answer, s.rationale)

        if any(len(text.strip()) < minimum for text, minimum in zip(fields, minimum_lengths)):
            continue

        lowered_fields = [text.lower() for text in fields]
        if any(placeholder in text for placeholder in placeholder_texts for text in lowered_fields):
            continue

        valid_samples.append(s)

    return valid_samples

def difficulty_assessment(samples: List[DataSample]) -> List[DataSample]:
    """Relabel each sample's ``difficulty`` in place using simple heuristics.

    The score combines the whitespace-separated word count of context,
    question and rationale with the number of distinct connective words found
    in the context or rationale.  The same list object is returned.
    """
    connectives = (
        "ถ้า", "แต่", "อย่างไรก็ตาม", "ในขณะที่", "แม้ว่า",
        "เนื่องจาก", "ดังนั้น", "เพราะว่า", "หากว่า", "เว้นแต่",
    )

    for item in samples:
        word_total = sum(
            len(text.split())
            for text in (item.context, item.question, item.rationale)
        )

        # Each connective counts at most once, however often it occurs.
        connective_hits = 0
        for word in connectives:
            if word in item.context or word in item.rationale:
                connective_hits += 1

        if word_total < 50 and connective_hits < 2:
            level = "easy"
        elif word_total > 150 or connective_hits > 4:
            level = "hard"
        else:
            level = "medium"
        item.difficulty = level

    return samples

def translate_to_multilingual(samples: List[DataSample], llm_provider: LLMProvider, target_lang="en", model="llama3.2", max_samples=3) -> List[DataSample]:
    """Produce LLM-translated copies of the first ``max_samples`` samples.

    Samples already in ``target_lang`` are skipped.  A copy is created only
    when the response contains a JSON object with all four text fields;
    failures are reported with a printed warning and skipped.  The input
    samples are not modified.
    """
    results = []

    for src in samples[:max_samples]:
        if src.language == target_lang:
            continue

        try:
            prompt = f"""
Translate this moral reasoning scenario to {target_lang}:

Context: {src.context}
Question: {src.question}
Answer: {src.answer}
Rationale: {src.rationale}

Maintain the moral and cultural context appropriately.

Format as JSON:
{{
    "context": "translated context",
    "question": "translated question",
    "answer": "translated answer", 
    "rationale": "translated rationale"
}}"""

            reply = llm_provider.generate(prompt, model=model)

            # Outermost {...} span of the reply, if there is one.
            found = re.search(r'\{.*\}', reply, re.DOTALL)
            if found is None:
                continue

            # Control characters are replaced by spaces before parsing.
            cleaned = re.sub(r'[\x00-\x1F\x7F]', ' ', found.group())
            fields = json.loads(cleaned)

            results.append(
                DataSample(
                    id=f"{src.id}_{target_lang}",
                    context=fields["context"],
                    question=fields["question"],
                    answer=fields["answer"],
                    rationale=fields["rationale"],
                    category=src.category,
                    difficulty=src.difficulty,
                    source=f"{src.source}_translated",
                    language=target_lang,
                    options=src.options,
                )
            )

        except Exception as e:
            print(f"Warning: ไม่สามารถแปลภาษา sample {src.id}: {e}")
            continue

    return results

def add_multiple_choice_options(samples: List[DataSample], llm_provider: LLMProvider, model="llama3.2", max_samples=3) -> List[DataSample]:
    """Ask the LLM for four answer choices for samples that have none.

    Only the first ``max_samples`` samples are considered, and those that
    already have options are left alone.  A sample's ``options`` is set in
    place only when the response contains a JSON array of exactly four
    items.  The same list object is returned.
    """
    for item in samples[:max_samples]:
        if item.options:
            continue

        try:
            prompt = f"""
Create 4 multiple choice options for this scenario, with one correct answer:

Context: {item.context}
Question: {item.question}
Correct Answer: {item.answer}

Generate 3 plausible but incorrect options and include the correct answer.

Format as JSON array:
["option A", "option B", "option C", "option D"]

Make sure the correct answer ({item.answer}) is included as one of the options.
"""

            reply = llm_provider.generate(prompt, model=model)

            # Outermost [...] span of the reply, if there is one.
            array_match = re.search(r'\[.*\]', reply, re.DOTALL)
            if array_match is not None:
                cleaned = re.sub(r'[\x00-\x1F\x7F]', ' ', array_match.group())
                candidate = json.loads(cleaned)
                if len(candidate) == 4:
                    item.options = candidate

        except Exception as e:
            print(f"Warning: ไม่สามารถสร้าง multiple choice สำหรับ {item.id}: {e}")
            continue

    return samples

# 6. Export & Visualization
def preview_data(source_type, path_or_name, file_upload):
    """Build an HTML preview of a local dataset before it is processed.

    Args:
        source_type: Only ``"local"`` is previewed; any other value yields
            the "please choose a data type" message.
        path_or_name: Path typed by the user; used when nothing was uploaded.
        file_upload: Uploaded-file object exposing ``.name``, or a falsy
            value when nothing was uploaded.

    Returns:
        A ``(gr.update(...), message)`` pair.  On success the update makes the
        preview visible with the generated HTML and the message is empty; on
        failure the preview is hidden and the message explains why.

    File names, column names and cell values come from the user's file, so
    they are HTML-escaped before being embedded in the preview.
    """
    import html  # local import: only this function needs it

    def column_list(frame):
        # Column labels may be non-strings (e.g. ints), so stringify them.
        return ', '.join(html.escape(str(col)) for col in frame.columns)

    try:
        # An uploaded file takes precedence over the typed path.
        file_path = file_upload.name if file_upload else path_or_name
        
        if source_type == "local":
            if not file_path:
                return gr.update(visible=False), "กรุณาเลือกไฟล์หรือใส่ path"
            
            if not os.path.exists(file_path):
                return gr.update(visible=False), f"ไม่พบไฟล์: {file_path}"
            
            ext = os.path.splitext(file_path)[-1].lower()
            file_label = html.escape(os.path.basename(file_path))
            
            if ext == ".csv":
                df = pd.read_csv(file_path, encoding="utf-8")
                preview_html = f"""
                <div style="margin: 10px 0;">
                    <h4>📄 ไฟล์: {file_label}</h4>
                    <p><strong>จำนวนแถว:</strong> {len(df)} | <strong>จำนวนคอลัมน์:</strong> {len(df.columns)}</p>
                    <p><strong>คอลัมน์:</strong> {column_list(df)}</p>
                    <h5>ตัวอย่างข้อมูล (5 แถวแรก):</h5>
                    {df.head().to_html(classes='table table-striped', escape=True)}
                </div>
                """
                return gr.update(visible=True, value=preview_html), ""
                
            elif ext == ".jsonl":
                # Single pass: parse only the first 5 lines, count all of them.
                data = []
                total_lines = 0
                with open(file_path, 'r', encoding="utf-8") as f:
                    for i, line in enumerate(f):
                        total_lines += 1
                        if i >= 5:
                            continue
                        try:
                            data.append(json.loads(line.strip()))
                        except json.JSONDecodeError:
                            continue
                
                if data:
                    df = pd.DataFrame(data)
                    preview_html = f"""
                    <div style="margin: 10px 0;">
                        <h4>📄 ไฟล์: {file_label}</h4>
                        <p><strong>จำนวนบรรทัด:</strong> {total_lines} | <strong>คอลัมน์:</strong> {column_list(df)}</p>
                        <h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
                        {df.to_html(classes='table table-striped', escape=True)}
                    </div>
                    """
                    return gr.update(visible=True, value=preview_html), ""
                else:
                    return gr.update(visible=False), "ไม่สามารถอ่านข้อมูลจากไฟล์ JSONL"
                    
            elif ext == ".json":
                with open(file_path, 'r', encoding="utf-8") as f:
                    data = json.load(f)
                
                if isinstance(data, list):
                    df = pd.DataFrame(data[:5])  # first 5 records only
                    preview_html = f"""
                    <div style="margin: 10px 0;">
                        <h4>📄 ไฟล์: {file_label}</h4>
                        <p><strong>จำนวนรายการ:</strong> {len(data)} | <strong>คอลัมน์:</strong> {column_list(df)}</p>
                        <h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
                        {df.to_html(classes='table table-striped', escape=True)}
                    </div>
                    """
                else:
                    # A single JSON object is shown as a one-row table.
                    df = pd.DataFrame([data])
                    preview_html = f"""
                    <div style="margin: 10px 0;">
                        <h4>📄 ไฟล์: {file_label}</h4>
                        <p><strong>ประเภท:</strong> Object เดียว | <strong>คอลัมน์:</strong> {column_list(df)}</p>
                        <h5>ข้อมูล:</h5>
                        {df.to_html(classes='table table-striped', escape=True)}                    </div>
                    """
                return gr.update(visible=True, value=preview_html), ""
            elif ext == ".parquet":
                df = pd.read_parquet(file_path)
                preview_html = f"""
                <div style="margin: 10px 0;">
                    <h4>📄 ไฟล์: {file_label}</h4>
                    <p><strong>จำนวนแถว:</strong> {len(df)} | <strong>จำนวนคอลัมน์:</strong> {len(df.columns)}</p>
                    <p><strong>คอลัมน์:</strong> {column_list(df)}</p>
                    <h5>ตัวอย่างข้อมูล (5 แถวแรก):</h5>
                    {df.head().to_html(classes='table table-striped', escape=True)}
                </div>
                """
                return gr.update(visible=True, value=preview_html), ""
            elif os.path.isdir(file_path):
                # A directory is accepted only if it contains dataset_info.json.
                if os.path.exists(os.path.join(file_path, "dataset_info.json")):
                    try:
                        dataset = Dataset.load_from_disk(file_path)
                        sample_data = [dict(item) for i, item in enumerate(dataset) if i < 5]
                        df = pd.DataFrame(sample_data)
                        
                        preview_html = f"""
                        <div style="margin: 10px 0;">
                            <h4>📁 HF Dataset Directory: {file_label}</h4>
                            <p><strong>จำนวนรายการ:</strong> {len(dataset)} | <strong>คอลัมน์:</strong> {column_list(df)}</p>
                            <h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
                            {df.to_html(classes='table table-striped', escape=True)}
                        </div>
                        """
                        return gr.update(visible=True, value=preview_html), ""
                    except Exception as e:
                        return gr.update(visible=False), f"ไม่สามารถโหลด HF dataset: {str(e)}"
                else:
                    return gr.update(visible=False), f"ไม่ใช่ HF dataset directory ที่ถูกต้อง"
            else:
                return gr.update(visible=False), f"ไม่รองรับไฟล์ประเภท {ext}"
                
        return gr.update(visible=False), "กรุณาเลือกประเภทข้อมูล"
        
    except Exception as e:
        return gr.update(visible=False), f"เกิดข้อผิดพลาด: {str(e)}"

def update_path_from_file(file_upload):
    """Return the uploaded file's ``name`` attribute, or ``""`` if nothing was uploaded."""
    return file_upload.name if file_upload else ""

def export_dataset(samples: List[DataSample], format_type="csv", output_path="output"):
    """Write the samples to disk and return the path of the created file.

    Args:
        samples: Samples to export; an empty list produces an empty export.
        format_type: ``"csv"``, ``"jsonl"``, ``"parquet"`` or ``"hf_dataset"``
            (a dataset directory saved with ``save_to_disk`` and zipped).
        output_path: Path prefix; a timestamp and extension are appended.

    Returns:
        The written file's path (for ``"hf_dataset"``, the ``.zip`` path).

    Raises:
        ValueError: if ``format_type`` is not one of the supported values.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    records = [s.model_dump() for s in samples]

    if format_type == "csv":
        filename = f"{output_path}_{timestamp}.csv"
        # utf-8-sig writes a BOM at the start of the file.
        pd.DataFrame(records).to_csv(filename, index=False, encoding="utf-8-sig")
        return filename
    elif format_type == "jsonl":
        filename = f"{output_path}_{timestamp}.jsonl"
        with open(filename, 'w', encoding="utf-8") as f:
            for record in records:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
        return filename
    elif format_type == "hf_dataset":
        import shutil
        # Column names come from the model definition rather than from
        # samples[0], so an empty sample list no longer raises IndexError.
        data_dict = {
            key: [record[key] for record in records]
            for key in DataSample.model_fields
        }
        dataset = Dataset.from_dict(data_dict)
        hf_dir = f"{output_path}_hf_{timestamp}"
        dataset.save_to_disk(hf_dir)
        # Zip the directory so it can be offered as a single download.
        zip_path = f"{hf_dir}.zip"
        shutil.make_archive(hf_dir, 'zip', hf_dir)
        return zip_path

    elif format_type == "parquet":
        filename = f"{output_path}_{timestamp}.parquet"
        pd.DataFrame(records).to_parquet(filename, index=False, engine='pyarrow')
        return filename

    else:
        raise ValueError(f"ไม่รองรับรูปแบบ: {format_type}")

def get_dataset_stats(samples: List[DataSample]) -> Dict[str, Any]:
    """Summarise a dataset: counts per label column and average text lengths.

    Args:
        samples: Samples to summarise; each one is serialised via
            ``model_dump()``.

    Returns:
        A dict with ``total``, the value counts for ``categories``,
        ``difficulties``, ``languages`` and ``sources``, the mean character
        length of the four text fields (``avg_<field>_length``), and
        ``with_options`` (how many samples have ``options`` set). The same
        keys are present for an empty input, with zero / empty values, so
        callers can read them without checking ``total`` first.
    """
    text_fields = ("context", "question", "answer", "rationale")
    count_columns = (
        ("categories", "category"),
        ("difficulties", "difficulty"),
        ("languages", "language"),
        ("sources", "source"),
    )

    if not samples:
        # Previously only {"total": 0} was returned, which made readers of
        # the other keys fail with KeyError on an empty dataset.
        empty: Dict[str, Any] = {"total": 0}
        empty.update({key: {} for key, _ in count_columns})
        empty.update({f"avg_{field}_length": 0.0 for field in text_fields})
        empty["with_options"] = 0
        return empty

    df = pd.DataFrame([s.model_dump() for s in samples])

    stats: Dict[str, Any] = {"total": len(samples)}
    for key, column in count_columns:
        stats[key] = df[column].value_counts().to_dict()
    for field in text_fields:
        # Convert the pandas/NumPy scalar to a plain float.
        stats[f"avg_{field}_length"] = float(df[field].str.len().mean())
    stats["with_options"] = sum(1 for s in samples if s.options is not None)

    return stats

# 7. Main Workflow Function
def _resolve_model_name(llm_provider_type, ollama_model, deepseek_model):
    """Return the model identifier that goes with the selected provider."""
    if llm_provider_type == "ollama":
        return ollama_model
    if llm_provider_type == "deepseek":
        return deepseek_model
    # NOTE(review): "huggingface" and "hf_local" also end up here and get a
    # DeepSeek model name - confirm LLMProvider handles that as intended.
    return "deepseek-chat"


def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_url,
                 ollama_model, deepseek_model, generation_type, n_generate, max_samples_to_process, custom_prompt, generation_language, target_language,
                 add_multiple_choice, export_format):
    """Run the whole pipeline: load, generate, post-process, translate, export.

    Steps, in order: load the dataset, build the LLM provider, optionally
    generate new samples, de-duplicate / syntax-check / assess difficulty,
    optionally translate (at most 10 samples), optionally add multiple-choice
    options (at most 10 samples), export, and collect statistics.

    Args:
        source_type, path_or_name: Passed to ``load_data``.
        llm_provider_type, api_key, base_url: Used to build ``LLMProvider``;
            an empty ``base_url`` falls back to ``http://localhost:11434``.
        ollama_model, deepseek_model: Candidate model names; the one matching
            the provider is used.
        generation_type, n_generate, max_samples_to_process, custom_prompt,
            generation_language: Passed to ``generate_new_samples``;
            generation is skipped when ``n_generate`` is not positive.
        target_language: Translation target; falsy or ``"none"`` skips it.
        add_multiple_choice: Whether to add multiple-choice options.
        export_format: Passed to ``export_dataset``.

    Returns:
        A 4-tuple for the Gradio outputs: the progress log, an HTML table of
        the first 10 samples, an update for the download file component and
        an update for the download info markdown. On any exception the tuple
        carries the error message, an empty preview and two updates that
        hide the download components.
    """
    try:
        progress_text = "เริ่มต้น workflow...\n"

        # 1. Load dataset
        progress_text += "📂 กำลังโหลด dataset...\n"
        samples = load_data(source_type, path_or_name)
        progress_text += f"✅ โหลดสำเร็จ {len(samples)} samples\n"

        # 2. Setup LLM
        progress_text += f"🤖 กำลังตั้งค่า LLM ({llm_provider_type})...\n"
        llm_provider = LLMProvider(
            provider=llm_provider_type,
            api_key=api_key or None,
            base_url=base_url or "http://localhost:11434"
        )
        # Resolve the model once, before any step that needs it. It used to
        # be assigned only inside the generation branch, so translating with
        # n_generate == 0 failed on an unbound local.
        model_name = _resolve_model_name(llm_provider_type, ollama_model, deepseek_model)

        # NOTE: the LLM calls below used to be wrapped in
        # `with gr.Progress(track_tqdm=True):` for the "huggingface" provider.
        # gr.Progress is not a context manager, so that path always failed.
        # To surface tqdm progress, Gradio expects a trailing
        # `progress=gr.Progress(track_tqdm=True)` default parameter instead.

        # 3. Generate new samples
        if n_generate > 0:
            progress_text += f"✨ กำลัง generate {n_generate} samples ใหม่ ({generation_type}) จาก {min(max_samples_to_process, len(samples))} samples เดิม...\n"
            new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process, generation_language)
            samples.extend(new_samples)
            progress_text += f"✅ Generate สำเร็จ {len(new_samples)} samples ใหม่\n"

        # 4. Post-processing
        progress_text += "🔧 กำลัง post-process...\n"
        original_count = len(samples)
        samples = remove_duplicates(samples)
        progress_text += f"  - ลบ duplicate: {original_count} -> {len(samples)}\n"

        samples = syntax_check(samples)
        progress_text += f"  - syntax check: {len(samples)} samples ผ่าน\n"

        samples = difficulty_assessment(samples)
        progress_text += f"  - ประเมิน difficulty เสร็จสิ้น\n"

        # 5. Translation
        if target_language and target_language != "none":
            progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
            max_translate_samples = min(10, len(samples))  # translate at most 10 samples
            translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
            samples.extend(translated)
            progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"

        # 6. Add multiple choice
        if add_multiple_choice:
            progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
            max_mc_samples = min(10, len(samples))  # add options to at most 10 samples
            samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
            progress_text += "✅ เพิ่ม multiple choice เสร็จสิ้น\n"

        # 7. Export
        progress_text += f"💾 กำลัง export เป็น {export_format}...\n"
        output_file = export_dataset(samples, export_format)
        progress_text += f"✅ Export สำเร็จ: {output_file}\n"

        # 8. Stats
        stats = get_dataset_stats(samples)
        # An empty dataset may yield only the "total" key, so read the rest
        # defensively instead of failing with KeyError.
        categories = stats.get("categories", {})
        difficulties = stats.get("difficulties", {})
        languages = stats.get("languages", {})
        with_options = stats.get("with_options", 0)
        progress_text += "\n📊 สถิติ Dataset:\n"
        progress_text += f"  - จำนวนทั้งหมด: {stats['total']}\n"
        progress_text += f"  - Categories: {categories}\n"
        progress_text += f"  - Difficulties: {difficulties}\n"
        progress_text += f"  - Languages: {languages}\n"
        progress_text += f"  - มี Multiple Choice: {with_options}\n"

        # Build the download summary shown next to the file component.
        file_size = os.path.getsize(output_file) if os.path.exists(output_file) else 0
        file_size_mb = file_size / (1024 * 1024)

        download_info_text = f"""
### 📁 ไฟล์พร้อมดาวน์โหลด
- **ชื่อไฟล์**: `{os.path.basename(output_file)}`
- **รูปแบบ**: {export_format.upper()}
- **ขนาด**: {file_size_mb:.2f} MB
- **จำนวนข้อมูล**: {stats['total']} samples
- **สร้างเมื่อ**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
"""

        return (
            progress_text,
            pd.DataFrame([s.model_dump() for s in samples]).head(10).to_html(),
            gr.update(value=output_file, visible=True),
            gr.update(value=download_info_text, visible=True)
        )
    except Exception as e:
        # Top-level UI boundary: report the failure to the user, but keep the
        # traceback in the server log so the cause can be diagnosed.
        import traceback
        traceback.print_exc()
        error_text = f"❌ เกิดข้อผิดพลาด: {str(e)}"
        return (
            error_text,
            "",
            gr.update(visible=False),
            gr.update(visible=False)
        )

# 8. Gradio Interface
# Top-level Gradio app. Every component and event binding that follows is
# created inside this Blocks context; creation order determines the layout.
with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 ระบบ Generate Dataset จากโมเดล AI")
    gr.Markdown("ระบบสำหรับสร้าง, ขยาย, และประมวลผล dataset ด้วย AI models")
    
    # ⚠️ Intellectual-property notice shown above the tabs
    gr.Markdown("""
    ---
    ### ⚠️ **คำเตือนเรื่องทรัพย์สินทางปัญญา**
    
    **ระบบนี้เป็นทรัพย์สินทางปัญญา** ห้ามคัดลอก แก้ไข หรือนำไปใช้เพื่อการพาณิชย์โดยไม่ได้รับอนุญาต
    
    - 🚫 **ห้ามคัดลอกโค้ด** หรือส่วนใดส่วนหนึ่งของระบบ
    - 🚫 **ห้ามแก้ไขหรือดัดแปลง** เพื่อสร้างผลงานใหม่
    - 🚫 **ห้ามจำหน่าย** หรือแจกจ่ายต่อโดยไม่ได้รับอนุญาต
    - ✅ **อนุญาตให้ใช้งาน** เฉพาะเพื่อการทดสอบและเรียนรู้เท่านั้น
    
    **สงวนลิขสิทธิ์ © 2025 - All Rights Reserved**
    
    การใช้งานระบบนี้ถือว่าท่านรับทราบและยอมรับเงื่อนไขข้างต้น
    
    ---
    """)
    # Tab 1: where the input dataset comes from.
    with gr.Tab("📂 Dataset Input"):
        with gr.Row():
            # "local" is the only source type offered in the UI.
            source_type = gr.Radio(
                ["local"],
                label="ประเภทแหล่งข้อมูล",
                info="local = ไฟล์ในเครื่องหรือ HF dataset directory ที่โหลดมา",
                value="local"
            )
        
        with gr.Row():
            # Free-text path; also filled in automatically after an upload
            # (see the file_upload.upload binding further down).
            with gr.Column(scale=3):                path_or_name = gr.Textbox(
                    label="Path หรือ Dataset Name",
                    placeholder="เช่น data.csv, data.parquet, output_hf_xxxx/ หรือ microsoft/DialoGPT-medium",
                    info="ใส่ path ไฟล์ (.csv, .jsonl, .json, .parquet) หรือ HF dataset directory ที่โหลดมา"
                )
            # Upload widget restricted to the four supported file extensions.
            with gr.Column(scale=1):                file_upload = gr.File(
                    label="หรือเลือกไฟล์",
                    file_types=[".csv", ".jsonl", ".json", ".parquet"],
                    visible=True
                )
        
        # Preview section
        with gr.Row():
            preview_btn = gr.Button("🔍 ดูตัวอย่างข้อมูล", variant="secondary")
        
        with gr.Row():
            # Hidden until the preview handler makes it visible.
            data_preview = gr.HTML(
                label="ตัวอย่างข้อมูล",
                visible=False
            )
    
    # Tab 2: which LLM backend to use and how to reach it.
    with gr.Tab("🤖 LLM Settings"):
        with gr.Row():
            # Provider choice; it drives which model dropdown is visible
            # (see update_model_visibility below).
            llm_provider_type = gr.Dropdown(
                ["ollama", "deepseek", "huggingface", "hf_local"],
                label="LLM Provider", 
                value="ollama",
                info="เลือกผู้ให้บริการ LLM"
            )
            # Masked input; forwarded to the workflow as `api_key`.
            api_key = gr.Textbox(
                label="API Key (ถ้าจำเป็น)",
                type="password",
                placeholder="สำหรับ DeepSeek หรือ HuggingFace"
            )
        with gr.Row():
            # Also used by the "refresh models" button to query Ollama.
            base_url = gr.Textbox(
                label="Base URL",
                value="http://localhost:11434",
                info="สำหรับ Ollama หรือ local LLM server"
            )

        # NOTE(review): this single Row holds the HF login controls AND the
        # model dropdowns / refresh button created further down.
        with gr.Row():
            hf_token = gr.Textbox(
                label="Hugging Face Token (สำหรับโหลดโมเดล private)",
                type="password",
                placeholder="กรอก HF Token ที่นี่"
            )
            hf_login_btn = gr.Button("Login Hugging Face", variant="primary")
            def login_hf(token):
                """Store the Hugging Face token in the ``HF_TOKEN`` environment variable.

                Surrounding whitespace (common when pasting) is stripped. An
                empty token is rejected instead of being stored and reported
                as a success.

                Returns:
                    A status message for the login status textbox.
                """
                cleaned = (token or "").strip()
                if not cleaned:
                    return "❌ กรุณากรอก Hugging Face Token"
                os.environ["HF_TOKEN"] = cleaned
                return "Token ถูกตั้งค่าสำเร็จ"
            # Read-only box that shows the message returned by login_hf.
            hf_login_status = gr.Textbox(label="สถานะการ Login", interactive=False)
            hf_login_btn.click(
                fn=login_hf,
                inputs=[hf_token],
                outputs=[hf_login_status]
            )

            # Get available models and set appropriate default.
            # This call runs once while the UI is being built and uses
            # get_ollama_models' default base URL; when the server cannot be
            # reached it returns an empty list and "llama3.2" is used.
            available_models = get_ollama_models()
            default_model = available_models[0] if available_models else "llama3.2"
            
            # allow_custom_value lets the default stay selected even when it
            # is not among the fetched choices.
            ollama_model = gr.Dropdown(
                choices=available_models,
                label="Ollama Model",
                value=default_model,
                visible=True,
                allow_custom_value=True,
                info="เลือก model จาก Ollama"
            )
            
            # Hidden initially because the default provider is "ollama".
            deepseek_model = gr.Dropdown(
                choices=["deepseek-chat", "deepseek-reasoner"],
                label="DeepSeek Model",
                value="deepseek-chat",
                visible=False,
                info="deepseek-chat = DeepSeek-V3-0324, deepseek-reasoner = DeepSeek-R1-0528"
            )
            
            # Re-queries Ollama using the current Base URL textbox value.
            refresh_models_btn = gr.Button(
                "🔄 รีเฟรช Models", 
                size="sm",
                visible=True
            )
          # ฟังก์ชันสำหรับรีเฟรช models
        def refresh_ollama_models(base_url_val):
            """Re-query Ollama at *base_url_val* and refresh the model dropdown.

            Returns a dropdown update listing the fetched models with the
            first one selected. When nothing is returned, or the lookup
            raises, the dropdown falls back to the single entry "llama3.2".
            """
            fallback = ["llama3.2"]
            try:
                fetched = get_ollama_models(base_url_val)
                names = fetched if fetched else fallback
                return gr.update(choices=names, value=names[0])
            except Exception as e:
                print(f"Error refreshing models: {e}")
                return gr.update(choices=fallback, value=fallback[0])
          # ฟังก์ชันสำหรับแสดง/ซ่อน model dropdown ตามผู้ให้บริการ
        def update_model_visibility(provider):
            """Show only the model controls that belong to *provider*.

            Returns three visibility updates, in order: the Ollama dropdown,
            the DeepSeek dropdown, and the refresh button (which follows the
            Ollama dropdown).
            """
            show_ollama = provider == "ollama"
            show_deepseek = provider == "deepseek"
            flags = (show_ollama, show_deepseek, show_ollama)
            return tuple(gr.update(visible=flag) for flag in flags)
        
        # Event handlers
        refresh_models_btn.click(
            fn=refresh_ollama_models,
            inputs=[base_url],
            outputs=[ollama_model]
        )
        llm_provider_type.change(
            fn=update_model_visibility,
            inputs=[llm_provider_type],
            outputs=[ollama_model, deepseek_model, refresh_models_btn]
        )
        with gr.Tab("✨ Generation Settings"):
            with gr.Row():
                generation_type = gr.Dropdown(
                    ["augment", "roleplay", "topic_conditioning", "self_critique", "custom"],
                    label="ประเภทการ Generate",
                    value="augment",
                info="วิธีการสร้างข้อมูลใหม่"
            )
            generation_language = gr.Dropdown(
                ["auto", "th", "en", "zh", "ja"],
                label="ภาษาในการ Generate",
                value="auto",
                info="ภาษาที่ต้องการให้ LLM สร้างข้อมูลใหม่ (auto = ตามข้อมูลเดิม)"
            )
        with gr.Row():
            n_generate = gr.Slider(
                1, 5, value=1, step=1,
                label="จำนวนรอบ Generate",
                info="จำนวน samples ใหม่ที่จะสร้างต่อ original sample"
            )
        with gr.Row():
            max_samples_to_process = gr.Slider(
                1, 50, value=5, step=1,
                label="จำนวน Samples เดิมที่จะใช้ Generate",
                info="เลือกจำนวน samples จากข้อมูลเดิมที่จะใช้สร้างข้อมูลใหม่"
            )
            total_new_samples = gr.Number(
                label="รวมจำนวน Samples ใหม่ที่คาดว่าจะได้",
                value=5,
                interactive=False,
                info="คำนวณจาก: จำนวน samples เดิม × จำนวนรอบ generate"
            )
        
        custom_prompt = gr.Textbox(
            label="Custom Prompt (ถ้าเลือก custom)",
            placeholder="ใช้ {context}, {question}, {answer} เป็น placeholder",
            lines=3,
            visible=False
        )
        def update_custom_prompt_visibility(gen_type):
            return gr.update(visible=(gen_type == "custom"))
        
        def update_total_samples_calculation(max_samples, n_gen):
            total = max_samples * n_gen
            return gr.update(value=total)
        
        generation_type.change(
            update_custom_prompt_visibility,
            inputs=[generation_type],
            outputs=[custom_prompt]
        )
        
        # อัปเดตการคำนวณจำนวน samples ใหม่
        max_samples_to_process.change(
            update_total_samples_calculation,
            inputs=[max_samples_to_process, n_generate],
            outputs=[total_new_samples]
        )
        n_generate.change(
            update_total_samples_calculation,
            inputs=[max_samples_to_process, n_generate],
            outputs=[total_new_samples]
        )

        # ปุ่มโหลด Dataset จาก Hugging Face
        hf_dataset_name = gr.Textbox(
            label="ชื่อ Dataset จาก Hugging Face",
            placeholder="เช่น squad หรือ username/dataset-name"
        )
        hf_dataset_btn = gr.Button("โหลด Dataset จาก Hugging Face", variant="primary")
        hf_dataset_status = gr.Textbox(label="สถานะการโหลด", interactive=False)
    def download_hf_dataset(dataset_name):
        from datasets import load_dataset
        try:
            ds = load_dataset(dataset_name)
            return f"✅ โหลด Dataset {dataset_name} สำเร็จ"
        except Exception as e:
            return f"❌ โหลด Dataset {dataset_name} ไม่สำเร็จ: {e}"
    hf_dataset_btn.click(
        fn=download_hf_dataset,
        inputs=[hf_dataset_name],
        outputs=[hf_dataset_status]
        )
    
    # Tab: download a Hugging Face model by name.
    with gr.Tab("🤗 Hugging Face Model Download"):
        hf_model_name = gr.Textbox(
            label="ชื่อโมเดล Hugging Face",
            placeholder="เช่น meta-llama/Llama-2-7b-chat-hf"
        )
        hf_download_btn = gr.Button("ดาวน์โหลดโมเดล", variant="primary")
        hf_download_status = gr.Textbox(label="สถานะการดาวน์โหลด", interactive=False)
        # Only the model name is passed, so download_hf_model runs with its
        # defaults: no explicit output directory and no explicit token argument.
        hf_download_btn.click(
            fn=download_hf_model,
            inputs=[hf_model_name],
            outputs=[hf_download_status]
        )
    # Tab: optional steps applied after generation.
    with gr.Tab("🔧 Post-processing"):
        with gr.Row():
            # "none" disables translation; main_workflow translates at most
            # 10 samples per run.
            target_language = gr.Dropdown(
                ["none", "en", "th", "zh", "ja"],
                label="แปลภาษา",
                value="none",
                info="แปลเป็นภาษาเป้าหมาย (none = ไม่แปล)"
            )
            # main_workflow adds options to at most 10 samples per run.
            add_multiple_choice = gr.Checkbox(
                label="เพิ่ม Multiple Choice Options",
                value=False,
                info="สร้างตัวเลือกผิดสำหรับทำ multiple choice"
            )
    
    # Tab: output format for the exported dataset.
    with gr.Tab("💾 Export Settings"):
        # The help text previously described hf_dataset as Parquet, but
        # export_dataset writes a save_to_disk (Arrow) directory and zips it.
        export_format = gr.Dropdown(
            ["csv", "jsonl", "parquet","hf_dataset"],
            label="รูปแบบ Export",
            value="parquet",
            info="hf_dataset = HF Dataset (Arrow directory บีบอัดเป็น .zip), parquet = Parquet ไฟล์"
        )
    # Tab: results of the last workflow run.
    with gr.Tab("📊 ผลลัพธ์"):
        # Progress / error log; also receives the message from the data
        # preview handler.
        progress_output = gr.Textbox(
            label="สถานะ",
            lines=15,
            max_lines=20,
            interactive=False,
            show_copy_button=True
        )

        # HTML table of the first 10 samples, filled by main_workflow.
        preview_output = gr.HTML(
            label="ตัวอย่างข้อมูล (10 รายการแรก)"
        )

        with gr.Row():
            # Hidden until a run succeeds and an export file exists.
            download_file = gr.File(
                label="💾 ดาวน์โหลด Dataset ที่สร้างแล้ว",
                visible=False,
                interactive=False
            )
        # Summary of the exported file; hidden until a run succeeds.
        download_info = gr.Markdown(
            value="",
            visible=False
        )
    # Action buttons, placed below the tabs.
    with gr.Row():
        run_btn = gr.Button("🚀 เริ่มต้น Workflow", variant="primary", size="lg")
        clear_btn = gr.Button("🗑️ ล้างข้อมูล", variant="secondary")

    # Run the whole pipeline; the input order must match main_workflow's
    # parameter order.
    run_btn.click(
        fn=main_workflow,
        inputs=[
            source_type, path_or_name, llm_provider_type, api_key, base_url,
            ollama_model, deepseek_model, generation_type, n_generate, max_samples_to_process, custom_prompt, generation_language, target_language,
            add_multiple_choice, export_format
        ],
        outputs=[progress_output, preview_output, download_file, download_info]
    )

    # A second `preview_output = gr.HTML(...)` used to be created here. It
    # added a stray component below the buttons and rebound the name, so the
    # clear button reset that stray component instead of the preview that
    # the workflow fills. The clear handler now targets the real one.
    clear_btn.click(
        lambda: ("", "", gr.update(visible=False), gr.update(visible=False)),
        outputs=[progress_output, preview_output, download_file, download_info]
    )

    # Preview event handlers
    preview_btn.click(
        fn=preview_data,
        inputs=[source_type, path_or_name, file_upload],
        outputs=[data_preview, progress_output]
    )

    # Copy the uploaded file's path into the path textbox.
    file_upload.upload(
        fn=update_path_from_file,
        inputs=[file_upload],
        outputs=[path_or_name]
    )
    
    # Reference tab: dataset schema, sample files and supported formats.
    # The "HF Dataset" export line previously said Parquet; export_dataset
    # actually writes a save_to_disk (Arrow) directory and zips it.
    with gr.Tab("📋 ตัวอย่าง Dataset Schema"):
        gr.Markdown("""
        ## Schema ของ Dataset
        
        | Field | ประเภท | อธิบาย |
        |-------|--------|--------|
        | id | string | รหัสเฉพาะของ sample |
        | context | string | บริบท/สถานการณ์ |
        | question | string | คำถาม |
        | options | list | ตัวเลือก (สำหรับ multiple choice) |
        | answer | string | คำตอบที่ถูกต้อง |
        | rationale | string | เหตุผล/คำอธิบาย |
        | category | string | หมวดหมู่ |
        | difficulty | string | ระดับความยาก (easy/medium/hard) |
        | source | string | แหล่งที่มาของข้อมูล |
        | language | string | ภาษา (th/en/zh/ja) |
        
        ## ตัวอย่างไฟล์ CSV:
        ```csv
        id,context,question,answer,rationale,category,difficulty,source,language
        1,"นักเรียนคนหนึ่งเห็นเพื่อนทำโกง","ควรรายงานครูหรือไม่","ควรรายงาน","เพื่อความยุติธรรม","การศึกษา","medium","manual","th"
        ```
          ## ตัวอย่างไฟล์ JSONL:
        ```json
        {"id": "1", "context": "นักเรียนคนหนึ่งเห็นเพื่อนทำโกง", "question": "ควรรายงานครูหรือไม่", "answer": "ควรรายงาน", "rationale": "เพื่อความยุติธรรม", "category": "การศึกษา", "difficulty": "medium", "source": "manual", "language": "th"}
        ```
          ## รูปแบบ Export ที่รองรับ:
        - **CSV**: ไฟล์ Excel/Spreadsheet ทั่วไป
        - **JSONL**: JSON Lines สำหรับ machine learning
        - **Parquet**: รูปแบบคอลัมน์ที่มีประสิทธิภาพสูง (แนะนำ)
        - **HF Dataset**: Hugging Face Dataset (Arrow directory จาก `save_to_disk`) บีบอัดเป็นไฟล์ .zip
        
        ## ฟีเจอร์การ Generate ข้อมูล:
        - **เลือกภาษา**: สามารถเลือกภาษาที่ต้องการให้ LLM generate (auto, th, en, zh, ja)
        - **เลือกจำนวน samples**: กำหนดได้ว่าจะใช้ข้อมูลเดิมกี่ sample ในการ generate
        - **Multiple choice generation**: เพิ่มตัวเลือกผิดสำหรับทำ multiple choice
        - **Translation**: แปลข้อมูลเป็นภาษาอื่นๆ
        
        ## การโหลด Dataset ที่สร้างแล้ว:
        - สามารถโหลด output ที่สร้างแล้วกลับมาใช้ได้
        - รองรับ `.csv`, `.jsonl`, `.json`, `.parquet` และ HF dataset directories
        - ใส่ path ของไฟล์หรือ directory ใน "Path หรือ Dataset Name"
        """)

# Start the server only when this file is executed as a script, so that
# importing the module (tests, tooling, reload helpers) does not launch it.
if __name__ == "__main__":
    demo.launch()