Nattapong Tapachoom
commited on
Commit
·
ac711d6
1
Parent(s):
c484890
Add .gitignore file and update requirements.txt with additional dependencies
Browse files- .gitignore +33 -0
- app.py +418 -80
- requirements.txt +3 -1
.gitignore
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*.pyo
|
5 |
+
*.pyd
|
6 |
+
*.pyc
|
7 |
+
*.ipynb_checkpoints
|
8 |
+
|
9 |
+
# VSCode
|
10 |
+
.vscode/
|
11 |
+
.history/
|
12 |
+
|
13 |
+
# Data & Output
|
14 |
+
*.csv
|
15 |
+
*.jsonl
|
16 |
+
*.hf_dataset
|
17 |
+
output_*/
|
18 |
+
*.arrow
|
19 |
+
*.parquet
|
20 |
+
|
21 |
+
# Environment
|
22 |
+
.env
|
23 |
+
.venv/
|
24 |
+
env/
|
25 |
+
venv/
|
26 |
+
*.egg-info/
|
27 |
+
|
28 |
+
# OS
|
29 |
+
.DS_Store
|
30 |
+
Thumbs.db
|
31 |
+
|
32 |
+
# Others
|
33 |
+
*.log
|
app.py
CHANGED
@@ -48,24 +48,50 @@ def load_data(source_type, path_or_name):
|
|
48 |
with open(path_or_name, 'r', encoding="utf-8") as f:
|
49 |
raw_data = json.load(f)
|
50 |
data = raw_data if isinstance(raw_data, list) else [raw_data]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
else:
|
52 |
raise ValueError(f"ไม่รองรับไฟล์ประเภท {ext}")
|
53 |
|
54 |
# แปลงเป็น DataSample objects
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
samples = []
|
56 |
for i, item in enumerate(data):
|
57 |
try:
|
58 |
-
|
59 |
-
|
60 |
-
item['id'] = str(uuid.uuid4())
|
61 |
-
if 'source' not in item:
|
62 |
-
item['source'] = f"local_{os.path.basename(path_or_name)}"
|
63 |
-
if 'difficulty' not in item:
|
64 |
-
item['difficulty'] = "medium"
|
65 |
-
if 'language' not in item:
|
66 |
-
item['language'] = "th"
|
67 |
-
|
68 |
-
samples.append(DataSample(**item))
|
69 |
except ValidationError as e:
|
70 |
print(f"Warning: รายการที่ {i+1} ข้อมูลไม่ถูกต้อง: {e}")
|
71 |
continue
|
@@ -116,6 +142,18 @@ def load_data(source_type, path_or_name):
|
|
116 |
raise Exception(f"ข้อผิดพลาดในการโหลดข้อมูล: {e}")
|
117 |
|
118 |
# 3. LLM API Integration (รองรับหลาย provider)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
class LLMProvider:
|
120 |
def __init__(self, provider="ollama", api_key=None, base_url="http://localhost:11434"):
|
121 |
self.provider = provider
|
@@ -126,12 +164,10 @@ class LLMProvider:
|
|
126 |
try:
|
127 |
if self.provider == "ollama":
|
128 |
return self._generate_ollama(prompt, model, temperature, max_tokens)
|
129 |
-
elif self.provider == "openai":
|
130 |
-
return self._generate_openai(prompt, model, temperature, max_tokens)
|
131 |
-
elif self.provider == "huggingface":
|
132 |
-
return self._generate_huggingface(prompt, model, temperature, max_tokens)
|
133 |
elif self.provider == "deepseek":
|
134 |
return self._generate_deepseek(prompt, model, temperature, max_tokens)
|
|
|
|
|
135 |
elif self.provider == "hf_local":
|
136 |
return self._generate_hf_local(prompt, model, temperature, max_tokens)
|
137 |
else:
|
@@ -155,20 +191,7 @@ class LLMProvider:
|
|
155 |
response.raise_for_status()
|
156 |
return response.json()["response"]
|
157 |
|
158 |
-
def
|
159 |
-
import openai
|
160 |
-
if self.api_key:
|
161 |
-
openai.api_key = self.api_key
|
162 |
-
|
163 |
-
response = openai.ChatCompletion.create(
|
164 |
-
model=model,
|
165 |
-
messages=[{"role": "user", "content": prompt}],
|
166 |
-
temperature=temperature,
|
167 |
-
max_tokens=max_tokens
|
168 |
-
)
|
169 |
-
return response.choices[0].message.content
|
170 |
-
|
171 |
-
def _generate_deepseek(self, prompt, model, temperature, max_tokens):
|
172 |
url = "https://api.deepseek.com/v1/chat/completions"
|
173 |
headers = {
|
174 |
"Authorization": f"Bearer {self.api_key}",
|
@@ -185,7 +208,6 @@ class LLMProvider:
|
|
185 |
response = requests.post(url, headers=headers, json=payload)
|
186 |
response.raise_for_status()
|
187 |
result = response.json()
|
188 |
-
# DeepSeek API returns: {"choices":[{"message":{"role":"assistant","content":"..."}}], ...}
|
189 |
return result["choices"][0]["message"]["content"]
|
190 |
|
191 |
def _generate_hf_local(self, prompt, model, temperature, max_tokens):
|
@@ -235,13 +257,17 @@ class LLMProvider:
|
|
235 |
|
236 |
# 4. Dataset Generation & Augmentation
|
237 |
def generate_new_samples(samples: List[DataSample], llm_provider: LLMProvider,
|
238 |
-
generation_type="augment", n_generate=1, custom_prompt=""):
|
239 |
"""
|
240 |
generation_type: 'augment', 'roleplay', 'topic_conditioning', 'self_critique'
|
|
|
241 |
"""
|
242 |
generated_samples = []
|
243 |
|
244 |
-
|
|
|
|
|
|
|
245 |
for _ in range(n_generate):
|
246 |
try:
|
247 |
if generation_type == "augment":
|
@@ -326,10 +352,10 @@ Format as JSON:
|
|
326 |
}}"""
|
327 |
|
328 |
else: # custom prompt
|
329 |
-
prompt = custom_prompt.format(**sample.
|
330 |
|
331 |
# Generate ด้วย LLM
|
332 |
-
response = llm_provider.generate(prompt)
|
333 |
|
334 |
# Parse JSON response
|
335 |
try:
|
@@ -426,11 +452,11 @@ def difficulty_assessment(samples: List[DataSample]) -> List[DataSample]:
|
|
426 |
|
427 |
return samples
|
428 |
|
429 |
-
def translate_to_multilingual(samples: List[DataSample], llm_provider: LLMProvider, target_lang="en") -> List[DataSample]:
|
430 |
"""Translate samples to target language"""
|
431 |
translated = []
|
432 |
|
433 |
-
for sample in samples[:
|
434 |
if sample.language == target_lang:
|
435 |
continue
|
436 |
|
@@ -453,7 +479,7 @@ Format as JSON:
|
|
453 |
"rationale": "translated rationale"
|
454 |
}}"""
|
455 |
|
456 |
-
response = llm_provider.generate(prompt)
|
457 |
|
458 |
# Parse JSON
|
459 |
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
@@ -480,9 +506,9 @@ Format as JSON:
|
|
480 |
|
481 |
return translated
|
482 |
|
483 |
-
def add_multiple_choice_options(samples: List[DataSample], llm_provider: LLMProvider) -> List[DataSample]:
|
484 |
"""Add multiple choice options to samples"""
|
485 |
-
for sample in samples[:
|
486 |
if sample.options: # มี options อยู่แล้ว
|
487 |
continue
|
488 |
|
@@ -502,7 +528,7 @@ Format as JSON array:
|
|
502 |
Make sure the correct answer ({sample.answer}) is included as one of the options.
|
503 |
"""
|
504 |
|
505 |
-
response = llm_provider.generate(prompt)
|
506 |
|
507 |
# Parse JSON array
|
508 |
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
@@ -518,28 +544,188 @@ Make sure the correct answer ({sample.answer}) is included as one of the options
|
|
518 |
return samples
|
519 |
|
520 |
# 6. Export & Visualization
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
521 |
def export_dataset(samples: List[DataSample], format_type="csv", output_path="output"):
|
522 |
"""Export dataset ในรูปแบบต่างๆ"""
|
523 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
524 |
|
525 |
if format_type == "csv":
|
526 |
-
df = pd.DataFrame([s.
|
527 |
filename = f"{output_path}_{timestamp}.csv"
|
528 |
df.to_csv(filename, index=False, encoding="utf-8-sig")
|
529 |
-
return filename
|
530 |
-
|
531 |
elif format_type == "jsonl":
|
532 |
filename = f"{output_path}_{timestamp}.jsonl"
|
533 |
with open(filename, 'w', encoding="utf-8") as f:
|
534 |
for sample in samples:
|
535 |
-
f.write(json.dumps(sample.
|
536 |
return filename
|
537 |
|
538 |
elif format_type == "hf_dataset":
|
539 |
# Create Hugging Face Dataset
|
540 |
-
data_dict = {key: [] for key in samples[0].
|
541 |
for sample in samples:
|
542 |
-
sample_dict = sample.
|
543 |
for key, value in sample_dict.items():
|
544 |
data_dict[key].append(value)
|
545 |
|
@@ -548,6 +734,26 @@ def export_dataset(samples: List[DataSample], format_type="csv", output_path="ou
|
|
548 |
dataset.save_to_disk(dirname)
|
549 |
return dirname
|
550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
else:
|
552 |
raise ValueError(f"ไม่รองรับรูปแบบ: {format_type}")
|
553 |
|
@@ -556,7 +762,7 @@ def get_dataset_stats(samples: List[DataSample]) -> Dict[str, Any]:
|
|
556 |
if not samples:
|
557 |
return {"total": 0}
|
558 |
|
559 |
-
df = pd.DataFrame([s.
|
560 |
|
561 |
stats = {
|
562 |
"total": len(samples),
|
@@ -575,7 +781,7 @@ def get_dataset_stats(samples: List[DataSample]) -> Dict[str, Any]:
|
|
575 |
|
576 |
# 7. Main Workflow Function
|
577 |
def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_url,
|
578 |
-
generation_type, n_generate, custom_prompt, target_language,
|
579 |
add_multiple_choice, export_format):
|
580 |
try:
|
581 |
progress_text = "เริ่มต้น workflow...\n"
|
@@ -591,16 +797,22 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
591 |
provider=llm_provider_type,
|
592 |
api_key=api_key if api_key else None,
|
593 |
base_url=base_url if base_url else "http://localhost:11434"
|
594 |
-
)
|
595 |
-
|
596 |
-
# 3. Generate new samples
|
597 |
if n_generate > 0:
|
598 |
-
progress_text += f"✨ กำลัง generate {n_generate} samples ใหม่ ({generation_type})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
599 |
if llm_provider_type == "huggingface":
|
600 |
with gr.Progress(track_tqdm=True, desc="กำลัง generate ด้วย Hugging Face..."):
|
601 |
-
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt)
|
602 |
else:
|
603 |
-
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt)
|
604 |
samples.extend(new_samples)
|
605 |
progress_text += f"✅ Generate สำเร็จ {len(new_samples)} samples ใหม่\n"
|
606 |
|
@@ -615,26 +827,27 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
615 |
|
616 |
samples = difficulty_assessment(samples)
|
617 |
progress_text += f" - ประเมิน difficulty เสร็จสิ้น\n"
|
618 |
-
|
619 |
-
# 5. Translation
|
620 |
if target_language and target_language != "none":
|
621 |
progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
|
|
|
622 |
if llm_provider_type == "huggingface":
|
623 |
with gr.Progress(track_tqdm=True, desc="กำลังแปลด้วย Hugging Face..."):
|
624 |
-
translated = translate_to_multilingual(samples, llm_provider, target_language)
|
625 |
else:
|
626 |
-
translated = translate_to_multilingual(samples, llm_provider, target_language)
|
627 |
samples.extend(translated)
|
628 |
progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
|
629 |
|
630 |
# 6. Add multiple choice
|
631 |
if add_multiple_choice:
|
632 |
progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
|
|
|
633 |
if llm_provider_type == "huggingface":
|
634 |
with gr.Progress(track_tqdm=True, desc="กำลังเพิ่มตัวเลือกด้วย Hugging Face..."):
|
635 |
-
samples = add_multiple_choice_options(samples, llm_provider)
|
636 |
else:
|
637 |
-
samples = add_multiple_choice_options(samples, llm_provider)
|
638 |
progress_text += "✅ เพิ่ม multiple choice เสร็จสิ้น\n"
|
639 |
|
640 |
# 7. Export
|
@@ -651,7 +864,7 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
651 |
progress_text += f" - Languages: {stats['languages']}\n"
|
652 |
progress_text += f" - มี Multiple Choice: {stats['with_options']}\n"
|
653 |
|
654 |
-
return progress_text, pd.DataFrame([s.
|
655 |
|
656 |
except Exception as e:
|
657 |
error_text = f"❌ เกิดข้อผิดพลาด: {str(e)}"
|
@@ -661,25 +874,41 @@ def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_ur
|
|
661 |
with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo:
|
662 |
gr.Markdown("# 🤖 ระบบ Generate Dataset จากโมเดล AI")
|
663 |
gr.Markdown("ระบบสำหรับสร้าง, ขยาย, และประมวลผล dataset ด้วย AI models")
|
664 |
-
|
665 |
with gr.Tab("📂 Dataset Input"):
|
666 |
with gr.Row():
|
667 |
source_type = gr.Radio(
|
668 |
-
["local", "hf"],
|
669 |
label="ประเภทแหล่งข้อมูล",
|
670 |
info="local = ไฟล์ในเครื่อง, hf = Hugging Face dataset",
|
671 |
value="local"
|
672 |
)
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
677 |
)
|
678 |
|
679 |
with gr.Tab("🤖 LLM Settings"):
|
680 |
with gr.Row():
|
681 |
llm_provider_type = gr.Dropdown(
|
682 |
-
["ollama", "
|
683 |
label="LLM Provider",
|
684 |
value="ollama",
|
685 |
info="เลือกผู้ให้บริการ LLM"
|
@@ -687,13 +916,74 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
687 |
api_key = gr.Textbox(
|
688 |
label="API Key (ถ้าจำเป็น)",
|
689 |
type="password",
|
690 |
-
placeholder="สำหรับ
|
691 |
)
|
|
|
692 |
base_url = gr.Textbox(
|
693 |
label="Base URL",
|
694 |
value="http://localhost:11434",
|
695 |
info="สำหรับ Ollama หรือ local LLM server"
|
696 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
697 |
|
698 |
with gr.Tab("✨ Generation Settings"):
|
699 |
with gr.Row():
|
@@ -708,6 +998,18 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
708 |
label="จำนวนรอบ Generate",
|
709 |
info="จำนวน samples ใหม่ที่จะสร้างต่อ original sample"
|
710 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
711 |
|
712 |
custom_prompt = gr.Textbox(
|
713 |
label="Custom Prompt (ถ้าเลือก custom)",
|
@@ -715,15 +1017,31 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
715 |
lines=3,
|
716 |
visible=False
|
717 |
)
|
718 |
-
|
719 |
def update_custom_prompt_visibility(gen_type):
|
720 |
return gr.update(visible=(gen_type == "custom"))
|
721 |
|
|
|
|
|
|
|
|
|
722 |
generation_type.change(
|
723 |
update_custom_prompt_visibility,
|
724 |
inputs=[generation_type],
|
725 |
outputs=[custom_prompt]
|
726 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
727 |
|
728 |
with gr.Tab("🔧 Post-processing"):
|
729 |
with gr.Row():
|
@@ -739,9 +1057,8 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
739 |
info="สร้างตัวเลือกผิดสำหรับทำ multiple choice"
|
740 |
)
|
741 |
|
742 |
-
with gr.Tab("💾 Export Settings"):
|
743 |
-
|
744 |
-
["csv", "jsonl", "hf_dataset"],
|
745 |
label="รูปแบบ Export",
|
746 |
value="csv",
|
747 |
info="รูปแบบไฟล์ที่ต้องการ export"
|
@@ -763,23 +1080,33 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
763 |
preview_output = gr.HTML(
|
764 |
label="ตัวอย่างข้อมูล (10 รายการแรก)"
|
765 |
)
|
766 |
-
|
767 |
-
# Event handlers
|
768 |
-
run_btn.click(
|
769 |
fn=main_workflow,
|
770 |
inputs=[
|
771 |
source_type, path_or_name, llm_provider_type, api_key, base_url,
|
772 |
-
generation_type, n_generate, custom_prompt, target_language,
|
773 |
add_multiple_choice, export_format
|
774 |
],
|
775 |
outputs=[progress_output, preview_output]
|
776 |
)
|
777 |
-
|
778 |
clear_btn.click(
|
779 |
lambda: ("", ""),
|
780 |
outputs=[progress_output, preview_output]
|
781 |
)
|
782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
783 |
# ตัวอย่าง dataset schema
|
784 |
with gr.Tab("📋 ตัวอย่าง Dataset Schema"):
|
785 |
gr.Markdown("""
|
@@ -803,11 +1130,22 @@ with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo
|
|
803 |
id,context,question,answer,rationale,category,difficulty,source,language
|
804 |
1,"นักเรียนคนหนึ่งเห็นเพื่อนทำโกง","ควรรายงานครูหรือไม่","ควรรายงาน","เพื่อความยุติธรรม","การศึกษา","medium","manual","th"
|
805 |
```
|
806 |
-
|
807 |
-
## ตัวอย่างไฟล์ JSONL:
|
808 |
```json
|
809 |
{"id": "1", "context": "นักเรียนคนหนึ่งเห็นเพื่อนทำโกง", "question": "ควรรายงานครูหรือไม่", "answer": "ควรรายงาน", "rationale": "เพื่อความยุติธรรม", "category": "การศึกษา", "difficulty": "medium", "source": "manual", "language": "th"}
|
810 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
811 |
""")
|
812 |
|
813 |
demo.launch()
|
|
|
48 |
with open(path_or_name, 'r', encoding="utf-8") as f:
|
49 |
raw_data = json.load(f)
|
50 |
data = raw_data if isinstance(raw_data, list) else [raw_data]
|
51 |
+
elif ext == ".parquet":
|
52 |
+
df = pd.read_parquet(path_or_name)
|
53 |
+
data = df.to_dict(orient="records")
|
54 |
+
elif os.path.isdir(path_or_name):
|
55 |
+
# โหลด HF Dataset ที่ save ไว้
|
56 |
+
try:
|
57 |
+
dataset = Dataset.load_from_disk(path_or_name)
|
58 |
+
data = [dict(item) for item in dataset]
|
59 |
+
except Exception as e:
|
60 |
+
raise ValueError(f"ไม่สามารถโหลด HF dataset จาก {path_or_name}: {e}")
|
61 |
else:
|
62 |
raise ValueError(f"ไม่รองรับไฟล์ประเภท {ext}")
|
63 |
|
64 |
# แปลงเป็น DataSample objects
|
65 |
+
def map_fields_to_datasample(item):
|
66 |
+
# Auto mapping: พยายาม map field ที่ขาดหาย
|
67 |
+
mapped = dict(item)
|
68 |
+
if 'context' not in mapped:
|
69 |
+
mapped['context'] = mapped.get('subject', '') or mapped.get('title', '') or ''
|
70 |
+
if 'category' not in mapped:
|
71 |
+
mapped['category'] = str(mapped.get('grade', '')) or mapped.get('category', '') or ''
|
72 |
+
if 'question' not in mapped:
|
73 |
+
mapped['question'] = mapped.get('question', '') or ''
|
74 |
+
if 'answer' not in mapped:
|
75 |
+
mapped['answer'] = mapped.get('answer', '') or ''
|
76 |
+
if 'rationale' not in mapped:
|
77 |
+
mapped['rationale'] = mapped.get('rationale', '') or ''
|
78 |
+
if 'options' not in mapped:
|
79 |
+
mapped['options'] = mapped.get('options', None)
|
80 |
+
if 'id' not in mapped:
|
81 |
+
mapped['id'] = str(uuid.uuid4())
|
82 |
+
if 'source' not in mapped:
|
83 |
+
mapped['source'] = f"local_{os.path.basename(path_or_name)}"
|
84 |
+
if 'difficulty' not in mapped:
|
85 |
+
mapped['difficulty'] = "medium"
|
86 |
+
if 'language' not in mapped:
|
87 |
+
mapped['language'] = "th"
|
88 |
+
return mapped
|
89 |
+
|
90 |
samples = []
|
91 |
for i, item in enumerate(data):
|
92 |
try:
|
93 |
+
mapped_item = map_fields_to_datasample(item)
|
94 |
+
samples.append(DataSample(**mapped_item))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
except ValidationError as e:
|
96 |
print(f"Warning: รายการที่ {i+1} ข้อมูลไม่ถูกต้อง: {e}")
|
97 |
continue
|
|
|
142 |
raise Exception(f"ข้อผิดพลาดในการโหลดข้อมูล: {e}")
|
143 |
|
144 |
# 3. LLM API Integration (รองรับหลาย provider)
|
145 |
+
def get_ollama_models(base_url="http://localhost:11434"):
|
146 |
+
"""ดึงรายชื่อ models จาก Ollama"""
|
147 |
+
try:
|
148 |
+
response = requests.get(f"{base_url}/api/tags")
|
149 |
+
response.raise_for_status()
|
150 |
+
data = response.json()
|
151 |
+
models = [model["name"] for model in data.get("models", [])]
|
152 |
+
return models if models else ["llama3.2"] # fallback
|
153 |
+
except Exception as e:
|
154 |
+
print(f"Warning: ไม่สามารถดึงรายชื่อ models จาก Ollama: {e}")
|
155 |
+
return ["llama3.2", "llama3.1", "gemma2", "qwen2.5"] # default models
|
156 |
+
|
157 |
class LLMProvider:
|
158 |
def __init__(self, provider="ollama", api_key=None, base_url="http://localhost:11434"):
|
159 |
self.provider = provider
|
|
|
164 |
try:
|
165 |
if self.provider == "ollama":
|
166 |
return self._generate_ollama(prompt, model, temperature, max_tokens)
|
|
|
|
|
|
|
|
|
167 |
elif self.provider == "deepseek":
|
168 |
return self._generate_deepseek(prompt, model, temperature, max_tokens)
|
169 |
+
elif self.provider == "huggingface":
|
170 |
+
return self._generate_huggingface(prompt, model, temperature, max_tokens)
|
171 |
elif self.provider == "hf_local":
|
172 |
return self._generate_hf_local(prompt, model, temperature, max_tokens)
|
173 |
else:
|
|
|
191 |
response.raise_for_status()
|
192 |
return response.json()["response"]
|
193 |
|
194 |
+
def _generate_deepseek(self, prompt, model="deepseek-chat", temperature=0.7, max_tokens=1000):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
url = "https://api.deepseek.com/v1/chat/completions"
|
196 |
headers = {
|
197 |
"Authorization": f"Bearer {self.api_key}",
|
|
|
208 |
response = requests.post(url, headers=headers, json=payload)
|
209 |
response.raise_for_status()
|
210 |
result = response.json()
|
|
|
211 |
return result["choices"][0]["message"]["content"]
|
212 |
|
213 |
def _generate_hf_local(self, prompt, model, temperature, max_tokens):
|
|
|
257 |
|
258 |
# 4. Dataset Generation & Augmentation
|
259 |
def generate_new_samples(samples: List[DataSample], llm_provider: LLMProvider,
|
260 |
+
generation_type="augment", n_generate=1, custom_prompt="", model="llama3.2", max_samples_to_process=5):
|
261 |
"""
|
262 |
generation_type: 'augment', 'roleplay', 'topic_conditioning', 'self_critique'
|
263 |
+
max_samples_to_process: จำนวน samples เดิมที่จะใช้ในการ generate
|
264 |
"""
|
265 |
generated_samples = []
|
266 |
|
267 |
+
# จำกัดจำนวน samples ตามที่ผู้ใช้เลือก
|
268 |
+
samples_to_use = samples[:max_samples_to_process]
|
269 |
+
|
270 |
+
for sample in samples_to_use:
|
271 |
for _ in range(n_generate):
|
272 |
try:
|
273 |
if generation_type == "augment":
|
|
|
352 |
}}"""
|
353 |
|
354 |
else: # custom prompt
|
355 |
+
prompt = custom_prompt.format(**sample.model_dump())
|
356 |
|
357 |
# Generate ด้วย LLM
|
358 |
+
response = llm_provider.generate(prompt, model=model)
|
359 |
|
360 |
# Parse JSON response
|
361 |
try:
|
|
|
452 |
|
453 |
return samples
|
454 |
|
455 |
+
def translate_to_multilingual(samples: List[DataSample], llm_provider: LLMProvider, target_lang="en", model="llama3.2", max_samples=3) -> List[DataSample]:
|
456 |
"""Translate samples to target language"""
|
457 |
translated = []
|
458 |
|
459 |
+
for sample in samples[:max_samples]: # จำกัดตามที่ระบุ
|
460 |
if sample.language == target_lang:
|
461 |
continue
|
462 |
|
|
|
479 |
"rationale": "translated rationale"
|
480 |
}}"""
|
481 |
|
482 |
+
response = llm_provider.generate(prompt, model=model)
|
483 |
|
484 |
# Parse JSON
|
485 |
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
|
|
506 |
|
507 |
return translated
|
508 |
|
509 |
+
def add_multiple_choice_options(samples: List[DataSample], llm_provider: LLMProvider, model="llama3.2", max_samples=3) -> List[DataSample]:
|
510 |
"""Add multiple choice options to samples"""
|
511 |
+
for sample in samples[:max_samples]: # จำกัดตามที่ระบุ
|
512 |
if sample.options: # มี options อยู่แล้ว
|
513 |
continue
|
514 |
|
|
|
528 |
Make sure the correct answer ({sample.answer}) is included as one of the options.
|
529 |
"""
|
530 |
|
531 |
+
response = llm_provider.generate(prompt, model=model)
|
532 |
|
533 |
# Parse JSON array
|
534 |
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
|
|
544 |
return samples
|
545 |
|
546 |
# 6. Export & Visualization
|
547 |
+
def preview_data(source_type, path_or_name, file_upload):
|
548 |
+
"""Preview dataset before processing"""
|
549 |
+
try:
|
550 |
+
# ใช้ไฟล์ที่อัปโหลดถ้ามี หรือใช้ path ที่กรอก
|
551 |
+
file_path = file_upload.name if file_upload else path_or_name
|
552 |
+
|
553 |
+
if source_type == "local":
|
554 |
+
if not file_path:
|
555 |
+
return gr.update(visible=False), "กรุณาเลือกไฟล์หรือใส่ path"
|
556 |
+
|
557 |
+
if not os.path.exists(file_path):
|
558 |
+
return gr.update(visible=False), f"ไม่พบไฟล์: {file_path}"
|
559 |
+
|
560 |
+
ext = os.path.splitext(file_path)[-1].lower()
|
561 |
+
|
562 |
+
if ext == ".csv":
|
563 |
+
df = pd.read_csv(file_path, encoding="utf-8")
|
564 |
+
preview_html = f"""
|
565 |
+
<div style="margin: 10px 0;">
|
566 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
567 |
+
<p><strong>จำนวนแถว:</strong> {len(df)} | <strong>จำนวนคอลัมน์:</strong> {len(df.columns)}</p>
|
568 |
+
<p><strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
569 |
+
<h5>ตัวอย่างข้อมูล (5 แถวแรก):</h5>
|
570 |
+
{df.head().to_html(classes='table table-striped', escape=False)}
|
571 |
+
</div>
|
572 |
+
"""
|
573 |
+
return gr.update(visible=True, value=preview_html), ""
|
574 |
+
|
575 |
+
elif ext == ".jsonl":
|
576 |
+
data = []
|
577 |
+
with open(file_path, 'r', encoding="utf-8") as f:
|
578 |
+
for i, line in enumerate(f):
|
579 |
+
if i >= 5: # แสดงแค่ 5 บรรทัดแรก
|
580 |
+
break
|
581 |
+
try:
|
582 |
+
data.append(json.loads(line.strip()))
|
583 |
+
except json.JSONDecodeError:
|
584 |
+
continue
|
585 |
+
|
586 |
+
if data:
|
587 |
+
df = pd.DataFrame(data)
|
588 |
+
total_lines = sum(1 for _ in open(file_path, 'r', encoding="utf-8"))
|
589 |
+
preview_html = f"""
|
590 |
+
<div style="margin: 10px 0;">
|
591 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
592 |
+
<p><strong>จำนวนบรรทัด:</strong> {total_lines} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
593 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
594 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
595 |
+
</div>
|
596 |
+
"""
|
597 |
+
return gr.update(visible=True, value=preview_html), ""
|
598 |
+
else:
|
599 |
+
return gr.update(visible=False), "ไม่สามารถอ่านข้อมูลจากไฟล์ JSONL"
|
600 |
+
|
601 |
+
elif ext == ".json":
|
602 |
+
with open(file_path, 'r', encoding="utf-8") as f:
|
603 |
+
data = json.load(f)
|
604 |
+
|
605 |
+
if isinstance(data, list):
|
606 |
+
df = pd.DataFrame(data[:5]) # แสดงแค่ 5 รายการแรก
|
607 |
+
preview_html = f"""
|
608 |
+
<div style="margin: 10px 0;">
|
609 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
610 |
+
<p><strong>จำนวนรายการ:</strong> {len(data)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
611 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
612 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
613 |
+
</div>
|
614 |
+
"""
|
615 |
+
else:
|
616 |
+
# Single object
|
617 |
+
df = pd.DataFrame([data])
|
618 |
+
preview_html = f"""
|
619 |
+
<div style="margin: 10px 0;">
|
620 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
621 |
+
<p><strong>ประเภท:</strong> Object เดียว | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
622 |
+
<h5>ข้อมูล:</h5>
|
623 |
+
{df.to_html(classes='table table-striped', escape=False)} </div>
|
624 |
+
"""
|
625 |
+
return gr.update(visible=True, value=preview_html), ""
|
626 |
+
elif ext == ".parquet":
|
627 |
+
df = pd.read_parquet(file_path)
|
628 |
+
preview_html = f"""
|
629 |
+
<div style="margin: 10px 0;">
|
630 |
+
<h4>📄 ไฟล์: {os.path.basename(file_path)}</h4>
|
631 |
+
<p><strong>จำนวนแถว:</strong> {len(df)} | <strong>จำนวนคอลัมน์:</strong> {len(df.columns)}</p>
|
632 |
+
<p><strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
633 |
+
<h5>ตัวอย่างข้อมูล (5 แถวแรก):</h5>
|
634 |
+
{df.head().to_html(classes='table table-striped', escape=False)}
|
635 |
+
</div>
|
636 |
+
"""
|
637 |
+
return gr.update(visible=True, value=preview_html), ""
|
638 |
+
elif os.path.isdir(file_path):
|
639 |
+
# ตรวจสอบว่าเป็น HF dataset directory หรือไม่
|
640 |
+
if os.path.exists(os.path.join(file_path, "dataset_info.json")):
|
641 |
+
try:
|
642 |
+
dataset = Dataset.load_from_disk(file_path)
|
643 |
+
sample_data = [dict(item) for i, item in enumerate(dataset) if i < 5]
|
644 |
+
df = pd.DataFrame(sample_data)
|
645 |
+
|
646 |
+
preview_html = f"""
|
647 |
+
<div style="margin: 10px 0;">
|
648 |
+
<h4>📁 HF Dataset Directory: {os.path.basename(file_path)}</h4>
|
649 |
+
<p><strong>จำนวนรายการ:</strong> {len(dataset)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
650 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
651 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
652 |
+
</div>
|
653 |
+
"""
|
654 |
+
return gr.update(visible=True, value=preview_html), ""
|
655 |
+
except Exception as e:
|
656 |
+
return gr.update(visible=False), f"ไม่สามารถโหลด HF dataset: {str(e)}"
|
657 |
+
else:
|
658 |
+
return gr.update(visible=False), f"ไม่ใช่ HF dataset directory ที่ถูกต้อง"
|
659 |
+
else:
|
660 |
+
return gr.update(visible=False), f"ไม่รองรับไฟล์ประเภท {ext}"
|
661 |
+
|
662 |
+
elif source_type == "hf":
|
663 |
+
if not path_or_name:
|
664 |
+
return gr.update(visible=False), "กรุณาใส่ชื่อ dataset จาก Hugging Face"
|
665 |
+
|
666 |
+
# Preview HF dataset
|
667 |
+
try:
|
668 |
+
ds = load_dataset(path_or_name)
|
669 |
+
available_splits = list(ds.keys())
|
670 |
+
split_name = available_splits[0]
|
671 |
+
data = ds[split_name]
|
672 |
+
|
673 |
+
# แปลงตัวอย่างเป็น DataFrame
|
674 |
+
sample_data = []
|
675 |
+
for i, item in enumerate(data):
|
676 |
+
if i >= 5: # แสดงแค่ 5 รายการแรก
|
677 |
+
break
|
678 |
+
sample_data.append(dict(item))
|
679 |
+
|
680 |
+
if sample_data:
|
681 |
+
df = pd.DataFrame(sample_data)
|
682 |
+
preview_html = f"""
|
683 |
+
<div style="margin: 10px 0;">
|
684 |
+
<h4>🤗 Hugging Face Dataset: {path_or_name}</h4>
|
685 |
+
<p><strong>Split:</strong> {split_name} | <strong>จำนวนรายการ:</strong> {len(data)} | <strong>คอลัมน์:</strong> {', '.join(df.columns.tolist())}</p>
|
686 |
+
<h5>ตัวอย่างข้อมูล (5 รายการแรก):</h5>
|
687 |
+
{df.to_html(classes='table table-striped', escape=False)}
|
688 |
+
</div>
|
689 |
+
"""
|
690 |
+
return gr.update(visible=True, value=preview_html), ""
|
691 |
+
else:
|
692 |
+
return gr.update(visible=False), "Dataset ว่างเปล่า"
|
693 |
+
|
694 |
+
except Exception as e:
|
695 |
+
return gr.update(visible=False), f"ไม่สามารถโหลด HF dataset: {str(e)}"
|
696 |
+
|
697 |
+
return gr.update(visible=False), "กรุณาเลือกประเภทข้อมูล"
|
698 |
+
|
699 |
+
except Exception as e:
|
700 |
+
return gr.update(visible=False), f"เกิดข้อผิดพลาด: {str(e)}"
|
701 |
+
|
702 |
+
def update_path_from_file(file_upload):
|
703 |
+
"""อัปเดต path เมื่อมีการเลือกไฟล์"""
|
704 |
+
if file_upload:
|
705 |
+
return file_upload.name
|
706 |
+
return ""
|
707 |
+
|
708 |
def export_dataset(samples: List[DataSample], format_type="csv", output_path="output"):
|
709 |
"""Export dataset ในรูปแบบต่างๆ"""
|
710 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
711 |
|
712 |
if format_type == "csv":
|
713 |
+
df = pd.DataFrame([s.model_dump() for s in samples])
|
714 |
filename = f"{output_path}_{timestamp}.csv"
|
715 |
df.to_csv(filename, index=False, encoding="utf-8-sig")
|
716 |
+
return filename
|
|
|
717 |
elif format_type == "jsonl":
|
718 |
filename = f"{output_path}_{timestamp}.jsonl"
|
719 |
with open(filename, 'w', encoding="utf-8") as f:
|
720 |
for sample in samples:
|
721 |
+
f.write(json.dumps(sample.model_dump(), ensure_ascii=False) + "\n")
|
722 |
return filename
|
723 |
|
724 |
elif format_type == "hf_dataset":
|
725 |
# Create Hugging Face Dataset
|
726 |
+
data_dict = {key: [] for key in samples[0].model_dump().keys()}
|
727 |
for sample in samples:
|
728 |
+
sample_dict = sample.model_dump()
|
729 |
for key, value in sample_dict.items():
|
730 |
data_dict[key].append(value)
|
731 |
|
|
|
734 |
dataset.save_to_disk(dirname)
|
735 |
return dirname
|
736 |
|
737 |
+
elif format_type == "parquet":
|
738 |
+
# Export เป็น Parquet format
|
739 |
+
df = pd.DataFrame([s.model_dump() for s in samples])
|
740 |
+
filename = f"{output_path}_{timestamp}.parquet"
|
741 |
+
df.to_parquet(filename, index=False, engine='pyarrow')
|
742 |
+
return filename
|
743 |
+
|
744 |
+
elif format_type == "hf_dataset_parquet":
|
745 |
+
# Create Hugging Face Dataset และ save เป็น Parquet
|
746 |
+
data_dict = {key: [] for key in samples[0].model_dump().keys()}
|
747 |
+
for sample in samples:
|
748 |
+
sample_dict = sample.model_dump()
|
749 |
+
for key, value in sample_dict.items():
|
750 |
+
data_dict[key].append(value)
|
751 |
+
|
752 |
+
dataset = Dataset.from_dict(data_dict)
|
753 |
+
filename = f"{output_path}_{timestamp}.parquet"
|
754 |
+
dataset.to_parquet(filename)
|
755 |
+
return filename
|
756 |
+
|
757 |
else:
|
758 |
raise ValueError(f"ไม่รองรับรูปแบบ: {format_type}")
|
759 |
|
|
|
762 |
if not samples:
|
763 |
return {"total": 0}
|
764 |
|
765 |
+
df = pd.DataFrame([s.model_dump() for s in samples])
|
766 |
|
767 |
stats = {
|
768 |
"total": len(samples),
|
|
|
781 |
|
782 |
# 7. Main Workflow Function
|
783 |
def main_workflow(source_type, path_or_name, llm_provider_type, api_key, base_url,
|
784 |
+
ollama_model, deepseek_model, generation_type, n_generate, max_samples_to_process, custom_prompt, target_language,
|
785 |
add_multiple_choice, export_format):
|
786 |
try:
|
787 |
progress_text = "เริ่มต้น workflow...\n"
|
|
|
797 |
provider=llm_provider_type,
|
798 |
api_key=api_key if api_key else None,
|
799 |
base_url=base_url if base_url else "http://localhost:11434"
|
800 |
+
) # 3. Generate new samples
|
|
|
|
|
801 |
if n_generate > 0:
|
802 |
+
progress_text += f"✨ กำลัง generate {n_generate} samples ใหม่ ({generation_type}) จาก {min(max_samples_to_process, len(samples))} samples เดิม...\n"
|
803 |
+
# เลือกโมเดลที่เหมาะสม
|
804 |
+
if llm_provider_type == "ollama":
|
805 |
+
model_name = ollama_model
|
806 |
+
elif llm_provider_type == "deepseek":
|
807 |
+
model_name = deepseek_model
|
808 |
+
else:
|
809 |
+
model_name = "deepseek-chat" # default for other providers
|
810 |
+
|
811 |
if llm_provider_type == "huggingface":
|
812 |
with gr.Progress(track_tqdm=True, desc="กำลัง generate ด้วย Hugging Face..."):
|
813 |
+
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process)
|
814 |
else:
|
815 |
+
new_samples = generate_new_samples(samples, llm_provider, generation_type, n_generate, custom_prompt, model_name, max_samples_to_process)
|
816 |
samples.extend(new_samples)
|
817 |
progress_text += f"✅ Generate สำเร็จ {len(new_samples)} samples ใหม่\n"
|
818 |
|
|
|
827 |
|
828 |
samples = difficulty_assessment(samples)
|
829 |
progress_text += f" - ประเมิน difficulty เสร็จสิ้น\n"
|
830 |
+
# 5. Translation
|
|
|
831 |
if target_language and target_language != "none":
|
832 |
progress_text += f"🌐 กำลังแปลเป็น {target_language}...\n"
|
833 |
+
max_translate_samples = min(10, len(samples)) # จำกัดการแปลไม่เกิน 10 samples
|
834 |
if llm_provider_type == "huggingface":
|
835 |
with gr.Progress(track_tqdm=True, desc="กำลังแปลด้วย Hugging Face..."):
|
836 |
+
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
837 |
else:
|
838 |
+
translated = translate_to_multilingual(samples, llm_provider, target_language, model_name, max_translate_samples)
|
839 |
samples.extend(translated)
|
840 |
progress_text += f"✅ แปลภาษาสำเร็จ {len(translated)} samples\n"
|
841 |
|
842 |
# 6. Add multiple choice
|
843 |
if add_multiple_choice:
|
844 |
progress_text += "📝 กำลังเพิ่ม multiple choice options...\n"
|
845 |
+
max_mc_samples = min(10, len(samples)) # จำกัดการสร้าง multiple choice ไม่เกิน 10 samples
|
846 |
if llm_provider_type == "huggingface":
|
847 |
with gr.Progress(track_tqdm=True, desc="กำลังเพิ่มตัวเลือกด้วย Hugging Face..."):
|
848 |
+
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
849 |
else:
|
850 |
+
samples = add_multiple_choice_options(samples, llm_provider, model_name, max_mc_samples)
|
851 |
progress_text += "✅ เพิ่ม multiple choice เสร็จสิ้น\n"
|
852 |
|
853 |
# 7. Export
|
|
|
864 |
progress_text += f" - Languages: {stats['languages']}\n"
|
865 |
progress_text += f" - มี Multiple Choice: {stats['with_options']}\n"
|
866 |
|
867 |
+
return progress_text, pd.DataFrame([s.model_dump() for s in samples]).head(10).to_html()
|
868 |
|
869 |
except Exception as e:
|
870 |
error_text = f"❌ เกิดข้อผิดพลาด: {str(e)}"
|
|
|
874 |
with gr.Blocks(title="Dataset Generator System", theme=gr.themes.Soft()) as demo:
|
875 |
gr.Markdown("# 🤖 ระบบ Generate Dataset จากโมเดล AI")
|
876 |
gr.Markdown("ระบบสำหรับสร้าง, ขยาย, และประมวลผล dataset ด้วย AI models")
|
|
|
877 |
with gr.Tab("📂 Dataset Input"):
|
878 |
with gr.Row():
|
879 |
source_type = gr.Radio(
|
880 |
+
["local", "hf"],
|
881 |
label="ประเภทแหล่งข้อมูล",
|
882 |
info="local = ไฟล์ในเครื่อง, hf = Hugging Face dataset",
|
883 |
value="local"
|
884 |
)
|
885 |
+
|
886 |
+
with gr.Row():
|
887 |
+
with gr.Column(scale=3): path_or_name = gr.Textbox(
|
888 |
+
label="Path หรือ Dataset Name",
|
889 |
+
placeholder="เช่น data.csv, data.parquet, output_hf_xxxx/ หรือ microsoft/DialoGPT-medium",
|
890 |
+
info="สำหรับ local: ใส่ path ไฟล์ (.csv, .jsonl, .json, .parquet) หรือ HF dataset directory / สำหรับ HF: ใส่ชื่อ dataset"
|
891 |
+
)
|
892 |
+
with gr.Column(scale=1): file_upload = gr.File(
|
893 |
+
label="หรือเลือกไฟล์",
|
894 |
+
file_types=[".csv", ".jsonl", ".json", ".parquet"],
|
895 |
+
visible=True
|
896 |
+
)
|
897 |
+
|
898 |
+
# Preview section
|
899 |
+
with gr.Row():
|
900 |
+
preview_btn = gr.Button("🔍 ดูตัวอย่างข้อมูล", variant="secondary")
|
901 |
+
|
902 |
+
with gr.Row():
|
903 |
+
data_preview = gr.HTML(
|
904 |
+
label="ตัวอย่างข้อมูล",
|
905 |
+
visible=False
|
906 |
)
|
907 |
|
908 |
with gr.Tab("🤖 LLM Settings"):
|
909 |
with gr.Row():
|
910 |
llm_provider_type = gr.Dropdown(
|
911 |
+
["ollama", "deepseek", "huggingface", "hf_local"],
|
912 |
label="LLM Provider",
|
913 |
value="ollama",
|
914 |
info="เลือกผู้ให้บริการ LLM"
|
|
|
916 |
api_key = gr.Textbox(
|
917 |
label="API Key (ถ้าจำเป็น)",
|
918 |
type="password",
|
919 |
+
placeholder="สำหรับ DeepSeek หรือ HuggingFace"
|
920 |
)
|
921 |
+
with gr.Row():
|
922 |
base_url = gr.Textbox(
|
923 |
label="Base URL",
|
924 |
value="http://localhost:11434",
|
925 |
info="สำหรับ Ollama หรือ local LLM server"
|
926 |
)
|
927 |
+
|
928 |
+
with gr.Row():
|
929 |
+
# Get available models and set appropriate default
|
930 |
+
available_models = get_ollama_models()
|
931 |
+
default_model = available_models[0] if available_models else "llama3.2"
|
932 |
+
|
933 |
+
ollama_model = gr.Dropdown(
|
934 |
+
choices=available_models,
|
935 |
+
label="Ollama Model",
|
936 |
+
value=default_model,
|
937 |
+
visible=True,
|
938 |
+
allow_custom_value=True,
|
939 |
+
info="เลือก model จาก Ollama"
|
940 |
+
)
|
941 |
+
|
942 |
+
deepseek_model = gr.Dropdown(
|
943 |
+
choices=["deepseek-chat", "deepseek-reasoner"],
|
944 |
+
label="DeepSeek Model",
|
945 |
+
value="deepseek-chat",
|
946 |
+
visible=False,
|
947 |
+
info="deepseek-chat = DeepSeek-V3-0324, deepseek-reasoner = DeepSeek-R1-0528"
|
948 |
+
)
|
949 |
+
|
950 |
+
refresh_models_btn = gr.Button(
|
951 |
+
"🔄 รีเฟรช Models",
|
952 |
+
size="sm",
|
953 |
+
visible=True
|
954 |
+
)
|
955 |
+
# ฟังก์ชันสำหรับรีเฟรช models
|
956 |
+
def refresh_ollama_models(base_url_val):
    """Re-query the Ollama server and refresh the model dropdown.

    Returns a Gradio update with the server's model list selected on its
    first entry; falls back to a single "llama3.2" choice when the server
    reports no models or the request fails.
    """
    try:
        models = get_ollama_models(base_url_val)
        if not models:
            # Server reachable but empty — keep the UI usable with a default.
            return gr.update(choices=["llama3.2"], value="llama3.2")
        return gr.update(choices=models, value=models[0])
    except Exception as err:
        # Network/server failure: log and fall back so the dropdown stays valid.
        print(f"Error refreshing models: {err}")
        return gr.update(choices=["llama3.2"], value="llama3.2")
|
966 |
+
# ฟังก์ชันสำหรับแสดง/ซ่อน model dropdown ตามผู้ให้บริการ
|
967 |
+
def update_model_visibility(provider):
    """Show or hide provider-specific widgets when the LLM provider changes.

    Returns updates for (ollama_model, deepseek_model, refresh_models_btn):
    the Ollama dropdown and its refresh button are visible only for
    "ollama"; the DeepSeek dropdown only for "deepseek".
    """
    flags = [
        provider == "ollama",    # ollama_model dropdown
        provider == "deepseek",  # deepseek_model dropdown
        provider == "ollama",    # refresh button accompanies the Ollama dropdown
    ]
    return tuple(gr.update(visible=flag) for flag in flags)
|
975 |
+
|
976 |
+
# Event handlers
|
977 |
+
refresh_models_btn.click(
|
978 |
+
fn=refresh_ollama_models,
|
979 |
+
inputs=[base_url],
|
980 |
+
outputs=[ollama_model]
|
981 |
+
)
|
982 |
+
llm_provider_type.change(
|
983 |
+
fn=update_model_visibility,
|
984 |
+
inputs=[llm_provider_type],
|
985 |
+
outputs=[ollama_model, deepseek_model, refresh_models_btn]
|
986 |
+
)
|
987 |
|
988 |
with gr.Tab("✨ Generation Settings"):
|
989 |
with gr.Row():
|
|
|
998 |
label="จำนวนรอบ Generate",
|
999 |
info="จำนวน samples ใหม่ที่จะสร้างต่อ original sample"
|
1000 |
)
|
1001 |
+
with gr.Row():
|
1002 |
+
max_samples_to_process = gr.Slider(
|
1003 |
+
1, 50, value=5, step=1,
|
1004 |
+
label="จำนวน Samples เดิมที่จะใช้ Generate",
|
1005 |
+
info="เลือกจำนวน samples จากข้อมูลเดิมที่จะใช้สร้างข้อมูลใหม่"
|
1006 |
+
)
|
1007 |
+
total_new_samples = gr.Number(
|
1008 |
+
label="รวมจำนวน Samples ใหม่ที่คาดว่าจะได้",
|
1009 |
+
value=5,
|
1010 |
+
interactive=False,
|
1011 |
+
info="คำนวณจาก: จำนวน samples เดิม × จำนวนรอบ generate"
|
1012 |
+
)
|
1013 |
|
1014 |
custom_prompt = gr.Textbox(
|
1015 |
label="Custom Prompt (ถ้าเลือก custom)",
|
|
|
1017 |
lines=3,
|
1018 |
visible=False
|
1019 |
)
|
|
|
1020 |
def update_custom_prompt_visibility(gen_type):
    """Reveal the custom-prompt textbox only when generation type is "custom"."""
    is_custom = gen_type == "custom"
    return gr.update(visible=is_custom)
|
1022 |
|
1023 |
+
def update_total_samples_calculation(max_samples, n_gen):
    """Recompute the expected count of newly generated samples.

    Expected total = (original samples used) x (generation rounds per
    sample); pushed into the read-only "total" number widget.
    """
    return gr.update(value=max_samples * n_gen)
|
1026 |
+
|
1027 |
generation_type.change(
|
1028 |
update_custom_prompt_visibility,
|
1029 |
inputs=[generation_type],
|
1030 |
outputs=[custom_prompt]
|
1031 |
)
|
1032 |
+
|
1033 |
+
# อัปเดตการคำนวณจำนวน samples ใหม่
|
1034 |
+
max_samples_to_process.change(
|
1035 |
+
update_total_samples_calculation,
|
1036 |
+
inputs=[max_samples_to_process, n_generate],
|
1037 |
+
outputs=[total_new_samples]
|
1038 |
+
)
|
1039 |
+
|
1040 |
+
n_generate.change(
|
1041 |
+
update_total_samples_calculation,
|
1042 |
+
inputs=[max_samples_to_process, n_generate],
|
1043 |
+
outputs=[total_new_samples]
|
1044 |
+
)
|
1045 |
|
1046 |
with gr.Tab("🔧 Post-processing"):
|
1047 |
with gr.Row():
|
|
|
1057 |
info="สร้างตัวเลือกผิดสำหรับทำ multiple choice"
|
1058 |
)
|
1059 |
|
1060 |
+
with gr.Tab("💾 Export Settings"): export_format = gr.Dropdown(
|
1061 |
+
["csv", "jsonl", "hf_dataset", "parquet", "hf_dataset_parquet"],
|
|
|
1062 |
label="รูปแบบ Export",
|
1063 |
value="csv",
|
1064 |
info="รูปแบบไฟล์ที่ต้องการ export"
|
|
|
1080 |
preview_output = gr.HTML(
|
1081 |
label="ตัวอย่างข้อมูล (10 รายการแรก)"
|
1082 |
)
|
1083 |
+
# Event handlers run_btn.click(
|
|
|
|
|
1084 |
fn=main_workflow,
|
1085 |
inputs=[
|
1086 |
source_type, path_or_name, llm_provider_type, api_key, base_url,
|
1087 |
+
ollama_model, deepseek_model, generation_type, n_generate, max_samples_to_process, custom_prompt, target_language,
|
1088 |
add_multiple_choice, export_format
|
1089 |
],
|
1090 |
outputs=[progress_output, preview_output]
|
1091 |
)
|
|
|
1092 |
clear_btn.click(
|
1093 |
lambda: ("", ""),
|
1094 |
outputs=[progress_output, preview_output]
|
1095 |
)
|
1096 |
|
1097 |
+
# Preview event handlers
|
1098 |
+
preview_btn.click(
|
1099 |
+
fn=preview_data,
|
1100 |
+
inputs=[source_type, path_or_name, file_upload],
|
1101 |
+
outputs=[data_preview, progress_output]
|
1102 |
+
)
|
1103 |
+
|
1104 |
+
file_upload.upload(
|
1105 |
+
fn=update_path_from_file,
|
1106 |
+
inputs=[file_upload],
|
1107 |
+
outputs=[path_or_name]
|
1108 |
+
)
|
1109 |
+
|
1110 |
# ตัวอย่าง dataset schema
|
1111 |
with gr.Tab("📋 ตัวอย่าง Dataset Schema"):
|
1112 |
gr.Markdown("""
|
|
|
1130 |
id,context,question,answer,rationale,category,difficulty,source,language
|
1131 |
1,"นักเรียนคนหนึ่งเห็นเพื่อนทำโกง","ควรรายงานครูหรือไม่","ควรรายงาน","เพื่อความยุติธรรม","การศึกษา","medium","manual","th"
|
1132 |
```
|
1133 |
+
## ตัวอย่างไฟล์ JSONL:
|
|
|
1134 |
```json
|
1135 |
{"id": "1", "context": "นักเรียนคนหนึ่งเห็นเพื่อนทำโกง", "question": "ควรรายงานครูหรือไม่", "answer": "ควรรายงาน", "rationale": "เพื่อความยุติธรรม", "category": "การศึกษา", "difficulty": "medium", "source": "manual", "language": "th"}
|
1136 |
```
|
1137 |
+
|
1138 |
+
## รูปแบบ Export ที่รองรับ:
|
1139 |
+
- **CSV**: ไฟล์ Excel/Spreadsheet ทั่วไป
|
1140 |
+
- **JSONL**: JSON Lines สำหรับ machine learning
|
1141 |
+
- **Parquet**: รูปแบบคอลัมน์ที่มีประสิทธิภาพสูง
|
1142 |
+
- **HF Dataset**: Hugging Face Dataset directory (Arrow format)
|
1143 |
+
- **HF Dataset Parquet**: Hugging Face Dataset เป็น Parquet
|
1144 |
+
|
1145 |
+
## การโหลด Dataset ที่สร้างแล้ว:
|
1146 |
+
- สามารถโหลด output ที่สร้างแล้วกลับมาใช้ได้
|
1147 |
+
- รองรับ `.csv`, `.jsonl`, `.json`, `.parquet` และ HF dataset directories
|
1148 |
+
- ใส่ path ของไฟล์หรือ directory ใน "Path หรือ Dataset Name"
|
1149 |
""")
|
1150 |
|
1151 |
demo.launch()
|
requirements.txt
CHANGED
@@ -3,5 +3,7 @@ pandas>=1.5.0
|
|
3 |
datasets>=2.0.0
|
4 |
pydantic>=2.0.0
|
5 |
requests>=2.28.0
|
6 |
-
|
|
|
7 |
huggingface-hub>=0.16.0
|
|
|
|
3 |
datasets>=2.0.0
|
4 |
pydantic>=2.0.0
|
5 |
requests>=2.28.0
|
6 |
+
transformers>=4.20.0
|
7 |
+
torch>=1.12.0
|
8 |
huggingface-hub>=0.16.0
|
9 |
+
pyarrow>=10.0.0
|