SmolFactory / data.py
Tonic's picture
adds datasets splits
231fcd0 verified
raw
history blame
10.7 kB
"""
SmolLM3 Dataset Handler
Handles data loading, preprocessing, and tokenization for SmolLM3 fine-tuning
"""
import os
import json
import torch
from typing import Dict, List, Optional, Union
from datasets import Dataset, load_dataset
from transformers import PreTrainedTokenizer
import logging
logger = logging.getLogger(__name__)
class SmolLM3Dataset:
"""Dataset handler for SmolLM3 fine-tuning"""
def __init__(
self,
data_path: str,
tokenizer: PreTrainedTokenizer,
max_seq_length: int = 4096,
use_chat_template: bool = True,
chat_template_kwargs: Optional[Dict] = None
):
self.data_path = data_path
self.tokenizer = tokenizer
self.max_seq_length = max_seq_length
self.use_chat_template = use_chat_template
self.chat_template_kwargs = chat_template_kwargs or {}
# Load and process dataset
self.dataset = self._load_dataset()
self.processed_dataset = self._process_dataset()
def _load_dataset(self) -> Dataset:
"""Load dataset from various formats"""
logger.info(f"Loading dataset from {self.data_path}")
# Check if it's a Hugging Face dataset
if os.path.isdir(self.data_path):
# Local directory
try:
dataset = load_dataset("json", data_files={
"train": os.path.join(self.data_path, "train.json"),
"validation": os.path.join(self.data_path, "validation.json") if os.path.exists(os.path.join(self.data_path, "validation.json")) else None,
"test": os.path.join(self.data_path, "test.json") if os.path.exists(os.path.join(self.data_path, "test.json")) else None
})
logger.info("Loaded dataset from local JSON files")
return dataset
except Exception as e:
logger.warning(f"Failed to load as JSON dataset: {e}")
# Try to load as a single JSON file
if os.path.isfile(self.data_path) and self.data_path.endswith('.json'):
try:
with open(self.data_path, 'r', encoding='utf-8') as f:
data = json.load(f)
# Convert to dataset format
if isinstance(data, list):
dataset = Dataset.from_list(data)
else:
dataset = Dataset.from_dict(data)
logger.info("Loaded dataset from single JSON file")
return dataset
except Exception as e:
logger.error(f"Failed to load JSON file: {e}")
raise
# Try to load as a Hugging Face dataset name
try:
dataset = load_dataset(self.data_path)
logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
# If only 'train' split exists, create validation and test splits
if ("train" in dataset) and ("validation" not in dataset or "test" not in dataset):
logger.info("Automatically splitting train into train/validation/test (98/1/1)")
split_dataset = dataset["train"].train_test_split(test_size=0.02, seed=42)
# Now split test into validation and test (1% each)
val_test_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
dataset = {
"train": split_dataset["train"],
"validation": val_test_split["train"],
"test": val_test_split["test"]
}
return dataset
except Exception as e:
logger.error(f"Failed to load dataset: {e}")
raise
def _process_dataset(self) -> Dataset:
"""Process the dataset for training"""
logger.info("Processing dataset for training")
def format_chat_template(example):
"""Format example using chat template"""
if self.use_chat_template:
try:
# Handle different input formats
if "messages" in example:
messages = example["messages"]
elif "conversations" in example:
messages = example["conversations"]
elif "user" in example and "assistant" in example:
messages = [
{"role": "user", "content": example["user"]},
{"role": "assistant", "content": example["assistant"]}
]
elif "instruction" in example and "output" in example:
messages = [
{"role": "user", "content": example["instruction"]},
{"role": "assistant", "content": example["output"]}
]
elif "prompt" in example and "completion" in example:
messages = [
{"role": "user", "content": example["prompt"]},
{"role": "assistant", "content": example["completion"]}
]
elif "prompt" in example and "accepted_completion" in example:
messages = [
{"role": "user", "content": example["prompt"]},
{"role": "assistant", "content": example["accepted_completion"]}
]
else:
# Fallback: treat as plain text
return {"text": str(example)}
# Apply chat template
text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
**self.chat_template_kwargs
)
return {"text": text}
except Exception as e:
logger.warning(f"Failed to apply chat template: {e}")
# Fallback to plain text
return {"text": str(example)}
else:
# Use plain text
if "text" in example:
return {"text": example["text"]}
else:
return {"text": str(example)}
def tokenize_function(examples):
"""Tokenize the examples"""
# Tokenize the texts
tokenized = self.tokenizer(
examples["text"],
truncation=True,
padding=False,
max_length=self.max_seq_length,
return_overflowing_tokens=True,
return_length=True,
)
# Calculate input length
input_length = [len(x) for x in tokenized["input_ids"]]
# Create labels (same as input_ids for causal LM)
tokenized["labels"] = tokenized["input_ids"].copy()
return {
"input_ids": tokenized["input_ids"],
"attention_mask": tokenized["attention_mask"],
"labels": tokenized["labels"],
"length": input_length,
}
# Process the dataset
processed_dataset = self.dataset.map(
format_chat_template,
remove_columns=self.dataset["train"].column_names,
desc="Formatting dataset"
)
# Tokenize the dataset
tokenized_dataset = processed_dataset.map(
tokenize_function,
remove_columns=processed_dataset["train"].column_names,
desc="Tokenizing dataset",
batched=True,
)
logger.info(f"Dataset processed. Train samples: {len(tokenized_dataset['train'])}")
if "validation" in tokenized_dataset:
logger.info(f"Validation samples: {len(tokenized_dataset['validation'])}")
return tokenized_dataset
def get_train_dataset(self) -> Dataset:
"""Get training dataset"""
return self.processed_dataset["train"]
def get_eval_dataset(self) -> Optional[Dataset]:
"""Get evaluation dataset if available"""
if "validation" in self.processed_dataset:
return self.processed_dataset["validation"]
elif "test" in self.processed_dataset:
return self.processed_dataset["test"]
else:
return None
def get_data_collator(self):
"""Get data collator for training"""
from transformers import DataCollatorForLanguageModeling
return DataCollatorForLanguageModeling(
tokenizer=self.tokenizer,
mlm=False, # We're doing causal LM, not masked LM
)
def create_sample_dataset(output_path: str = "my_dataset"):
"""Create a sample dataset for testing"""
os.makedirs(output_path, exist_ok=True)
# Sample conversations
conversations = [
{
"messages": [
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed."}
]
},
{
"messages": [
{"role": "user", "content": "Explain gravity in simple terms."},
{"role": "assistant", "content": "Gravity is the force that pulls objects toward each other, like how the Earth pulls things down to the ground."}
]
},
{
"messages": [
{"role": "user", "content": "How do I make a cup of coffee?"},
{"role": "assistant", "content": "To make a cup of coffee: 1) Boil water, 2) Add coffee grounds to a filter, 3) Pour hot water over the grounds, 4) Let it brew for a few minutes, 5) Enjoy!"}
]
}
]
# Split into train/validation
train_data = conversations[:2]
validation_data = conversations[2:]
# Save to files
with open(os.path.join(output_path, "train.json"), 'w', encoding='utf-8') as f:
json.dump(train_data, f, indent=2, ensure_ascii=False)
with open(os.path.join(output_path, "validation.json"), 'w', encoding='utf-8') as f:
json.dump(validation_data, f, indent=2, ensure_ascii=False)
logger.info(f"Sample dataset created in {output_path}")
return output_path