# First_agent_template / convert_datasets.py
# Author: Agathe1489
# Commit message: Create convert_datasets.py
# Commit: 44fa3df (verified)
from datasets import load_dataset
from transformers import AutoTokenizer
# Load one tokenizer per checkpoint. Each tokenizer is later asked to render
# the same conversation through its own chat template.
mistral_tokenizer, qwen_tokenizer, smol_tokenizer = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (
        "mistralai/Mistral-7B-Instruct-v0.1",
        "Qwen/Qwen-7B-Chat",
        "HuggingFaceTB/SmolLM2-135M-Instruct",
    )
)

# A single short conversation shared by every tokenizer below.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Render the conversation once per tokenizer, in the same order the
# tokenizers were loaded. tokenize=False asks for the formatted text
# instead of token ids.
mistral_chat, qwen_chat, smol_chat = (
    tokenizer.apply_chat_template(messages, tokenize=False)
    for tokenizer in (mistral_tokenizer, qwen_tokenizer, smol_tokenizer)
)

# NOTE(review): this dataset appears to be published as several named
# configurations; if so, load_dataset may need an explicit config name as
# its second argument -- confirm against the dataset card.
dataset = load_dataset("HuggingFaceTB/smoltalk")
def convert_to_chatml(example: dict) -> dict:
    """Convert one input/output record into a two-turn chat record.

    Args:
        example: Mapping that provides an ``"input"`` and an ``"output"``
            entry.

    Returns:
        A dict with a single ``"messages"`` key holding a list of two
        role/content dicts: the ``"input"`` value as the ``"user"`` turn,
        followed by the ``"output"`` value as the ``"assistant"`` turn.

    Raises:
        KeyError: If ``example`` lacks ``"input"`` or ``"output"``.
    """
    return {
        "messages": [
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": example["output"]},
        ]
    }