# Example: model-specific chat templates and ChatML conversion with Hugging Face
from datasets import load_dataset
from transformers import AutoTokenizer

# Each tokenizer ships with its own chat template, so the same message list
# is rendered into each model's expected prompt format automatically.
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat")
smol_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")

# A conversation in the role/content message format that chat templates consume.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# tokenize=False returns the rendered prompt string instead of token ids,
# so the per-model formatting differences are directly visible.
mistral_chat = mistral_tokenizer.apply_chat_template(messages, tokenize=False)
qwen_chat = qwen_tokenizer.apply_chat_template(messages, tokenize=False)
smol_chat = smol_tokenizer.apply_chat_template(messages, tokenize=False)

# Load the SmolTalk dataset (used below for conversion into chat format).
dataset = load_dataset("HuggingFaceTB/smoltalk")
def convert_to_chatml(example):
    """Convert a single input/output example into the ChatML message format.

    Args:
        example: A mapping with "input" (user prompt) and "output"
            (assistant reply) string fields.

    Returns:
        A dict with a "messages" list of role/content dicts, suitable for
        ``apply_chat_template`` or chat-format fine-tuning pipelines.
    """
    return {
        "messages": [
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": example["output"]},
        ]
    }