Agathe1489 committed on
Commit
44fa3df
·
verified ·
1 Parent(s): 012c153

Create convert_datasets.py

Browse files
Files changed (1) hide show
  1. convert_datasets.py +30 -0
convert_datasets.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+
3
+ from transformers import AutoTokenizer
4
+
5
# Each tokenizer carries its own chat template, so the same conversation is
# rendered into a different prompt string for every model.
# NOTE(review): "Qwen/Qwen-7B-Chat" appears to depend on custom tokenizer code
# (trust_remote_code) and may not define a chat template -- confirm it loads
# and that apply_chat_template works before relying on qwen_chat.
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat")
smol_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")

# Sample conversation shared by all three tokenizers.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# tokenize=False asks for the formatted prompt text rather than token ids.
# NOTE(review): the Mistral v0.1 template is believed to require strictly
# alternating user/assistant turns and may reject the leading system message
# -- confirm against the template shipped with that checkpoint.
mistral_chat = mistral_tokenizer.apply_chat_template(messages, tokenize=False)
qwen_chat = qwen_tokenizer.apply_chat_template(messages, tokenize=False)
smol_chat = smol_tokenizer.apply_chat_template(messages, tokenize=False)

# SmolTalk is published as several named subsets and has to be loaded with an
# explicit subset name; "all" is the complete mixture.  Omitting the name makes
# load_dataset fail with a "config name is missing" error.
dataset = load_dataset("HuggingFaceTB/smoltalk", "all")
21
+
22
def convert_to_chatml(example):
    """Return *example* as a ChatML-style ``{"messages": [...]}`` mapping.

    Rows that hold an ``"input"``/``"output"`` pair are turned into a two-turn
    conversation (user prompt followed by assistant reply).  Rows that lack
    that pair but already contain a ``"messages"`` list are passed through
    unchanged, so the function can be mapped over datasets that are already
    in conversational form.

    Args:
        example: A mapping for one dataset row.

    Returns:
        A dict with the single key ``"messages"`` holding a list of
        ``{"role": ..., "content": ...}`` dicts.

    Raises:
        KeyError: If the row has neither an ``"input"``/``"output"`` pair nor
            a ``"messages"`` entry.
    """
    try:
        prompt = example["input"]
        reply = example["output"]
    except KeyError:
        # Already-conversational rows: keep their turns instead of failing.
        if "messages" in example:
            return {"messages": list(example["messages"])}
        raise

    return {
        "messages": [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": reply},
        ]
    }
29
+
30
+