Transformers
GGUF
imatrix
conversational
leonardlin commited on
Commit
e8f71f6
·
verified ·
1 Parent(s): 1a383d4

Upload make-calibration_chat.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. make-calibration_chat.py +56 -0
make-calibration_chat.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer whose chat template is used to render each calibration sample.
MODEL_ID = "shisa-ai/shisa-v2-llama3.1-405b"
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Deterministically shuffled training split used as the calibration corpus.
ds = load_dataset("shisa-ai/shisa-v2-sharegpt", split="train").shuffle(seed=42)
13
def convert_sharegpt_to_chat_format(conversations):
    """
    Convert ShareGPT-style turns to the chat format expected by chat templates.

    ShareGPT turns look like {"from": "human", "value": "..."}; chat templates
    expect {"role": "user", "content": "..."}. Turns already in role/content
    form are passed through unchanged; unrecognized turns are skipped with a
    warning printed to stdout.

    Args:
        conversations: iterable of dicts in either ShareGPT ("from"/"value")
            or chat ("role"/"content") form.

    Returns:
        List of {"role": ..., "content": ...} dicts, in input order.
    """
    # Hoisted out of the loop: the translation table is loop-invariant and was
    # previously rebuilt for every turn.
    role_map = {
        "human": "user",
        "gpt": "assistant",
        "system": "system",
        "user": "user",  # Sometimes already in this format
        "assistant": "assistant",
    }

    chat_format = []
    for conv in conversations:
        if "from" in conv and "value" in conv:
            # ShareGPT format: translate the speaker name; unknown speaker
            # names are kept as-is rather than dropped.
            role = role_map.get(conv["from"], conv["from"])
            chat_format.append({
                "role": role,
                "content": conv["value"],
            })
        elif "role" in conv and "content" in conv:
            # Already in chat format — pass through unchanged.
            chat_format.append(conv)
        else:
            # Best-effort: warn and skip malformed turns instead of aborting.
            # (Redundant trailing `continue` removed — it was the last
            # statement of the loop body.)
            print(f"Warning: Unknown conversation format: {conv}")
    return chat_format
45
+
46
def to_chat_text(sample):
    """Return one dataset row rendered through the model's chat template.

    NOTE(review): assumes each row stores its turns under the
    'conversations' key — verify against the dataset's actual field names.
    """
    messages = convert_sharegpt_to_chat_format(sample["conversations"])
    return tok.apply_chat_template(messages, tokenize=False)
51
+
52
# Stream rendered chat samples to disk, capped at 4001 rows
# (~1 M tokens for the 405 B model).
with open("calibration_chat.txt", "w", encoding="utf-8") as f:
    written = 0
    for sample in ds:
        f.write(to_chat_text(sample) + "\n")
        written += 1
        if written > 4000:
            break