|
from datasets import load_dataset |
|
from transformers import AutoTokenizer |
|
|
|
# Hugging Face model id; its tokenizer is loaded here and stored in `tok`.
MODEL_ID = "shisa-ai/shisa-v2-llama3.1-405b"
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Train split of the dataset, shuffled with a fixed seed (42).
ds = load_dataset('shisa-ai/shisa-v2-sharegpt', split='train').shuffle(seed=42)
|
|
|
|
|
def convert_sharegpt_to_chat_format(conversations):
    """
    Convert ShareGPT-style turns to the role/content format used by chat templates.

    ShareGPT typically has: [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]
    Chat templates expect:  [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]

    Args:
        conversations: Iterable of turn dicts, or None. A turn with "from" and
            "value" keys is converted; a turn that already has "role" and
            "content" keys is passed through unchanged.

    Returns:
        A new list of turns in role/content form. Turns matching neither
        shape are reported with a printed warning and omitted. None yields
        an empty list.
    """
    # Treat a missing conversation like an empty one instead of failing with
    # a TypeError when iterating over None.
    if conversations is None:
        return []

    # Built once per call rather than once per turn. Speaker labels that are
    # not listed here are kept as-is by the .get() fallback below.
    role_map = {
        "human": "user",
        "gpt": "assistant",
        "system": "system",
        "user": "user",
        "assistant": "assistant",
    }

    chat_format = []
    for conv in conversations:
        if "from" in conv and "value" in conv:
            speaker = conv["from"]
            chat_format.append({
                "role": role_map.get(speaker, speaker),
                "content": conv["value"],
            })
        elif "role" in conv and "content" in conv:
            # Already in chat format: the caller's dict is reused, not copied.
            chat_format.append(conv)
        else:
            print(f"Warning: Unknown conversation format: {conv}")

    return chat_format
|
|
|
def to_chat_text(sample):
    """
    Render one dataset sample through the tokenizer's chat template.

    Reads sample['conversations'], normalizes it with
    convert_sharegpt_to_chat_format, and returns the result of
    tok.apply_chat_template called with tokenize=False.
    """
    messages = convert_sharegpt_to_chat_format(sample['conversations'])
    rendered = tok.apply_chat_template(messages, tokenize=False)
    return rendered
|
|
|
# Write the rendered samples to calibration_chat.txt, each followed by a newline.
# NOTE(review): the limit check runs after the write, so samples with indices
# 0 through 4000 (4001 in total) are emitted — confirm whether 4000 was intended.
with open("calibration_chat.txt", "w", encoding="utf-8") as out_file:
    for index, sample in enumerate(ds):
        line = to_chat_text(sample) + "\n"
        out_file.write(line)
        if index >= 4000:
            break
|
|