from datasets import load_dataset
from transformers import AutoTokenizer

MODEL_ID = "shisa-ai/shisa-v2-llama3.1-405b"
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

ds = (
    load_dataset("shisa-ai/shisa-v2-sharegpt", split="train")
    .shuffle(seed=42)
)


def convert_sharegpt_to_chat_format(conversations):
    """
    Convert ShareGPT format to the standard chat format expected by chat templates.

    ShareGPT typically has:
    [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]

    Chat templates expect:
    [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
    """
    chat_format = []
    # Handle both 'from'/'value' and 'role'/'content' formats
    for conv in conversations:
        if "from" in conv and "value" in conv:
            # ShareGPT format
            role_map = {
                "human": "user",
                "gpt": "assistant",
                "system": "system",
                "user": "user",          # sometimes already in this format
                "assistant": "assistant",
            }
            role = role_map.get(conv["from"], conv["from"])
            chat_format.append({"role": role, "content": conv["value"]})
        elif "role" in conv and "content" in conv:
            # Already in chat format
            chat_format.append(conv)
        else:
            print(f"Warning: unknown conversation format: {conv}")
    return chat_format


def to_chat_text(sample):
    # sample["conversations"] is a list of ShareGPT-style
    # {"from": "...", "value": "..."} dicts; replace the field name
    # with the exact one in your dataset.
    conv = convert_sharegpt_to_chat_format(sample["conversations"])
    return tok.apply_chat_template(conv, tokenize=False)


with open("calibration_chat.txt", "w", encoding="utf-8") as f:
    for i, s in enumerate(ds):
        f.write(to_chat_text(s) + "\n")
        if i >= 4000:  # ~4,000 conversations ≈ 1M tokens for the 405B calibration set
            break
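
# Optional sanity check -- a minimal sketch, not part of the original
# pipeline. It re-tokenizes the written file to confirm the calibration
# set is roughly the ~1M tokens targeted above. The count is approximate
# because encoding line by line can split tokens at newline boundaries.
# Assumes the output filename used by the script ("calibration_chat.txt").
def count_calibration_tokens(path="calibration_chat.txt"):
    total = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            # add_special_tokens=False: the chat template already
            # inserted BOS/EOS and header tokens as text.
            total += len(tok.encode(line, add_special_tokens=False))
    return total

# print(f"calibration tokens: {count_calibration_tokens():,}")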