shisa-ai
/

shisa-v2-llama3.1-405b-GGUF

Transformers

GGUF

imatrix

conversational

Model card Files Files and versions Community

leonardlin committed on Jun 2

Commit

e8f71f6

verified ·

1 Parent(s): 1a383d4

Upload make-calibration_chat.py with huggingface_hub

Browse files

Files changed (1) hide show

make-calibration_chat.py +56 -0

make-calibration_chat.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from datasets import load_dataset
+from transformers import AutoTokenizer
+# Hugging Face repo id of the model whose tokenizer is loaded below.
+MODEL_ID = "shisa-ai/shisa-v2-llama3.1-405b"
+tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+# Train split of the ShareGPT-style dataset, shuffled with a fixed seed (42).
+ds = load_dataset('shisa-ai/shisa-v2-sharegpt', split='train').shuffle(seed=42)
+def convert_sharegpt_to_chat_format(conversations):
+    """
+    Convert a ShareGPT-style conversation into the message list used by chat templates.
+
+    ShareGPT turns look like {"from": "human", "value": "..."}; chat templates
+    expect {"role": "user", "content": "..."}. Turns that already carry
+    'role' and 'content' keys are passed through as the same dict object.
+
+    Args:
+        conversations: Iterable of turn dicts. None or an empty iterable is
+            treated as a conversation with no turns.
+
+    Returns:
+        A new list of message dicts in the original order. Turns matching
+        neither layout are reported with a printed warning and skipped.
+    """
+    # Built once per call instead of once per turn. Speaker labels missing
+    # from the map are used as the role unchanged.
+    role_map = {
+        "human": "user",
+        "gpt": "assistant",
+        "system": "system",
+        "user": "user",  # Sometimes already in this format
+        "assistant": "assistant",
+    }
+    chat_format = []
+    for conv in conversations or ():
+        if "from" in conv and "value" in conv:
+            # ShareGPT layout: translate the speaker label, keep the text.
+            chat_format.append({
+                "role": role_map.get(conv["from"], conv["from"]),
+                "content": conv["value"],
+            })
+        elif "role" in conv and "content" in conv:
+            # Already in chat layout: keep the turn as-is.
+            chat_format.append(conv)
+        else:
+            print(f"Warning: Unknown conversation format: {conv}")
+    return chat_format
+def to_chat_text(sample):
+    """
+    Render one dataset row through the tokenizer's chat template.
+
+    The turns are read from sample['conversations'], normalised with
+    convert_sharegpt_to_chat_format, and handed to the module-level
+    tokenizer's apply_chat_template with tokenize=False. The result is
+    returned unchanged; the caller in this script treats it as text.
+    """
+    messages = convert_sharegpt_to_chat_format(sample['conversations'])
+    rendered = tok.apply_chat_template(messages, tokenize=False)
+    return rendered
+# Write the rendered samples, each followed by a newline, to the calibration file.
+with open("calibration_chat.txt", "w", encoding="utf-8") as out:
+    for idx, sample in enumerate(ds):
+        rendered = to_chat_text(sample)
+        out.write(rendered + "\n")
+        # Stop once index 4000 has been written, i.e. 4001 samples in total.
+        # Author's original note on this limit: ~1 M tokens for 405 B.
+        if idx == 4000:
+            break