File size: 1,035 Bytes
72df28d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import json
import os

# Dataset locations; both are relative paths, so they resolve against the
# directory the script is launched from.
input_path = "../data/code_alpaca_20k.json"
output_path = "../data/final_coding_dataset.jsonl"

# Create the directory that will receive the output file. The head returned
# by os.path.split() is the directory part of the path; an already existing
# directory is not an error.
os.makedirs(os.path.split(output_path)[0], exist_ok=True)

# Read the source file in one go and parse it as a single JSON document.
with open(input_path, mode="r", encoding="utf-8") as source_file:
    data = json.loads(source_file.read())

# Format into prompt-completion pairs
def _clean_field(example, key):
    """Return the stripped text stored under *key* in *example*.

    A missing key, a JSON ``null`` (loaded as ``None``) or any other falsy
    value yields an empty string instead of raising on ``.strip()``.
    """
    return (example.get(key) or "").strip()


processed = []
for example in data:
    instruction = _clean_field(example, "instruction")
    input_text = _clean_field(example, "input")
    output_text = _clean_field(example, "output")

    # A record is only usable when it has both an instruction and an output;
    # anything else is dropped.
    if not (instruction and output_text):
        continue

    # The optional input is appended to the instruction after a blank line.
    prompt = f"{instruction}\n\n{input_text}" if input_text else instruction
    processed.append({"prompt": prompt, "completion": output_text})

# Write the records as JSON Lines: one serialized object per line.
with open(output_path, mode="w", encoding="utf-8") as sink:
    sink.writelines(json.dumps(record) + "\n" for record in processed)

# Report how many records were kept and where they were written.
print(f"Preprocessing complete. Total examples: {len(processed)}")
print(f"Saved to: {output_path}")