File size: 3,188 Bytes
769bb84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
"""
Test script to verify OpenHermes-FR dataset loading
"""

from datasets import load_dataset
import json
import random

def test_openhermes_fr():
    """Load the OpenHermes-FR dataset and smoke-test its conversion.

    Steps, each reported on stdout:

    1. Load ``legmlai/openhermes-fr`` via ``load_dataset`` and print the
       size of the ``train`` split (and ``validation`` if present).
    2. Print every field of the first training row, truncating string
       values longer than 100 characters.
    3. Convert up to the first ten training rows into
       ``{'prompt': ..., 'completion': ...}`` dicts, skipping rows that
       cannot be converted, have an empty prompt/completion, or carry a
       truthy ``bad_entry`` field.
    4. When the split has more than 100 rows, draw five random row indices
       with a fixed seed (42).

    Returns:
        bool: ``True`` when all steps ran without raising; ``False`` when
        any exception was raised (it is printed, not propagated).
    """

    print("Loading OpenHermes-FR dataset...")
    try:
        dataset = load_dataset('legmlai/openhermes-fr')
        train_split = dataset['train']
        print("βœ… Dataset loaded successfully")
        print(f"   Train samples: {len(train_split)}")
        if 'validation' in dataset:
            print(f"   Validation samples: {len(dataset['validation'])}")

        # Show sample structure
        sample = train_split[0]
        print("\nπŸ“‹ Sample structure:")
        for key, value in sample.items():
            if isinstance(value, str) and len(value) > 100:
                # Keep long text fields readable on a single line.
                print(f"   {key}: {value[:100]}...")
            else:
                print(f"   {key}: {value}")

        # Test conversion
        print("\nπŸ”„ Testing conversion...")

        def convert_to_training_format(example):
            """Return a prompt/completion dict for a row, or None if it has no such fields."""
            if 'prompt' not in example:
                return None
            # 'accepted_completion' (the OpenHermes-FR column) takes
            # precedence over a plain 'completion' column.
            for answer_key in ('accepted_completion', 'completion'):
                if answer_key in example:
                    return {
                        'prompt': example['prompt'],
                        'completion': example[answer_key],
                    }
            return None

        # Process first 10 examples.
        # Rows are fetched one by one with an integer index: slicing the
        # split (``train_split[:10]``) returns a dict of columns, so
        # iterating over the slice would yield column names instead of rows
        # and nothing would ever be converted.
        train_data = []
        for i in range(min(10, len(train_split))):
            example = train_split[i]
            training_example = convert_to_training_format(example)
            if not training_example:
                continue
            if not (training_example['prompt'] and training_example['completion']):
                continue
            # Filter out bad entries
            if example.get('bad_entry'):
                print(f"   Skipping bad entry {i}")
                continue
            train_data.append(training_example)
            print(f"   βœ… Converted example {i}")

        print("\nπŸ“Š Conversion results:")
        print(f"   Converted: {len(train_data)} valid examples")

        if train_data:
            print("\nπŸ“ Sample converted example:")
            sample = train_data[0]
            print(f"   Prompt: {sample['prompt'][:100]}...")
            print(f"   Completion: {sample['completion'][:100]}...")

        # Test sampling
        if len(train_split) > 100:
            print("\n🎲 Testing sampling...")
            # Fixed seed so the printed indices are reproducible between runs.
            random.seed(42)
            sampled_indices = random.sample(range(len(train_split)), 5)
            print(f"   Sampled indices: {sampled_indices}")

        return True

    except Exception as e:
        # Top-level boundary of the script: report any failure (download,
        # missing split, unexpected schema) and signal it via the return value.
        print(f"❌ Error loading dataset: {e}")
        return False

if __name__ == "__main__":
    # Script entry point: run the dataset check and exit with status 1 on
    # failure so shells and CI jobs can detect it.
    if not test_openhermes_fr():
        print("\n❌ Dataset test failed!")
        exit(1)
    print("\nβœ… Dataset test completed successfully!")