Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Test script to verify dataset loading works correctly | |
| """ | |
| import os | |
| import sys | |
| import json | |
| from datasets import load_dataset | |
def _preview(value, limit=100):
    """Return a short printable preview of a dataset field.

    ``None`` (missing or null field) is rendered as ``'N/A'``; any other value
    is converted with ``str`` and truncated to ``limit`` characters, so a null
    or non-string field cannot abort the whole check with a ``TypeError``.
    """
    if value is None:
        return 'N/A'
    return str(value)[:limit]


def test_dataset_loading():
    """Smoke-test loading of the ``legmlai/openhermes-fr`` dataset.

    Steps performed:
      1. Load the dataset with ``load_dataset`` and report the size of the
         ``train`` split.
      2. Print the keys and truncated fields of up to three examples.
      3. Drop rows whose ``bad_entry`` field is truthy and report the counts.
      4. Convert up to five remaining rows that carry both ``prompt`` and
         ``accepted_completion`` into ``{'prompt', 'completion'}`` records.
      5. Write those records to ``test_dataset/train.json``.

    Returns:
        bool: ``True`` if every step completed, ``False`` if any exception was
        raised (the error is printed rather than propagated).
    """
    print("Testing dataset loading...")
    try:
        # Load the dataset
        dataset = load_dataset("legmlai/openhermes-fr")
        train_split = dataset['train']
        original_size = len(train_split)
        print("✅ Dataset loaded successfully")
        print(f" Train samples: {original_size}")

        # Check the first few examples
        print("\nSample examples:")
        for i in range(min(3, original_size)):
            example = train_split[i]
            print(f"\nExample {i+1}:")
            print(f" Keys: {list(example.keys())}")
            print(f" Prompt: {_preview(example.get('prompt'))}...")
            print(f" Accepted completion: {_preview(example.get('accepted_completion'))}...")
            print(f" Bad entry: {example.get('bad_entry', 'N/A')}")

        # Test filtering bad entries
        print("\nFiltering bad entries...")
        filtered_dataset = train_split.filter(lambda x: not x.get('bad_entry', False))
        filtered_size = len(filtered_dataset)
        print(f" Original size: {original_size}")
        print(f" Filtered size: {filtered_size}")
        print(f" Removed: {original_size - filtered_size} bad entries")

        # Test conversion to training format
        print("\nTesting conversion to training format...")
        # zip() against range(5) stops after five rows without reading a sixth.
        train_data = [
            {
                'prompt': example['prompt'],
                'completion': example['accepted_completion'],
            }
            for _, example in zip(range(5), filtered_dataset)
            if 'prompt' in example and 'accepted_completion' in example
        ]
        print(f" Converted {len(train_data)} examples to training format")

        # Save a small sample
        os.makedirs('test_dataset', exist_ok=True)
        # Explicit UTF-8 and ensure_ascii=False keep accented French text
        # readable in the sample file instead of \uXXXX escapes.
        with open('test_dataset/train.json', 'w', encoding='utf-8') as f:
            json.dump(train_data, f, indent=2, ensure_ascii=False)
        print("✅ Test completed successfully!")
        print(" Sample saved to: test_dataset/train.json")
        return True
    except Exception as e:
        # Top-level boundary of the script: report the failure and signal it
        # through the return value so the caller can set the exit status.
        print(f"❌ Dataset loading failed: {e}")
        return False
if __name__ == "__main__":
    # Translate the boolean result into a process exit status:
    # 0 when the check passed, 1 when it failed.
    exit_code = 0 if test_dataset_loading() else 1
    sys.exit(exit_code)