File size: 1,302 Bytes
32519eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from datasets import load_dataset
import json
import os
from pathlib import Path

def test_medical_dataset():
    """Smoke-check that a slice of ``medical_questions_pairs`` can be loaded.

    Calls ``load_dataset`` with ``split="train[:100]"``, prints how many
    samples came back, and dumps the first sample as indented JSON.

    Returns:
        bool: True when loading and printing both completed; False when any
        exception was raised along the way (its message is printed).
    """
    try:
        samples = load_dataset("medical_questions_pairs", split="train[:100]")
        print(f"Successfully loaded {len(samples)} samples from medical_questions_pairs")

        # Show the first record so its fields are visible at a glance.
        print("\nSample structure:")
        first_record = samples[0]
        print(json.dumps(first_record, indent=2))
    except Exception as err:
        print(f"Error loading dataset: {err!s}")
        return False
    else:
        return True

def verify_data_directory():
    """Ensure ``data/raw`` exists and report the JSON files it contains.

    The path is relative to the current working directory. If it does not
    exist it is created (parents included) and a notice is printed.
    Afterwards the ``*.json`` entries directly inside it are listed by name,
    or a message is printed when there are none.

    Only entries for which ``Path.is_file()`` is true are counted, so a
    subdirectory that happens to be named ``something.json`` is ignored.
    Names are printed in sorted order so the report does not depend on the
    order in which the filesystem yields directory entries.

    Returns:
        None. Everything is reported on standard output.
    """
    data_dir = Path("data/raw")
    if not data_dir.exists():
        print(f"Creating data directory: {data_dir}")
        data_dir.mkdir(parents=True, exist_ok=True)

    # glob() matches entries of any kind and in no guaranteed order, so keep
    # only real files and sort them for an accurate, stable listing.
    json_files = sorted(
        entry for entry in data_dir.glob("*.json") if entry.is_file()
    )
    if json_files:
        print(f"\nFound {len(json_files)} JSON files in data/raw:")
        for json_file in json_files:
            print(f"- {json_file.name}")
    else:
        print("\nNo JSON files found in data/raw directory")

if __name__ == "__main__":
    # Script entry point: announce and run each check in order. The dataset
    # check's boolean result is not inspected here.
    checks = (
        ("Testing Hugging Face dataset loading...", test_medical_dataset),
        ("\nVerifying data directory structure...", verify_data_directory),
    )
    for banner, run_check in checks:
        print(banner)
        run_check()