Spaces:
Sleeping
Sleeping
from datasets import load_dataset | |
import json | |
import os | |
from pathlib import Path | |
def test_medical_dataset():
    """Smoke-test that the ``medical_questions_pairs`` dataset can be loaded.

    Requests the ``train[:100]`` split through ``load_dataset``, prints how
    many samples came back, and pretty-prints the first sample as JSON.

    Returns:
        bool: ``True`` if every step completed, ``False`` if any exception
        was raised (its message is printed instead of being propagated).
    """
    try:
        # A small sample is enough to confirm the dataset is reachable.
        samples = load_dataset("medical_questions_pairs", split="train[:100]")
        print(f"Successfully loaded {len(samples)} samples from medical_questions_pairs")

        # Show the first record so its field layout is visible.
        print("\nSample structure:")
        first_sample = samples[0]
        print(json.dumps(first_sample, indent=2))
    except Exception as err:
        # Any failure is reported as a load error and signalled via the
        # return value rather than an exception.
        print(f"Error loading dataset: {err}")
        return False
    else:
        return True
def verify_data_directory(data_dir="data/raw") -> None:
    """Ensure the raw-data directory exists and report the JSON files in it.

    If the directory is missing it is created (including parents). The
    function then prints either a listing of the ``*.json`` files directly
    inside it or a note that none were found.

    Args:
        data_dir: Directory to check, as a string or ``Path``. Defaults to
            ``"data/raw"`` relative to the current working directory.

    Returns:
        None. Results are reported on standard output only.
    """
    data_dir = Path(data_dir)
    # Forward slashes keep the default message reading "data/raw" on every
    # platform, matching the text the script has always printed.
    label = data_dir.as_posix()

    if not data_dir.exists():
        print(f"Creating data directory: {data_dir}")
        data_dir.mkdir(parents=True, exist_ok=True)

    # glob() yields entries in arbitrary filesystem order and also matches
    # directories named "*.json"; keep regular files only and sort them so
    # the report is stable between runs.
    json_files = sorted(
        entry for entry in data_dir.glob("*.json") if entry.is_file()
    )

    if json_files:
        print(f"\nFound {len(json_files)} JSON files in {label}:")
        for json_file in json_files:
            print(f"- {json_file.name}")
    else:
        print(f"\nNo JSON files found in {label} directory")
if __name__ == "__main__":
    # Each entry pairs the banner to print with the check to run after it;
    # the checks run in the listed order and their return values are ignored.
    checks = (
        ("Testing Hugging Face dataset loading...", test_medical_dataset),
        ("\nVerifying data directory structure...", verify_data_directory),
    )
    for banner, run_check in checks:
        print(banner)
        run_check()