import json
import os
from pathlib import Path

from datasets import load_dataset


def test_medical_dataset() -> bool:
    """Smoke-test loading a small slice of the medical questions dataset.

    Requests the first 100 rows of the ``train`` split of
    ``medical_questions_pairs``, then prints the number of loaded samples
    and the first sample rendered as indented JSON.

    Returns:
        True if the dataset was loaded and the first sample was printed,
        False if any exception was raised along the way (the error message
        is printed instead of propagating).
    """
    try:
        # Load a small sample of the medical questions dataset
        dataset = load_dataset("medical_questions_pairs", split="train[:100]")
        print(f"Successfully loaded {len(dataset)} samples from medical_questions_pairs")

        # Print sample structure
        print("\nSample structure:")
        print(json.dumps(dataset[0], indent=2))

        return True
    except Exception as e:
        # Deliberately broad: this is a diagnostic script, so any failure
        # is reported to the user and signalled via the return value.
        print(f"Error loading dataset: {e}")
        return False


def verify_data_directory() -> None:
    """Ensure ``data/raw`` exists and list the JSON files directly inside it.

    The path is relative to the current working directory. If the directory
    is missing it is created (including parents). The function then prints
    the name of every ``*.json`` file found at the top level of the
    directory, or a notice when there are none.
    """
    data_dir = Path("data/raw")
    if not data_dir.exists():
        print(f"Creating data directory: {data_dir}")
        # exist_ok guards against the directory appearing between the
        # existence check above and this call.
        data_dir.mkdir(parents=True, exist_ok=True)

    # Check for JSON files; sorted so the listing order is deterministic
    # (glob order depends on the filesystem).
    json_files = sorted(data_dir.glob("*.json"))
    if json_files:
        print(f"\nFound {len(json_files)} JSON files in data/raw:")
        for file in json_files:
            print(f"- {file.name}")
    else:
        print("\nNo JSON files found in data/raw directory")


if __name__ == "__main__":
    print("Testing Hugging Face dataset loading...")
    test_medical_dataset()

    print("\nVerifying data directory structure...")
    verify_data_directory()