"""Generate a batch of synthetic medical records and save them as JSON.

For each attempt the script picks a random record type, asks
``MedicalTextGenerator.generate_record`` for the text, and collects the
result together with some metadata. Attempts that raise are reported and
skipped, so the output file may contain fewer than ``NUM_RECORDS`` entries.
All collected records are written to
``data/synthetic/synthetic_records.json``.
"""
import json
import os
import random
import sys
import time
from pathlib import Path

from src.generation.medical_generator import MedicalTextGenerator

# Number of generation attempts made by this script.
NUM_RECORDS = 100
# Pause (in seconds) inserted between consecutive generator calls.
RATE_LIMIT_DELAY_SECONDS = 4

# Check for Gemini API key
# NOTE(review): generation below runs with use_gemini=False, so this key may
# only be needed by MedicalTextGenerator's constructor -- confirm whether the
# check is still required.
if not os.getenv('GEMINI_API_KEY'):
    print("Please set the GEMINI_API_KEY environment variable:")
    print("Windows PowerShell: $env:GEMINI_API_KEY='your-api-key-here'")
    print("Windows CMD: set GEMINI_API_KEY=your-api-key-here")
    # sys.exit instead of the `exit` helper, which is only injected by the
    # `site` module and is not guaranteed to exist in every environment.
    sys.exit(1)

# Ensure the output directory exists
output_dir = Path("data/synthetic")
output_dir.mkdir(parents=True, exist_ok=True)

# Initialize the generator
generator = MedicalTextGenerator()

# Define supported record types (using the keys from the generator's templates)
record_types = ["clinical_note", "discharge_summary", "lab_report"]

# Generate NUM_RECORDS mixed records
records = []
for record_id in range(1, NUM_RECORDS + 1):
    # Respect rate limits: pause before every call except the first. Doing it
    # here (rather than after a successful call) means a failed attempt is
    # also followed by a pause, and no time is wasted after the final record.
    if record_id > 1:
        time.sleep(RATE_LIMIT_DELAY_SECONDS)

    # Randomly select record type
    record_type = random.choice(record_types)

    # Generate record with use_gemini=False. Only the generator call is
    # guarded; a broad catch is deliberate so that one failed record does not
    # abort the whole batch.
    try:
        record = generator.generate_record(record_type, use_gemini=False)
    except Exception as e:
        print(f"Error generating record {record_id}: {e}")
        continue

    print(f"Generated record {record_id}/{NUM_RECORDS}: {record_type}")

    # Append record details
    records.append({
        "id": record_id,
        "type": record_type,
        "content": record,
        "generator": "Hugging Face",
        "generated_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    })

# Save records to a JSON file. The encoding is pinned so the result does not
# depend on the platform's default locale encoding.
output_file = output_dir / "synthetic_records.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2)

print(f"\nGenerated {len(records)} records and saved to {output_file}")