syn / batch_generate.py
theaniketgiri's picture
Initial commit to Hugging Face Space
32519eb
import os
import json
import random
import time
from pathlib import Path
from src.generation.medical_generator import MedicalTextGenerator
# Abort early with setup instructions when no Gemini API key is configured.
# NOTE(review): the generation loop below calls generate_record(...,
# use_gemini=False), so this key may only matter to MedicalTextGenerator's
# constructor -- confirm whether the check is still required.
if not os.getenv('GEMINI_API_KEY'):
    print("Please set the GEMINI_API_KEY environment variable:")
    print("Windows PowerShell: $env:GEMINI_API_KEY='your-api-key-here'")
    print("Windows CMD: set GEMINI_API_KEY=your-api-key-here")
    # Raise SystemExit directly: the exit() builtin is injected by the site
    # module for interactive sessions and is unavailable under `python -S`.
    raise SystemExit(1)
# Make sure the destination folder for generated data is present
output_dir = Path("data", "synthetic")
output_dir.mkdir(exist_ok=True, parents=True)

# Create the text generator used for every record
generator = MedicalTextGenerator()

# Record types to sample from (intended to match the keys of the
# generator's templates)
record_types = [
    "clinical_note",
    "discharge_summary",
    "lab_report",
]
# Batch size and the pause (in seconds) inserted between successful calls.
NUM_RECORDS = 100
RATE_LIMIT_DELAY_SECONDS = 4

# Generate a mixed batch of records; a record that fails is reported and
# skipped, so the final list may hold fewer than NUM_RECORDS entries.
records = []
for i in range(NUM_RECORDS):
    # Randomly select the record type so the batch is mixed
    record_type = random.choice(record_types)

    # Only the generator call is guarded. use_gemini=False presumably selects
    # the non-Gemini ("Hugging Face") backend -- confirm in
    # MedicalTextGenerator.
    try:
        record = generator.generate_record(record_type, use_gemini=False)
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next record
        print(f"Error generating record {i+1}: {str(e)}")
        continue

    print(f"Generated record {i+1}/{NUM_RECORDS}: {record_type}")

    # Append record details; the id is the 1-based loop position, so ids of
    # failed records are simply absent from the output.
    records.append({
        "id": i + 1,
        "type": record_type,
        "content": record,
        "generator": "Hugging Face",
        # Local wall-clock time at which the record was produced
        "generated_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    })

    # Respect rate limits between calls; there is nothing to wait for after
    # the final record.
    if i + 1 < NUM_RECORDS:
        time.sleep(RATE_LIMIT_DELAY_SECONDS)
# Save records to a JSON file.
# Serialize fully before touching the file: if any record content is not
# JSON-serializable, json.dumps raises here and an existing output file is
# left intact instead of being truncated to a partial document.
output_file = output_dir / "synthetic_records.json"
serialized_records = json.dumps(records, indent=2)
# Explicit encoding keeps the output independent of the platform locale.
output_file.write_text(serialized_records, encoding="utf-8")
print(f"\nGenerated {len(records)} records and saved to {output_file}")