File size: 1,781 Bytes
32519eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import json
import random
import time
from pathlib import Path
from src.generation.medical_generator import MedicalTextGenerator

# Check for Gemini API key: abort early, with setup instructions, when the
# GEMINI_API_KEY environment variable is unset or empty.
# NOTE(review): the generation loop further down passes use_gemini=False;
# confirm whether the key is actually required for that path.
if not os.getenv('GEMINI_API_KEY'):
    print("Please set the GEMINI_API_KEY environment variable:")
    print("Windows PowerShell: $env:GEMINI_API_KEY='your-api-key-here'")
    print("Windows CMD: set GEMINI_API_KEY=your-api-key-here")
    print("Linux/macOS: export GEMINI_API_KEY='your-api-key-here'")
    # Raise SystemExit instead of calling exit(): the exit() builtin is
    # injected by the site module for interactive sessions and is not
    # guaranteed to exist (e.g. under `python -S` or in frozen apps).
    raise SystemExit(1)

# Create the destination folder for generated data (no-op if already present)
output_dir = Path("data") / "synthetic"
output_dir.mkdir(exist_ok=True, parents=True)

# Record types that can be requested from the generator
# (presumably these match the template keys inside MedicalTextGenerator — verify)
record_types = [
    "clinical_note",
    "discharge_summary",
    "lab_report",
]

# Build the generator instance shared by all generation calls
generator = MedicalTextGenerator()

# Generate a batch of mixed records.
# NUM_RECORDS attempts are made; a failed attempt is reported and skipped, so
# the resulting list may be shorter than NUM_RECORDS and its ids may have gaps.
NUM_RECORDS = 100
# Pause between consecutive generator calls, in seconds.
CALL_DELAY_SECONDS = 4

records = []
for i in range(NUM_RECORDS):
    # Randomly select record type
    record_type = random.choice(record_types)

    # Generate record using Hugging Face (use_gemini=False).
    # Only the generator call is guarded, so unrelated bugs are not swallowed.
    try:
        record = generator.generate_record(record_type, use_gemini=False)
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next record
        print(f"Error generating record {i+1}: {e}")
    else:
        print(f"Generated record {i+1}/{NUM_RECORDS}: {record_type}")

        # Append record details
        records.append({
            "id": i + 1,
            "type": record_type,
            "content": record,
            "generator": "Hugging Face",
            "generated_at": time.strftime("%Y-%m-%d %H:%M:%S")
        })

    # Respect rate limits: wait between calls, including after a failed one
    # (previously an error was followed immediately by the next call), but
    # skip the pointless wait once the final attempt is done.
    if i < NUM_RECORDS - 1:
        time.sleep(CALL_DELAY_SECONDS)

# Write every collected record to a single pretty-printed JSON file
output_file = output_dir.joinpath("synthetic_records.json")
with output_file.open("w") as handle:
    json.dump(records, handle, indent=2)

print(f"\nGenerated {len(records)} records and saved to {output_file}")