Commit 32519eb (0 parents)
Initial commit to Hugging Face Space
- .dockerignore +60 -0
- .gitignore +125 -0
- Dockerfile +30 -0
- LICENSE +55 -0
- README.md +284 -0
- analyze_data_quality.py +174 -0
- aniket.py +0 -0
- api.py +95 -0
- app.py +361 -0
- batch_generate.py +57 -0
- data/processed/.gitkeep +0 -0
- data/reports/plots/sample_distribution.png +0 -0
- data/synthetic/.gitkeep +0 -0
- requirements.txt +29 -0
- setup.py +77 -0
- setup_data.py +48 -0
- src/__init__.py +6 -0
- src/api/app.py +95 -0
- src/data_collection/data_collection.py +440 -0
- src/generation/__init__.py +7 -0
- src/generation/medical_generator.py +441 -0
- src/streamlit_app.py +40 -0
- src/web/index.html +127 -0
- streamlit_app.py +253 -0
- templates/index.html +66 -0
- test_dataset.py +41 -0
- test_pubmed.py +78 -0
- web_app.py +31 -0
.dockerignore
ADDED
@@ -0,0 +1,60 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
synthex_env/
venv/
ENV/

# IDE
.idea/
.vscode/
*.swp
*.swo

# Git
.git
.gitignore

# Data
data/raw/*
data/processed/*
data/synthetic/*
!data/raw/.gitkeep
!data/processed/.gitkeep
!data/synthetic/.gitkeep

# Logs
*.log

# Local development
.env
.env.local
.env.*.local

# Docker
Dockerfile
.dockerignore

# Misc
.DS_Store
Thumbs.db
.gitignore
ADDED
@@ -0,0 +1,125 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/
.env

# IDE
.idea/
.vscode/
*.swp
*.swo

# Project specific
data/raw/
data/generated/
*.log
.DS_Store
.coverage
htmlcov/
.pytest_cache/

# Hugging Face
.huggingface/
.hf/

# Docker
.docker/
docker-compose.override.yml

# Security
*.pem
*.key
*.cert

# Large files
*.json
*.csv
*.xlsx
*.xls
*.db
*.sqlite
*.h5
*.pkl
*.model
*.bin
*.pt
*.pth
*.onnx

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/
.env

# IDE
.idea/
.vscode/
*.swp
*.swo

# Project specific
data/generated/
*.log
.DS_Store
.coverage
htmlcov/
.pytest_cache/

# Hugging Face
.huggingface/
.hf/

# Docker
.docker/
docker-compose.override.yml

# Security
*.pem
*.key
*.cert
Dockerfile
ADDED
@@ -0,0 +1,30 @@
# Use official Python image
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Create necessary directories
RUN mkdir -p src/web

# Expose the port
EXPOSE 8000

# Set environment variables
ENV PYTHONPATH=/app
ENV PORT=8000

# Command to run the application
CMD ["uvicorn", "src.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
LICENSE
ADDED
@@ -0,0 +1,55 @@
Synthex AI - Commercial License

Copyright (c) 2024 Synthex AI

This software and associated documentation files (the "Software") are proprietary and confidential.
The Software is protected by copyright laws and international copyright treaties, as well as other
intellectual property laws and treaties.

TERMS AND CONDITIONS

1. License Grant
This license grants you a limited, non-exclusive, non-transferable license to use the Software
solely for your internal business purposes, subject to the terms and conditions of this Agreement.

2. Restrictions
You may not:
- Copy, modify, or create derivative works of the Software
- Reverse engineer, decompile, or disassemble the Software
- Remove or alter any proprietary notices or labels on the Software
- Use the Software for any illegal purpose
- Transfer, sublicense, or resell the Software

3. Proprietary Rights
The Software and all copies, modifications, and derivative works are owned by Synthex AI and
are protected by copyright, trade secret, and other intellectual property laws.

4. Confidentiality
You agree to maintain the confidentiality of the Software and not disclose it to any third party
without Synthex AI's prior written consent.

5. Warranty Disclaimer
THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. SYNTHEX AI DISCLAIMS ALL
WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT.

6. Limitation of Liability
IN NO EVENT SHALL SYNTHEX AI BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY ARISING FROM,
OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

7. Termination
This license is effective until terminated. Your rights under this license will terminate
automatically without notice if you fail to comply with any of its terms.

8. Governing Law
This Agreement shall be governed by and construed in accordance with the laws of the State of
Delaware, without regard to its conflict of law provisions.

9. Contact Information
For licensing inquiries, please contact:
Synthex AI
Email: [email protected]
Website: https://synthex.ai

By using the Software, you acknowledge that you have read this Agreement, understand it, and agree
to be bound by its terms and conditions.
README.md
ADDED
@@ -0,0 +1,284 @@
# Synthex AI - Medical Text Generation Platform





> Synthex AI is a cutting-edge platform that generates HIPAA-compliant synthetic medical records for healthcare AI development, testing, and research.

## 🏢 Enterprise Solution

Synthex AI provides enterprise-grade synthetic medical data generation with:

- **HIPAA Compliance**: All generated data is synthetic and compliant with healthcare regulations
- **Enterprise Security**: SOC 2 Type II certified infrastructure
- **Custom Solutions**: Tailored generation for specific medical domains
- **API Access**: RESTful API for integration with existing systems
- **Dedicated Support**: 24/7 enterprise support and SLAs

## 💼 Use Cases

### Healthcare AI Development
- Train and test AI models without real patient data
- Generate diverse medical scenarios for model validation
- Create synthetic datasets for research and development

### Medical Software Testing
- Test EHR systems with realistic synthetic data
- Validate clinical decision support systems
- QA medical software with diverse patient scenarios

### Healthcare Research
- Conduct research with privacy-compliant data
- Generate synthetic datasets for medical studies
- Test hypotheses without patient privacy concerns

## 🚀 Features

### Core Features
- Multiple medical record types:
  - Clinical Notes
  - Discharge Summaries
  - Lab Reports
  - Prescriptions
  - Patient Intake Forms
- Advanced generation methods:
  - Hugging Face models (default)
  - Google Gemini API (premium)
  - Custom model integration (enterprise)
- Enterprise-grade UI/UX
- Multiple export formats (JSON, CSV, TXT)
- Batch generation capabilities
- API access (enterprise)

### Enterprise Features
- Custom model training
- Domain-specific generation
- Advanced data validation
- Integration support
- Dedicated infrastructure
- Custom SLAs

## 💰 Pricing

### Free Tier
- Basic medical record generation
- Limited to 100 records/month
- Community support
- Basic templates

### Pro Plan ($99/month)
- Up to 10,000 records/month
- Advanced generation features
- Priority support
- API access
- Custom templates

### Enterprise Plan (Custom)
- Unlimited generation
- Custom model training
- Dedicated support
- Custom integrations
- SLA guarantees
- On-premise deployment

## 🛠️ Technical Details

### Architecture
```
synthex/
├── app.py              # Main Streamlit application
├── src/
│   ├── generation/     # Core generation logic
│   ├── api/            # REST API endpoints
│   ├── validation/     # Data validation
│   └── enterprise/     # Enterprise features
├── data/
│   └── generated/      # Generated records storage
├── tests/              # Test suite
├── Dockerfile          # Docker configuration
└── requirements.txt    # Python dependencies
```

### API Reference

```python
from synthex import SynthexClient

# Initialize client
client = SynthexClient(api_key="your_api_key")

# Generate records
records = client.generate_records(
    record_type="clinical_note",
    count=100,
    options={
        "include_metadata": True,
        "custom_fields": ["patient_demographics", "vital_signs"]
    }
)

# Export data
client.export_records(
    records,
    format="json",
    destination="s3://your-bucket/path"
)
```

## 🔒 Security & Compliance

- HIPAA Compliance
- SOC 2 Type II Certification
- GDPR Compliance
- Data Encryption at Rest and in Transit
- Regular Security Audits
- Access Control and Audit Logging

## 🤝 Enterprise Support

- 24/7 Technical Support
- Dedicated Account Manager
- Custom Integration Support
- Training and Onboarding
- Regular Updates and Maintenance
- Custom Development Services

## 📞 Contact

### Sales Inquiries
- Email: [email protected]
- Phone: +1 (555) 123-4567
- [Schedule a Demo](https://synthex.ai/demo)

### Technical Support
- Email: [email protected]
- [Documentation](https://docs.synthex.ai)
- [API Reference](https://api.synthex.ai)

## 🌟 Why Choose Synthex AI?

1. **Enterprise-Ready**: Built for scale and security
2. **Compliance-First**: HIPAA and GDPR compliant
3. **Customizable**: Tailored to your needs
4. **Support**: Enterprise-grade support
5. **Innovation**: Cutting-edge AI technology

## 🚀 Getting Started

### Quick Start
```bash
# Install Synthex CLI
pip install synthex

# Initialize client
synthex init

# Generate records
synthex generate --type clinical_note --count 10
```

### Docker Deployment
```bash
# Pull image
docker pull synthex/synthex:latest

# Run container
docker run -p 8501:8501 synthex/synthex
```

## 📚 Documentation

- [User Guide](https://docs.synthex.ai/guide)
- [API Documentation](https://docs.synthex.ai/api)
- [Enterprise Guide](https://docs.synthex.ai/enterprise)
- [Security Whitepaper](https://docs.synthex.ai/security)

## 🙏 Acknowledgments

- Built with [Streamlit](https://streamlit.io/)
- Powered by [Hugging Face](https://huggingface.co/)
- Enterprise features by [Google Cloud](https://cloud.google.com/)

---

© 2024 Synthex AI. All rights reserved.

# Synthex Medical Text Generator

A synthetic medical text generator that creates realistic medical records using AI models. The application provides both a FastAPI backend and a Streamlit interface.

## Features

- Generate various types of medical records:
  - Clinical Notes
  - Discharge Summaries
  - Lab Reports
  - Prescriptions
  - Patient Intake Forms
- Support for multiple AI models:
  - Hugging Face models (default)
  - Google Gemini (optional)
- Two interfaces:
  - FastAPI with HTML frontend
  - Streamlit interface

## API Endpoints

- `GET /`: HTML interface
- `GET /record-types`: List available record types
- `POST /generate`: Generate medical records
```json
{
  "record_type": "clinical_note",
  "quantity": 1,
  "use_gemini": false,
  "include_metadata": true
}
```

## Deployment

### Local Development

1. Install dependencies:
```bash
pip install -r requirements.txt
```

2. Run FastAPI server:
```bash
uvicorn src.api.app:app --reload
```

3. Run Streamlit app (optional):
```bash
streamlit run app.py
```

### Docker Deployment

1. Build the Docker image:
```bash
docker build -t synthex-medical-generator .
```

2. Run the container:
```bash
docker run -p 8000:8000 synthex-medical-generator
```

### Hugging Face Spaces Deployment

1. Create a new Space on Hugging Face
2. Choose "Docker" as the SDK
3. Push this repository to your Space
4. The application will be automatically deployed

## Environment Variables

- `GEMINI_API_KEY`: Google Gemini API key (optional)

## License

MIT License
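For reference, here is a minimal client sketch against the `POST /generate` endpoint documented in the README above. The base URL and port are assumptions taken from the Dockerfile and the Docker deployment steps, and `requests` is already pinned in requirements.txt:

```python
import requests

# Sketch of a call to POST /generate, assuming the FastAPI server is
# running locally on port 8000 (per the Dockerfile and README above).
payload = {
    "record_type": "clinical_note",
    "quantity": 1,
    "use_gemini": False,
    "include_metadata": True,
}
resp = requests.post("http://localhost:8000/generate", json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()
print(f"Generated {data['total_generated']} record(s)")
for record in data["records"]:
    print(record)
```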
analyze_data_quality.py
ADDED
@@ -0,0 +1,174 @@
import json
import pandas as pd
from pathlib import Path
import logging
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Any
import re

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataQualityAnalyzer:
    def __init__(self, data_dir: str = "data/raw"):
        self.data_dir = Path(data_dir)
        self.stats = defaultdict(dict)

    def load_dataset(self, file_path: Path) -> List[Dict]:
        """Load a dataset from a JSON file"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            return []

    def analyze_text_quality(self, text: str) -> Dict[str, Any]:
        """Analyze quality metrics for a text"""
        if not text:
            return {
                "length": 0,
                "word_count": 0,
                "avg_word_length": 0,
                "has_numbers": False,
                "has_special_chars": False
            }

        words = text.split()
        return {
            "length": len(text),
            "word_count": len(words),
            "avg_word_length": sum(len(w) for w in words) / len(words) if words else 0,
            "has_numbers": bool(re.search(r'\d', text)),
            "has_special_chars": bool(re.search(r'[^a-zA-Z0-9\s.,!?-]', text))
        }

    def analyze_dataset(self, dataset_name: str, data: List[Dict]):
        """Analyze a single dataset"""
        if not data:
            logger.warning(f"No data found in {dataset_name}")
            return

        # Basic stats
        self.stats[dataset_name]["total_samples"] = len(data)

        # Text quality metrics
        title_metrics = []
        abstract_metrics = []

        for item in data:
            if "title" in item:
                title_metrics.append(self.analyze_text_quality(item["title"]))
            if "abstract" in item:
                abstract_metrics.append(self.analyze_text_quality(item["abstract"]))

        # Aggregate metrics
        if title_metrics:
            self.stats[dataset_name]["title"] = {
                "avg_length": sum(m["length"] for m in title_metrics) / len(title_metrics),
                "avg_word_count": sum(m["word_count"] for m in title_metrics) / len(title_metrics),
                "avg_word_length": sum(m["avg_word_length"] for m in title_metrics) / len(title_metrics),
                "has_numbers_ratio": sum(1 for m in title_metrics if m["has_numbers"]) / len(title_metrics),
                "has_special_chars_ratio": sum(1 for m in title_metrics if m["has_special_chars"]) / len(title_metrics)
            }

        if abstract_metrics:
            self.stats[dataset_name]["abstract"] = {
                "avg_length": sum(m["length"] for m in abstract_metrics) / len(abstract_metrics),
                "avg_word_count": sum(m["word_count"] for m in abstract_metrics) / len(abstract_metrics),
                "avg_word_length": sum(m["avg_word_length"] for m in abstract_metrics) / len(abstract_metrics),
                "has_numbers_ratio": sum(1 for m in abstract_metrics if m["has_numbers"]) / len(abstract_metrics),
                "has_special_chars_ratio": sum(1 for m in abstract_metrics if m["has_special_chars"]) / len(abstract_metrics)
            }
            # Keep the raw per-abstract lengths so plot_metrics() can draw a histogram
            self.stats[dataset_name]["abstract_lengths"] = [m["length"] for m in abstract_metrics]

        # Field presence
        fields = set()
        for item in data:
            fields.update(item.keys())
        self.stats[dataset_name]["fields"] = list(fields)

        # Year distribution (if available)
        if "year" in fields:
            years = [item["year"] for item in data if "year" in item]
            self.stats[dataset_name]["year_distribution"] = pd.Series(years).value_counts().to_dict()

    def analyze_all_datasets(self):
        """Analyze all datasets in the data directory"""
        for file_path in self.data_dir.glob("*.json"):
            dataset_name = file_path.stem
            logger.info(f"Analyzing dataset: {dataset_name}")
            data = self.load_dataset(file_path)
            self.analyze_dataset(dataset_name, data)

    def generate_report(self):
        """Generate a comprehensive report"""
        report = {
            "summary": {},
            "datasets": self.stats
        }

        # Overall summary
        total_samples = sum(stats["total_samples"] for stats in self.stats.values())
        report["summary"]["total_samples"] = total_samples
        report["summary"]["total_datasets"] = len(self.stats)

        # Save report
        report_file = self.data_dir.parent / "reports" / "data_quality_report.json"
        report_file.parent.mkdir(parents=True, exist_ok=True)

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        logger.info(f"Quality report saved to {report_file}")
        return report

    def plot_metrics(self):
        """Generate plots for key metrics"""
        plots_dir = self.data_dir.parent / "reports" / "plots"
        plots_dir.mkdir(parents=True, exist_ok=True)

        # Sample distribution
        plt.figure(figsize=(10, 6))
        samples = {name: stats["total_samples"] for name, stats in self.stats.items()}
        plt.bar(samples.keys(), samples.values())
        plt.xticks(rotation=45)
        plt.title("Sample Distribution Across Datasets")
        plt.tight_layout()
        plt.savefig(plots_dir / "sample_distribution.png")
        plt.close()

        # Text length distribution, built from the raw lengths stored in analyze_dataset()
        for dataset_name, stats in self.stats.items():
            if "abstract_lengths" in stats:
                plt.figure(figsize=(10, 6))
                plt.hist(stats["abstract_lengths"], bins=50)
                plt.title(f"Abstract Length Distribution - {dataset_name}")
                plt.xlabel("Length")
                plt.ylabel("Count")
                plt.tight_layout()
                plt.savefig(plots_dir / f"abstract_length_{dataset_name}.png")
                plt.close()

def main():
    analyzer = DataQualityAnalyzer()
    analyzer.analyze_all_datasets()
    report = analyzer.generate_report()
    analyzer.plot_metrics()

    # Print summary
    print("\nData Quality Summary:")
    print(f"Total samples: {report['summary']['total_samples']}")
    print(f"Total datasets: {report['summary']['total_datasets']}")
    print("\nPer Dataset Summary:")
    for dataset_name, stats in report["datasets"].items():
        print(f"\n{dataset_name}:")
        print(f"  Samples: {stats['total_samples']}")
        if "abstract" in stats:
            print(f"  Avg abstract length: {stats['abstract']['avg_length']:.1f}")
            print(f"  Avg words per abstract: {stats['abstract']['avg_word_count']:.1f}")

if __name__ == "__main__":
    main()
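The analyzer above reads only the optional `title`, `abstract`, and `year` fields of each record in `data/raw/*.json`. A minimal sketch of a compatible input file, with purely illustrative sample values:

```python
import json
from pathlib import Path

# Illustrative input for DataQualityAnalyzer: each data/raw/*.json file
# holds a list of records; only "title", "abstract", and "year" are read.
sample = [
    {
        "title": "Example study title",  # hypothetical value
        "abstract": "An example abstract long enough to measure.",  # hypothetical value
        "year": 2023,  # hypothetical value
    }
]
Path("data/raw").mkdir(parents=True, exist_ok=True)
with open("data/raw/example_dataset.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, indent=2)
```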
aniket.py
ADDED
File without changes
api.py
ADDED
@@ -0,0 +1,95 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional
import uvicorn
import sys
import os

# Add src directory to Python path
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

# Import the medical generator
from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY

app = FastAPI(
    title="Synthex Medical Text Generator API",
    description="API for generating synthetic medical records",
    version="1.0.0"
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Initialize the generator
generator = None

class GenerationRequest(BaseModel):
    record_type: str
    quantity: int = 1
    use_gemini: bool = False
    gemini_api_key: Optional[str] = None
    include_metadata: bool = True

class GenerationResponse(BaseModel):
    records: List[dict]
    total_generated: int

@app.on_event("startup")
async def startup_event():
    global generator
    try:
        generator = MedicalTextGenerator()
    except Exception as e:
        print(f"Error initializing generator: {str(e)}")

@app.get("/")
async def root():
    return {"message": "Welcome to Synthex Medical Text Generator API"}

@app.post("/generate", response_model=GenerationResponse)
async def generate_records(request: GenerationRequest):
    global generator

    if generator is None:
        try:
            generator = MedicalTextGenerator(gemini_api_key=request.gemini_api_key)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to initialize generator: {str(e)}")

    try:
        generated_records = []
        for _ in range(request.quantity):
            record = generator.generate_record(
                request.record_type,
                use_gemini=request.use_gemini
            )
            generated_records.append(record)

        return GenerationResponse(
            records=generated_records,
            total_generated=len(generated_records)
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")

@app.get("/record-types")
async def get_record_types():
    return {
        "record_types": [
            "clinical_note",
            "discharge_summary",
            "lab_report",
            "prescription",
            "patient_intake"
        ]
    }

if __name__ == "__main__":
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
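A quick sketch of discovering the accepted record types via `GET /record-types` before posting a generation request (the local address is an assumption for a dev run of the server above):

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local development address

# /record-types lists the values accepted in the "record_type" field.
types = requests.get(f"{BASE_URL}/record-types", timeout=10).json()["record_types"]
print("Available record types:", types)
```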
app.py
ADDED
@@ -0,0 +1,361 @@
"""
Synthex Medical Text Generator - MVP Streamlit App
Deploy this on Hugging Face Spaces for free hosting
"""

import streamlit as st
import json
import time
from datetime import datetime
import pandas as pd
import os
import sys
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add src directory to Python path
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

# Import the medical generator
from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY

# Page config
st.set_page_config(
    page_title="Synthex Medical Text Generator",
    page_icon="🏥",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
<style>
    /* Main container styling */
    .main {
        padding: 2rem;
        background-color: #f8f9fa;
    }

    /* Header styling */
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 1rem;
        padding: 1rem;
        background: linear-gradient(135deg, #1f77b4 0%, #2c9cdb 100%);
        color: white;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }

    .sub-header {
        font-size: 1.2rem;
        color: #666;
        text-align: center;
        margin-bottom: 2rem;
        padding: 0.5rem;
    }

    /* Card styling */
    .record-container {
        background-color: white;
        padding: 1.5rem;
        border-radius: 10px;
        border-left: 4px solid #1f77b4;
        margin: 1rem 0;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
        transition: transform 0.2s;
    }

    .record-container:hover {
        transform: translateY(-2px);
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    }

    /* Stats container styling */
    .stats-container {
        background-color: white;
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1rem 0;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
    }

    /* Button styling */
    .stButton>button {
        width: 100%;
        border-radius: 5px;
        height: 3em;
        font-weight: bold;
        transition: all 0.3s;
    }

    .stButton>button:hover {
        transform: translateY(-2px);
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    }

    /* Metric styling */
    .stMetric {
        background-color: #f8f9fa;
        padding: 1rem;
        border-radius: 5px;
        text-align: center;
    }

    /* Sidebar styling */
    .sidebar .sidebar-content {
        background-color: #f8f9fa;
    }

    /* Progress bar styling */
    .stProgress > div > div {
        background-color: #1f77b4;
    }

    /* Success message styling */
    .stSuccess {
        padding: 1rem;
        border-radius: 5px;
        background-color: #d4edda;
        color: #155724;
        margin: 1rem 0;
    }

    /* Error message styling */
    .stError {
        padding: 1rem;
        border-radius: 5px;
        background-color: #f8d7da;
        color: #721c24;
        margin: 1rem 0;
    }

    /* Expander styling */
    .streamlit-expanderHeader {
        font-size: 1.1rem;
        font-weight: bold;
        color: #1f77b4;
    }

    /* Text area styling */
    .stTextArea textarea {
        font-family: monospace;
        font-size: 0.9rem;
        line-height: 1.5;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state
if 'generated_records' not in st.session_state:
    st.session_state.generated_records = []
if 'total_generated' not in st.session_state:
    st.session_state.total_generated = 0
if 'generator' not in st.session_state:
    st.session_state.generator = None

# Header
st.markdown('<div class="main-header">🏥 Synthex Medical Text Generator</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Generate synthetic medical records for AI training and testing</div>', unsafe_allow_html=True)

# Add a status message area
status_area = st.empty()

# Sidebar
with st.sidebar:
    st.markdown("### ⚙️ Configuration")

    # API Key section
    with st.expander("🔑 API Settings", expanded=False):
        gemini_api_key = st.text_input(
            "Gemini API Key",
            value=os.getenv('GEMINI_API_KEY', ''),
            type="password",
            help="Enter your Google Gemini API key for better generation quality"
        )

    # Record settings
    st.markdown("### 📝 Record Settings")
    record_type = st.selectbox(
        "Select Record Type",
        ["clinical_note", "discharge_summary", "lab_report", "prescription", "patient_intake"],
        format_func=lambda x: x.replace("_", " ").title()
    )

    quantity = st.slider("Number of Records", 1, 20, 5)

    # Generation settings
    st.markdown("### 🤖 Generation Settings")
    use_gemini = st.checkbox(
        "Use Gemini API",
        value=False,
        help="Uses Google Gemini API for better quality generation"
    )

    # Advanced options
    with st.expander("⚡ Advanced Options"):
        include_metadata = st.checkbox("Include Metadata", value=True)
        export_format = st.selectbox("Export Format", ["JSON", "CSV", "TXT"])

# Main content with better organization
col1, col2 = st.columns([2, 1])

with col1:
    st.markdown("### 📝 Generate Records")

    # Generation button with better styling
    if st.button("🚀 Generate Records", type="primary", use_container_width=True):
        status_area.info("Initializing generator...")

        # Initialize generator if not already done
        if st.session_state.generator is None:
            try:
                with st.spinner("Initializing medical text generator..."):
                    st.session_state.generator = MedicalTextGenerator(gemini_api_key=gemini_api_key)
                status_area.success("Generator initialized successfully!")
            except Exception as e:
                status_area.error(f"Error initializing generator: {str(e)}")
                st.stop()

        # Generate records with progress
        progress_bar = st.progress(0)
        status_text = st.empty()

        generated_records = []

        for i in range(quantity):
            status_text.text(f"Generating record {i+1} of {quantity}...")
            progress_bar.progress((i + 1) / quantity)

            try:
                record = st.session_state.generator.generate_record(record_type, use_gemini=use_gemini)
                generated_records.append(record)

                # Rate limiting
                if use_gemini:
                    time.sleep(1)

            except Exception as e:
                logger.error(f"Failed to generate record {i+1}: {str(e)}")
                status_area.error(f"Failed to generate record {i+1}: {str(e)}")
                continue

        # Update session state
        if generated_records:
            st.session_state.generated_records.extend(generated_records)
            st.session_state.total_generated += len(generated_records)

            status_text.text("✅ Generation complete!")
            progress_bar.progress(1.0)

            status_area.success(f"Successfully generated {len(generated_records)} medical records!")

    # Display generated records with better organization
    if st.session_state.generated_records:
        st.markdown("### 📋 Generated Records")

        # Filters with better layout
        col_filter1, col_filter2 = st.columns(2)
        with col_filter1:
            filter_type = st.selectbox(
                "Filter by Type",
                ["All"] + list(set([r.get('type', 'Unknown') for r in st.session_state.generated_records]))
            )
        with col_filter2:
            records_per_page = st.selectbox("Records per page", [5, 10, 20, 50])

        # Filter records
        filtered_records = st.session_state.generated_records
        if filter_type != "All":
            filtered_records = [r for r in filtered_records if r.get('type', 'Unknown') == filter_type]

        # Pagination
        total_records = len(filtered_records)
        total_pages = (total_records - 1) // records_per_page + 1

        if total_pages > 1:
            page = st.selectbox("Page", range(1, total_pages + 1))
            start_idx = (page - 1) * records_per_page
            end_idx = start_idx + records_per_page
            page_records = filtered_records[start_idx:end_idx]
        else:
            page_records = filtered_records

        # Display records with better styling
        for i, record in enumerate(page_records):
            with st.expander(f"Record {record.get('id', 'Unknown')} - {record.get('type', 'Unknown').replace('_', ' ').title()}"):
                if include_metadata:
                    col_meta1, col_meta2, col_meta3 = st.columns(3)
                    with col_meta1:
                        st.metric("Type", record.get('type', 'Unknown').replace('_', ' ').title())
                    with col_meta2:
                        st.metric("Generated", record.get('timestamp', 'N/A'))
                    with col_meta3:
                        st.metric("Source", record.get('source', 'Hugging Face'))

                st.markdown('<div class="record-container">', unsafe_allow_html=True)
                st.text_area("Content", record.get('text', 'No content available'), height=200, key=f"record_{i}")
                st.markdown('</div>', unsafe_allow_html=True)

with col2:
    st.markdown("### 📊 Statistics")

    # Stats container with better styling
    st.markdown('<div class="stats-container">', unsafe_allow_html=True)

    # Total records
    st.metric("Total Records Generated", st.session_state.total_generated)

    # Record type distribution with better visualization
    if st.session_state.generated_records:
        type_counts = pd.Series([r.get('type', 'Unknown') for r in st.session_state.generated_records]).value_counts()
        st.markdown("#### Record Type Distribution")
        st.bar_chart(type_counts)

    # Export options with better organization
    st.markdown("#### 💾 Export Data")
    if st.session_state.generated_records:
        if export_format == "JSON":
            json_str = json.dumps(st.session_state.generated_records, indent=2)
            st.download_button(
                "📥 Download JSON",
                json_str,
                file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json",
                use_container_width=True
            )
        elif export_format == "CSV":
            df = pd.DataFrame(st.session_state.generated_records)
            csv = df.to_csv(index=False)
            st.download_button(
                "📥 Download CSV",
                csv,
                file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv",
                use_container_width=True
            )
        elif export_format == "TXT":
            txt = "\n\n".join([f"Record {r.get('id', 'Unknown')} ({r.get('type', 'Unknown')}):\n{r.get('text', 'No content available')}" for r in st.session_state.generated_records])
            st.download_button(
                "📥 Download TXT",
                txt,
                file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                mime="text/plain",
                use_container_width=True
            )

    st.markdown('</div>', unsafe_allow_html=True)

# Add a footer
st.markdown("---")
st.markdown("""
<div style='text-align: center; color: #666;'>
    <p>Built with ❤️ using Streamlit | Synthex Medical Text Generator</p>
</div>
""", unsafe_allow_html=True)
batch_generate.py
ADDED
@@ -0,0 +1,57 @@
import os
import json
import random
import time
from pathlib import Path
from src.generation.medical_generator import MedicalTextGenerator

# Check for Gemini API key
if not os.getenv('GEMINI_API_KEY'):
    print("Please set the GEMINI_API_KEY environment variable:")
    print("Windows PowerShell: $env:GEMINI_API_KEY='your-api-key-here'")
    print("Windows CMD: set GEMINI_API_KEY=your-api-key-here")
    exit(1)

# Ensure the output directory exists
output_dir = Path("data/synthetic")
output_dir.mkdir(parents=True, exist_ok=True)

# Initialize the generator
generator = MedicalTextGenerator()

# Define supported record types (using the keys from the generator's templates)
record_types = ["clinical_note", "discharge_summary", "lab_report"]

# Generate 100 mixed records
records = []
for i in range(100):
    # Randomly select record type
    record_type = random.choice(record_types)

    # Generate record using Hugging Face
    try:
        record = generator.generate_record(record_type, use_gemini=False)
        print(f"Generated record {i+1}/100: {record_type}")

        # Append record details
        records.append({
            "id": i + 1,
            "type": record_type,
            "content": record,
            "generator": "Hugging Face",
            "generated_at": time.strftime("%Y-%m-%d %H:%M:%S")
        })

        # Respect rate limits (e.g., 4 seconds between calls)
        time.sleep(4)

    except Exception as e:
        print(f"Error generating record {i+1}: {str(e)}")
        continue

# Save records to a JSON file
output_file = output_dir / "synthetic_records.json"
with open(output_file, "w") as f:
    json.dump(records, f, indent=2)

print(f"\nGenerated {len(records)} records and saved to {output_file}")
data/processed/.gitkeep
ADDED
File without changes
data/reports/plots/sample_distribution.png
ADDED
(binary file: image)
data/synthetic/.gitkeep
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,29 @@
# Core ML libraries
torch==2.2.1
transformers==4.38.2
datasets>=2.12.0
huggingface_hub>=0.15.0

# Web framework
streamlit==1.32.0
gradio>=3.35.0
fastapi>=0.115.2
uvicorn>=0.24.0

# Data processing
pandas==2.2.1
numpy==1.26.4
requests>=2.31.0
beautifulsoup4>=4.12.0
lxml>=4.9.0

# Medical NLP
spacy>=3.6.0
scikit-learn>=1.3.0

# API integration
google-generativeai==0.3.2

# Utilities
python-dotenv==1.0.1
tqdm>=4.65.0
setup.py
ADDED
@@ -0,0 +1,77 @@
import os
import sys
import subprocess
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def check_python_version():
    """Check if Python version is compatible"""
    if sys.version_info < (3, 11):
        logger.error("Python 3.11 or higher is required")
        sys.exit(1)
    logger.info(f"Python version {sys.version_info.major}.{sys.version_info.minor} detected")

def create_virtual_environment():
    """Create and activate virtual environment"""
    venv_name = "synthex_env"
    if not os.path.exists(venv_name):
        logger.info(f"Creating virtual environment: {venv_name}")
        subprocess.run([sys.executable, "-m", "venv", venv_name], check=True)
    else:
        logger.info(f"Virtual environment {venv_name} already exists")

def install_requirements():
    """Install required packages"""
    logger.info("Installing requirements...")
    subprocess.run([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"], check=True)

def create_directories():
    """Create necessary directories"""
    directories = [
        "data/raw",
        "data/processed",
        "data/reports",
        "data/reports/plots"
    ]
    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory: {directory}")

def setup_environment():
    """Setup the complete environment"""
    try:
        logger.info("Starting environment setup...")

        # Check Python version
        check_python_version()

        # Create virtual environment
        create_virtual_environment()

        # Install requirements
        install_requirements()

        # Create directories
        create_directories()

        logger.info("Environment setup completed successfully!")
        logger.info("\nNext steps:")
        logger.info("1. Activate the virtual environment:")
        logger.info("   - Windows: synthex_env\\Scripts\\activate")
        logger.info("   - Unix/MacOS: source synthex_env/bin/activate")
        logger.info("2. Run data collection: python setup_data.py")
        logger.info("3. Analyze data quality: python analyze_data_quality.py")

    except subprocess.CalledProcessError as e:
        logger.error(f"Error during setup: {str(e)}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    setup_environment()
setup_data.py
ADDED
@@ -0,0 +1,48 @@
import os
import sys
from pathlib import Path
import logging
import subprocess

# Add src directory to Python path
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def setup_data_directories():
    """Create necessary data directories"""
    directories = [
        "data/raw",
        "data/processed",
        "data/synthetic"
    ]

    for directory in directories:
        path = Path(directory)
        path.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory: {directory}")

        # Create .gitkeep file
        gitkeep = path / ".gitkeep"
        gitkeep.touch(exist_ok=True)
        logger.info(f"Created .gitkeep in {directory}")

def main():
    """Setup data directories and run collection"""
    logger.info("Setting up data directories...")
    setup_data_directories()

    logger.info("Running data collection script via subprocess...")
    result = subprocess.run([sys.executable, 'src/data_collection/data_collection.py'])
    if result.returncode != 0:
        logger.error(f"Data collection script failed with exit code {result.returncode}")
    else:
        logger.info("Data collection completed successfully.")

if __name__ == "__main__":
    main()
src/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
Synthex Medical Text Generator
A tool for generating synthetic medical records
"""

__version__ = "0.1.0"
src/api/app.py
ADDED
@@ -0,0 +1,95 @@
1 |
+
from fastapi import FastAPI, HTTPException
|
2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
3 |
+
from fastapi.staticfiles import StaticFiles
|
4 |
+
from fastapi.responses import FileResponse
|
5 |
+
from pydantic import BaseModel
|
6 |
+
from typing import List, Optional
|
7 |
+
import sys
|
8 |
+
import os
|
9 |
+
import logging
|
10 |
+
|
11 |
+
# Add src directory to Python path
|
12 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
13 |
+
|
14 |
+
from generation.medical_generator import MedicalTextGenerator
|
15 |
+
|
16 |
+
# Setup logging
|
17 |
+
logging.basicConfig(level=logging.INFO)
|
18 |
+
logger = logging.getLogger(__name__)
|
19 |
+
|
20 |
+
app = FastAPI(
|
21 |
+
title="Synthex Medical Text Generator API",
|
22 |
+
description="API for generating synthetic medical records",
|
23 |
+
version="1.0.0"
|
24 |
+
)
|
25 |
+
|
26 |
+
# Add CORS middleware
|
27 |
+
app.add_middleware(
|
28 |
+
CORSMiddleware,
|
29 |
+
allow_origins=["*"], # Allows all origins
|
30 |
+
allow_credentials=True,
|
31 |
+
allow_methods=["*"], # Allows all methods
|
32 |
+
allow_headers=["*"], # Allows all headers
|
33 |
+
)
|
34 |
+
|
35 |
+
# Mount static files
|
36 |
+
app.mount("/static", StaticFiles(directory="src/web"), name="static")
|
37 |
+
|
38 |
+
# Initialize generator
|
39 |
+
generator = MedicalTextGenerator()
|
40 |
+
|
41 |
+
class GenerationRequest(BaseModel):
|
42 |
+
record_type: str
|
43 |
+
quantity: int = 1
|
44 |
+
use_gemini: bool = False
|
45 |
+
include_metadata: bool = True
|
46 |
+
|
47 |
+
class MedicalRecord(BaseModel):
|
48 |
+
id: str
|
49 |
+
type: str
|
50 |
+
text: str
|
51 |
+
timestamp: str
|
52 |
+
source: str
|
53 |
+
|
54 |
+
class GenerationResponse(BaseModel):
|
55 |
+
records: List[MedicalRecord]
|
56 |
+
total_generated: int
|
57 |
+
|
58 |
+
@app.get("/")
|
59 |
+
async def read_root():
|
60 |
+
"""Serve the HTML interface"""
|
61 |
+
return FileResponse("src/web/index.html")
|
62 |
+
|
63 |
+
@app.get("/record-types")
|
64 |
+
async def get_record_types():
|
65 |
+
"""Get available record types"""
|
66 |
+
return {"record_types": list(generator.templates.keys())}
|
67 |
+
|
68 |
+
@app.post("/generate", response_model=GenerationResponse)
|
69 |
+
async def generate_records(request: GenerationRequest):
|
70 |
+
"""Generate synthetic medical records"""
|
71 |
+
try:
|
72 |
+
if request.record_type not in generator.templates:
|
73 |
+
raise HTTPException(status_code=400, detail=f"Invalid record type. Available types: {list(generator.templates.keys())}")
|
74 |
+
|
75 |
+
if request.quantity < 1 or request.quantity > 10:
|
76 |
+
raise HTTPException(status_code=400, detail="Quantity must be between 1 and 10")
|
77 |
+
|
78 |
+
records = generator.batch_generate(
|
79 |
+
record_type=request.record_type,
|
80 |
+
count=request.quantity,
|
81 |
+
use_gemini=request.use_gemini
|
82 |
+
)
|
83 |
+
|
84 |
+
return {
|
85 |
+
"records": records,
|
86 |
+
"total_generated": len(records)
|
87 |
+
}
|
88 |
+
|
89 |
+
except Exception as e:
|
90 |
+
logger.error(f"Error generating records: {str(e)}")
|
91 |
+
raise HTTPException(status_code=500, detail=str(e))
|
92 |
+
|
93 |
+
if __name__ == "__main__":
|
94 |
+
import uvicorn
|
95 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
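Once this app is running (for example via the uvicorn call in the __main__ block above), both endpoints can be exercised from Python. A minimal sketch, assuming the server is reachable on localhost:8000; the endpoint paths and request fields are taken from the code above, everything else is illustrative:

import requests

# List the available record types (served by the /record-types endpoint)
resp = requests.get("http://localhost:8000/record-types")
print(resp.json())

# Request two synthetic clinical notes from the /generate endpoint
payload = {
    "record_type": "clinical_note",
    "quantity": 2,
    "use_gemini": False,
    "include_metadata": True,
}
resp = requests.post("http://localhost:8000/generate", json=payload)
resp.raise_for_status()
for record in resp.json()["records"]:
    print(record["id"], record["source"])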
src/data_collection/data_collection.py
ADDED
@@ -0,0 +1,440 @@
"""
Medical Data Collection Pipeline for Synthex MVP
Collects medical text from free sources for training data
"""

import requests
import pandas as pd
from datasets import load_dataset
import time
import json
from pathlib import Path
from typing import List, Dict, Any
import logging
import sys
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
from datetime import datetime

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('data_collection.log')
    ]
)
logger = logging.getLogger(__name__)

class MedicalDataCollector:
    def __init__(self, output_dir: str = "data/raw"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.stats = {
            "total_samples": 0,
            "sources": {},
            "errors": [],
            "start_time": datetime.now()
        }
        logger.info(f"Initialized MedicalDataCollector with output directory: {self.output_dir}")

    def collect_huggingface_datasets(self) -> Dict[str, List]:
        """Collect medical datasets from Hugging Face Hub"""

        # Only include datasets that are known to exist and are medical-related
        datasets_to_collect = [
            "medical_questions_pairs",
            "medalpaca/medical_meadow_medical_flashcards",
            "gamino/wiki_medical_terms",
            ("pubmed_qa", "pqa_artificial")  # pubmed_qa requires a config
        ]

        collected_data = {}

        for dataset_entry in tqdm(datasets_to_collect, desc="Collecting Hugging Face datasets"):
            try:
                if isinstance(dataset_entry, tuple):
                    dataset_name, config = dataset_entry
                    logger.info(f"Loading dataset: {dataset_name} with config: {config}")
                    dataset = load_dataset(dataset_name, config, split="train")
                    dataset_key = f"{dataset_name}_{config}"
                else:
                    dataset_name = dataset_entry
                    logger.info(f"Loading dataset: {dataset_name}")
                    dataset = load_dataset(dataset_name, split="train")
                    dataset_key = dataset_name

                # Convert to list of dictionaries
                data_list = []
                for item in dataset:
                    processed_item = self._process_dataset_item(item)
                    if processed_item:
                        data_list.append(processed_item)

                if data_list:
                    collected_data[dataset_key] = data_list
                    self.stats["sources"][dataset_key] = len(data_list)
                    self.stats["total_samples"] += len(data_list)

                    # Save to file
                    output_file = self.output_dir / f"{dataset_key.replace('/', '_')}.json"
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(data_list, f, indent=2, ensure_ascii=False)

                    logger.info(f"Saved {len(data_list)} samples from {dataset_key} to {output_file}")
                else:
                    logger.warning(f"No valid data found in dataset: {dataset_key}")

                time.sleep(1)  # Be respectful to APIs

            except Exception as e:
                error_msg = f"Failed to load {dataset_entry}: {str(e)}"
                logger.error(error_msg, exc_info=True)
                self.stats["errors"].append(error_msg)
                continue

        return collected_data

    def collect_pubmed_abstracts(self, queries: List[str] = None, max_results: int = 1000) -> List[Dict]:
        """Collect PubMed abstracts via API"""

        if queries is None:
            queries = [
                "clinical notes",
                "medical case reports",
                "patient discharge summaries",
                "medical laboratory reports",
                "medical imaging reports"
            ]

        all_abstracts = []

        for query in tqdm(queries, desc="Collecting PubMed abstracts"):
            try:
                abstracts = self._collect_pubmed_query(query, max_results)
                all_abstracts.extend(abstracts)
                self.stats["sources"]["pubmed_" + query.replace(" ", "_")] = len(abstracts)
                self.stats["total_samples"] += len(abstracts)

            except Exception as e:
                error_msg = f"Failed to collect PubMed abstracts for {query}: {str(e)}"
                logger.error(error_msg)
                self.stats["errors"].append(error_msg)
                continue

        # Save all abstracts
        if all_abstracts:
            output_file = self.output_dir / "pubmed_abstracts.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_abstracts, f, indent=2, ensure_ascii=False)

        return all_abstracts

    def _collect_pubmed_query(self, query: str, max_results: int) -> List[Dict]:
        """Collect PubMed abstracts for a specific query"""

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        search_url = f"{base_url}esearch.fcgi"

        search_params = {
            "db": "pubmed",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
            "sort": "relevance"
        }

        try:
            response = requests.get(search_url, params=search_params)
            response.raise_for_status()  # Raise exception for bad status codes
            search_results = response.json()

            # Check rate limits
            rate_limit = int(response.headers.get('X-RateLimit-Limit', '3'))
            rate_remaining = int(response.headers.get('X-RateLimit-Remaining', '0'))
            logger.info(f"Rate limit: {rate_remaining}/{rate_limit} requests remaining")

            if rate_remaining <= 1:
                logger.warning("Rate limit nearly reached, waiting 60 seconds")
                time.sleep(60)

        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch PubMed search results for query '{query}': {str(e)}")
            return []
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse PubMed search results for query '{query}': {str(e)}")
            return []

        if "esearchresult" not in search_results:
            logger.warning(f"No search results found for query '{query}'")
            return []

        id_list = search_results["esearchresult"]["idlist"]
        abstracts = []
        batch_size = 100

        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i+batch_size]
            ids_str = ",".join(batch_ids)

            fetch_url = f"{base_url}efetch.fcgi"
            fetch_params = {
                "db": "pubmed",
                "id": ids_str,
                "retmode": "xml"
            }

            try:
                response = requests.get(fetch_url, params=fetch_params)
                response.raise_for_status()

                # Check rate limits
                rate_limit = int(response.headers.get('X-RateLimit-Limit', '3'))
                rate_remaining = int(response.headers.get('X-RateLimit-Remaining', '0'))
                logger.info(f"Rate limit: {rate_remaining}/{rate_limit} requests remaining")

                if rate_remaining <= 1:
                    logger.warning("Rate limit nearly reached, waiting 60 seconds")
                    time.sleep(60)

                # Parse XML with proper features
                soup = BeautifulSoup(response.text, 'lxml', features="xml")

            except requests.exceptions.RequestException as e:
                logger.error(f"Failed to fetch PubMed article batch {i//batch_size + 1}: {str(e)}")
                continue
            except Exception as e:
                logger.error(f"Failed to parse PubMed article batch {i//batch_size + 1}: {str(e)}")
                continue

            for article in soup.find_all('PubmedArticle'):
                try:
                    abstract = article.find('Abstract')
                    if abstract:
                        abstract_text = abstract.get_text().strip()
                        if len(abstract_text) > 100:  # Filter out very short abstracts
                            title = article.find('ArticleTitle')
                            if not title:
                                continue
                            title_text = title.get_text().strip()

                            pub_date = article.find('PubDate')
                            year = "Unknown"
                            if pub_date and pub_date.find('Year'):
                                year = pub_date.find('Year').get_text().strip()

                            abstracts.append({
                                "title": title_text,
                                "abstract": abstract_text,
                                "year": year,
                                "source": "pubmed",
                                "query": query
                            })
                except Exception as e:
                    logger.debug(f"Failed to process article in batch {i//batch_size + 1}: {str(e)}")
                    continue

            # Always wait between batches to respect rate limits
            time.sleep(1)

        logger.info(f"Collected {len(abstracts)} abstracts for query '{query}'")
        return abstracts

    def create_training_dataset(self) -> pd.DataFrame:
        """Combine all collected data into training dataset"""

        all_texts = []

        # Load all collected datasets
        for json_file in tqdm(list(self.output_dir.glob("*.json")), desc="Processing collected data"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Extract text content
                for item in data:
                    text_content = self._extract_text_content(item)
                    if text_content:
                        processed_text = self._clean_text(text_content)
                        if processed_text:
                            all_texts.append({
                                "text": processed_text,
                                "source": json_file.stem,
                                "length": len(processed_text),
                                "type": self._determine_text_type(processed_text)
                            })

            except Exception as e:
                error_msg = f"Failed to process {json_file}: {str(e)}"
                logger.error(error_msg)
                self.stats["errors"].append(error_msg)
                continue

        # Create DataFrame
        df = pd.DataFrame(all_texts)

        # Basic filtering
        df = df[df['length'] > 100]   # Remove very short texts
        df = df[df['length'] < 5000]  # Remove very long texts

        # Remove duplicates
        df = df.drop_duplicates(subset=['text'])

        # Save processed dataset
        output_file = self.output_dir.parent / "processed" / "training_data.csv"
        output_file.parent.mkdir(exist_ok=True)
        df.to_csv(output_file, index=False, encoding='utf-8')

        # Update stats
        self.stats["final_samples"] = len(df)
        self.stats["text_types"] = df['type'].value_counts().to_dict()

        logger.info(f"Created training dataset with {len(df)} samples")
        return df

    def _process_dataset_item(self, item: Dict) -> Dict:
        """Process and validate a dataset item"""
        try:
            # Extract text content
            text = self._extract_text_content(item)
            if not text or len(text) < 100:
                return None

            # Clean text
            cleaned_text = self._clean_text(text)
            if not cleaned_text:
                return None

            # Create processed item
            processed = {
                "text": cleaned_text,
                "source": "huggingface",
                "type": self._determine_text_type(cleaned_text)
            }

            # Add metadata if available
            for key in ['title', 'question', 'answer', 'instruction']:
                if key in item:
                    processed[key] = str(item[key])

            return processed

        except Exception:
            return None

    def _extract_text_content(self, item: Dict) -> str:
        """Extract relevant text content from dataset item"""

        # Common text fields in medical datasets
        text_fields = ['text', 'content', 'abstract', 'question', 'answer',
                       'instruction', 'output', 'input', 'context']

        for field in text_fields:
            if field in item and item[field]:
                return str(item[field])

        # Fallback: combine multiple fields
        combined_text = ""
        for key, value in item.items():
            if isinstance(value, str) and len(value) > 20:
                combined_text += f"{value} "

        return combined_text.strip()

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        if not text:
            return ""

        # Remove special characters and normalize whitespace
        text = re.sub(r'[^\w\s.,;:!?()-]', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        # Remove common noise
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        return text.strip()

    def _determine_text_type(self, text: str) -> str:
        """Determine the type of medical text"""
        text = text.lower()

        if any(term in text for term in ['discharge', 'summary', 'discharge summary']):
            return 'discharge_summary'
        elif any(term in text for term in ['lab', 'laboratory', 'test results']):
            return 'lab_report'
        elif any(term in text for term in ['prescription', 'medication', 'drug']):
            return 'prescription'
        elif any(term in text for term in ['question', 'answer', 'qa']):
            return 'medical_qa'
        else:
            return 'clinical_note'

    def generate_report(self) -> Dict:
        """Generate a report of the data collection process"""
        # Convert all datetime objects to strings
        for k, v in self.stats.items():
            if isinstance(v, datetime):
                self.stats[k] = str(v)
        self.stats["end_time"] = str(datetime.now())
        if isinstance(self.stats["start_time"], datetime):
            self.stats["start_time"] = str(self.stats["start_time"])
        # Calculate duration as string
        try:
            start_dt = datetime.fromisoformat(self.stats["start_time"])
            end_dt = datetime.fromisoformat(self.stats["end_time"])
            self.stats["duration"] = str(end_dt - start_dt)
        except Exception:
            self.stats["duration"] = "unknown"

        report_file = self.output_dir.parent / "reports" / "collection_report.json"
        report_file.parent.mkdir(exist_ok=True)

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2, ensure_ascii=False)

        return self.stats

def main():
    """Run data collection pipeline"""

    try:
        collector = MedicalDataCollector()

        # Collect from Hugging Face
        logger.info("Starting Hugging Face dataset collection...")
        hf_data = collector.collect_huggingface_datasets()

        # Collect from PubMed
        logger.info("Starting PubMed collection...")
        pubmed_data = collector.collect_pubmed_abstracts()

        # Create training dataset
        logger.info("Creating training dataset...")
        training_df = collector.create_training_dataset()

        # Generate report
        report = collector.generate_report()

        # Print summary
        logger.info("\nData Collection Summary:")
        logger.info(f"Total samples collected: {report['total_samples']}")
        logger.info(f"Final training samples: {report['final_samples']}")
        logger.info(f"Duration: {report['duration']}")
        logger.info("\nText types distribution:")
        for type_, count in report['text_types'].items():
            logger.info(f"- {type_}: {count}")

        if report['errors']:
            logger.warning(f"\nEncountered {len(report['errors'])} errors during collection")

    except Exception as e:
        logger.error(f"Data collection failed: {str(e)}", exc_info=True)
        sys.exit(1)

if __name__ == "__main__":
    main()
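For a smaller run than the full main() pipeline performs, the collector can also be driven directly. A minimal sketch, assuming it is executed from the repository root so the default data/raw output directory resolves correctly; the query and result count here are illustrative:

from src.data_collection.data_collection import MedicalDataCollector

# Collect a small PubMed sample, then build the filtered CSV dataset
collector = MedicalDataCollector(output_dir="data/raw")
abstracts = collector.collect_pubmed_abstracts(
    queries=["patient discharge summaries"],  # one of the default queries
    max_results=50,
)
df = collector.create_training_dataset()
print(df["type"].value_counts())
report = collector.generate_report()  # writes data/reports/collection_report.json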
src/generation/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""
Synthex Medical Text Generation Package
"""

from .medical_generator import MedicalTextGenerator

__all__ = ['MedicalTextGenerator']
src/generation/medical_generator.py
ADDED
@@ -0,0 +1,441 @@
"""
Basic Medical Text Generator for Synthex MVP
Uses Hugging Face models and Gemini API
"""

import google.generativeai as genai
from transformers import pipeline
import random
import time
import json
from typing import List, Dict, Optional
import logging
from datetime import datetime
import os
import sys

# Setup logging with better formatting
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Get Gemini API key from environment variable
DEFAULT_GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', '')

class MedicalTextGenerator:
    def __init__(self, gemini_api_key: Optional[str] = None):
        """Initialize the medical text generator"""

        self.gemini_api_key = gemini_api_key or DEFAULT_GEMINI_API_KEY
        if not self.gemini_api_key:
            logger.warning("No Gemini API key provided. Using Hugging Face model only.")

        self.hf_model = None
        self.gemini_model = None

        # Initialize models
        self._setup_models()

        # Medical record templates
        self.templates = {
            "clinical_note": self._get_clinical_note_template(),
            "discharge_summary": self._get_discharge_summary_template(),
            "lab_report": self._get_lab_report_template(),
            "prescription": self._get_prescription_template(),
            "patient_intake": self._get_patient_intake_template()
        }

    def _setup_models(self):
        """Setup Hugging Face and Gemini models"""

        try:
            # Setup Hugging Face model (free)
            logger.info("Loading Hugging Face medical model...")

            # Use text generation pipeline with a smaller model and CPU device
            self.hf_generator = pipeline(
                "text-generation",
                model="distilgpt2",
                max_length=512,
                do_sample=True,
                temperature=0.7,
                device=-1,       # Force CPU usage to avoid CUDA issues
                truncation=True  # Add truncation to avoid warnings
            )

            logger.info("Hugging Face model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load Hugging Face model: {str(e)}")
            self.hf_generator = None
            logger.info("Falling back to template-based generation")

        try:
            # Setup Gemini (free tier)
            if self.gemini_api_key:
                genai.configure(api_key=self.gemini_api_key)
                # List available models
                for m in genai.list_models():
                    logger.info(f"Available model: {m.name}")
                self.gemini_model = genai.GenerativeModel('gemini-pro')
                logger.info("Gemini model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load Gemini model: {str(e)}")
            self.gemini_model = None
            logger.info("Gemini API will not be available")

    def generate_record(self, record_type: str, use_gemini: bool = False) -> Dict:
        """Generate a synthetic medical record"""

        if record_type not in self.templates:
            raise ValueError(f"Unknown record type: {record_type}")

        template = self.templates[record_type]
        content = None

        # Try generation methods in order of preference
        if use_gemini and self.gemini_model:
            try:
                content = self._generate_with_gemini(template)
                logger.info("Successfully generated record using Gemini")
            except Exception as e:
                logger.error(f"Gemini generation failed: {str(e)}")
                content = None

        if content is None and self.hf_generator:
            try:
                content = self._generate_with_huggingface(template)
                logger.info("Successfully generated record using Hugging Face")
            except Exception as e:
                logger.error(f"Hugging Face generation failed: {str(e)}")
                content = None

        if content is None:
            try:
                content = self._generate_with_template(template)
                logger.info("Successfully generated record using template")
            except Exception as e:
                logger.error(f"Template generation failed: {str(e)}")
                raise RuntimeError("All generation methods failed")

        return {
            "id": self._generate_id(),
            "type": record_type,
            "text": content,
            "timestamp": datetime.now().isoformat(),
            "source": "Gemini" if use_gemini and self.gemini_model else "Hugging Face" if self.hf_generator else "Template"
        }

    def _generate_with_gemini(self, template: str) -> str:
        """Generate text using Gemini API"""

        try:
            prompt = f"""
            Generate a realistic but completely fictional medical record using this template:

            {template}

            Requirements:
            - Use fictional patient names and details
            - Include medically accurate terminology
            - Make it realistic but not based on any real patient
            - Include specific medical details and measurements
            - Follow standard medical documentation format
            """

            response = self.gemini_model.generate_content(prompt)
            return response.text

        except Exception as e:
            logger.error(f"Gemini generation failed: {str(e)}")
            raise

    def _generate_with_huggingface(self, template: str) -> str:
        """Generate text using Hugging Face model"""
        try:
            # First fill the template with random values
            fake_data = {
                "patient_name": random.choice([
                    "John Smith", "Jane Doe", "Robert Johnson", "Mary Wilson", "Emily Clark",
                    "Michael Brown", "Linda Lee", "David Kim", "Sarah Patel", "James Chen"
                ]),
                "age": random.randint(18, 90),
                "gender": random.choice(["Male", "Female", "Other"]),
                "chief_complaint": random.choice([
                    "chest pain", "shortness of breath", "abdominal pain", "headache",
                    "fever", "fatigue", "dizziness", "back pain", "cough", "palpitations"
                ]),
                "blood_pressure": f"{random.randint(110, 160)}/{random.randint(60, 100)}",
                "heart_rate": random.randint(55, 120),
                "temperature": round(random.uniform(97.0, 104.0), 1),
                "diagnosis": random.choice([
                    "Hypertension", "Type 2 Diabetes", "Pneumonia", "Migraine",
                    "Gastroenteritis", "Anxiety", "Asthma", "COVID-19", "Anemia", "Hyperlipidemia"
                ]),
                "date": time.strftime("%Y-%m-%d"),
                "address": random.choice([
                    "123 Main St", "456 Oak Ave", "789 Pine Rd", "101 Maple Dr", "202 Elm St"
                ]),
                "phone": f"({random.randint(200,999)})-{random.randint(100,999)}-{random.randint(1000,9999)}",
                "email": random.choice([
                    # NOTE: the original example addresses were stripped by the platform's
                    # email redaction; these are placeholders on the reserved example.com
                    # domain so the list is non-empty
                    "jane.doe@example.com", "john.smith@example.com"
                ]),
            }

            # Fill template with fake data
            filled_template = template
            for key, value in fake_data.items():
                filled_template = filled_template.replace(f"{{{key}}}", str(value))

            # Use the filled template as starting prompt
            prompt = filled_template[:100] + "..."

            # Generate text with explicit configuration
            generated = self.hf_generator(
                prompt,
                max_length=400,
                num_return_sequences=1,
                pad_token_id=50256,
                truncation=True
            )

            # Use the generated text
            return generated[0]['generated_text']

        except Exception as e:
            logger.error(f"Hugging Face generation failed: {str(e)}")
            logger.info("Falling back to template-based generation")
            return self._generate_with_template(template)

    def _generate_with_template(self, template: str) -> str:
        """Fallback: Generate using template with random values"""
        try:
            # Expanded fake data for more variety
            fake_data = {
                "patient_name": random.choice([
                    "John Smith", "Jane Doe", "Robert Johnson", "Mary Wilson", "Emily Clark",
                    "Michael Brown", "Linda Lee", "David Kim", "Sarah Patel", "James Chen"
                ]),
                "age": random.randint(18, 90),
                "gender": random.choice(["Male", "Female", "Other"]),
                "chief_complaint": random.choice([
                    "chest pain", "shortness of breath", "abdominal pain", "headache",
                    "fever", "fatigue", "dizziness", "back pain", "cough", "palpitations"
                ]),
                "blood_pressure": f"{random.randint(110, 160)}/{random.randint(60, 100)}",
                "heart_rate": random.randint(55, 120),
                "temperature": round(random.uniform(97.0, 104.0), 1),
                "diagnosis": random.choice([
                    "Hypertension", "Type 2 Diabetes", "Pneumonia", "Migraine",
                    "Gastroenteritis", "Anxiety", "Asthma", "COVID-19", "Anemia", "Hyperlipidemia"
                ]),
                "date": time.strftime("%Y-%m-%d"),
                "address": random.choice([
                    "123 Main St", "456 Oak Ave", "789 Pine Rd", "101 Maple Dr", "202 Elm St"
                ]),
                "phone": f"({random.randint(200,999)})-{random.randint(100,999)}-{random.randint(1000,9999)}",
                "email": random.choice([
                    # placeholders; original values stripped by email redaction
                    "jane.doe@example.com", "john.smith@example.com"
                ]),
            }
            # Fill template with fake data
            filled_template = template
            for key, value in fake_data.items():
                filled_template = filled_template.replace(f"{{{key}}}", str(value))
            return filled_template
        except Exception as e:
            logger.error(f"Template generation failed: {str(e)}")
            raise

    def batch_generate(self, record_type: str, count: int = 10, use_gemini: bool = False) -> List[Dict]:
        """Generate multiple records"""

        records = []
        for i in range(count):
            try:
                record = self.generate_record(record_type, use_gemini)
                records.append(record)

                # Rate limiting for API calls
                if use_gemini:
                    time.sleep(1)  # Respect API limits

                logger.info(f"Generated record {i+1}/{count}")

            except Exception as e:
                logger.error(f"Failed to generate record {i+1}: {str(e)}")
                continue

        return records

    def _generate_id(self) -> str:
        """Generate unique record ID"""
        return f"SYN-{int(time.time())}-{random.randint(1000, 9999)}"

    def _get_clinical_note_template(self) -> str:
        return """
CLINICAL NOTE

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Chief Complaint:
{chief_complaint}

Vital Signs:
- Blood Pressure: {blood_pressure} mmHg
- Heart Rate: {heart_rate} bpm
- Temperature: {temperature}°F

Assessment:
{diagnosis}

Plan:
1. Follow-up in 2 weeks
2. Continue current medications
3. Monitor symptoms

Provider: Dr. Smith
"""

    def _get_discharge_summary_template(self) -> str:
        return """
DISCHARGE SUMMARY

Patient: {patient_name}
Age: {age}
Gender: {gender}
Admission Date: {date}
Discharge Date: {date}

Reason for Admission:
{chief_complaint}

Hospital Course:
Patient was admitted for {chief_complaint}. During hospitalization, patient was treated with appropriate medications and showed improvement.

Final Diagnosis:
{diagnosis}

Discharge Medications:
1. Medication A - 1 tablet daily
2. Medication B - 2 tablets twice daily

Follow-up:
- Primary Care Provider: Dr. Johnson
- Appointment: 2 weeks from discharge

Discharge Instructions:
1. Take medications as prescribed
2. Follow up with primary care provider
3. Call if symptoms worsen

Discharging Provider: Dr. Smith
"""

    def _get_lab_report_template(self) -> str:
        return """
LABORATORY REPORT

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Test Results:

Complete Blood Count (CBC):
- White Blood Cells: {random.randint(4,11)} K/uL
- Red Blood Cells: {round(random.uniform(4.0,5.5),2)} M/uL
- Hemoglobin: {round(random.uniform(12.0,16.0),1)} g/dL
- Platelets: {random.randint(150,450)} K/uL

Basic Metabolic Panel:
- Glucose: {random.randint(70,140)} mg/dL
- BUN: {random.randint(7,20)} mg/dL
- Creatinine: {round(random.uniform(0.6,1.2),2)} mg/dL

Interpretation:
Results are within normal limits.

Lab Director: Dr. Wilson
"""

    def _get_prescription_template(self) -> str:
        return """
PRESCRIPTION

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Prescription:
{diagnosis} - {random.choice(['Amoxicillin', 'Lisinopril', 'Metformin', 'Atorvastatin', 'Albuterol'])}

Dosage: {random.choice(['1 tablet', '2 tablets', '1 capsule'])} {random.choice(['daily', 'twice daily', 'three times daily'])}

Quantity: {random.randint(30,90)} tablets

Refills: {random.randint(0,3)}

Prescribing Provider: Dr. Smith
DEA Number: AB1234567
"""

    def _get_patient_intake_template(self) -> str:
        return """
PATIENT INTAKE FORM

Personal Information:
Name: {patient_name}
Age: {age}
Gender: {gender}
Address: {address}
Phone: {phone}
Email: {email}

Emergency Contact:
Name: {random.choice(['Spouse', 'Parent', 'Sibling'])} {patient_name.split()[0]}
Phone: {phone}
Relationship: {random.choice(['Spouse', 'Parent', 'Sibling'])}

Insurance Information:
Provider: {random.choice(['Blue Cross', 'Aetna', 'United Healthcare', 'Cigna'])}
Policy Number: {random.randint(100000000,999999999)}
Group Number: {random.randint(10000,99999)}

Medical History:
Chief Complaint: {chief_complaint}
Current Medications: {random.choice(['None', 'Aspirin', 'Metformin', 'Lisinopril'])}
Allergies: {random.choice(['None', 'Penicillin', 'Sulfa', 'Peanuts'])}

Vital Signs:
Blood Pressure: {blood_pressure} mmHg
Heart Rate: {heart_rate} bpm
Temperature: {temperature}°F

Intake Date: {date}
Intake Provider: Dr. Smith
"""

def main():
    """Test the generator"""
    generator = MedicalTextGenerator()

    # Test each record type
    for record_type in generator.templates.keys():
        print(f"\nGenerating {record_type}...")
        record = generator.generate_record(record_type)
        print(json.dumps(record, indent=2))

if __name__ == "__main__":
    main()
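The generator can likewise be used on its own, outside the API and Streamlit front ends. A minimal sketch, assuming the repository root is on the import path; without a GEMINI_API_KEY in the environment it falls back to the distilgpt2 pipeline and then to plain template filling, following the order in generate_record above:

from src.generation.medical_generator import MedicalTextGenerator

generator = MedicalTextGenerator()  # picks up GEMINI_API_KEY from the environment if set

# One record; "source" reports which backend actually produced it
record = generator.generate_record("lab_report")
print(record["source"])
print(record["text"][:200])

# Several records, requesting the Gemini path (used only when a key is configured)
records = generator.batch_generate("clinical_note", count=3, use_gemini=True)
print(f"Generated {len(records)} records")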
src/streamlit_app.py
ADDED
@@ -0,0 +1,40 @@
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
src/web/index.html
ADDED
@@ -0,0 +1,127 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Synthex Medical Text Generator</title>
    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body {
            padding: 20px;
            background-color: #f8f9fa;
        }
        .container {
            max-width: 800px;
            background-color: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        .result-box {
            background-color: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            margin-top: 20px;
            white-space: pre-wrap;
        }
        .loading {
            display: none;
            text-align: center;
            margin: 20px 0;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1 class="mb-4">Synthex Medical Text Generator</h1>

        <div class="mb-3">
            <label for="recordType" class="form-label">Record Type</label>
            <select class="form-select" id="recordType">
                <option value="clinical_note">Clinical Note</option>
                <option value="discharge_summary">Discharge Summary</option>
                <option value="lab_report">Lab Report</option>
                <option value="prescription">Prescription</option>
                <option value="patient_intake">Patient Intake</option>
            </select>
        </div>

        <div class="mb-3">
            <label for="quantity" class="form-label">Quantity</label>
            <input type="number" class="form-control" id="quantity" value="1" min="1" max="10">
        </div>

        <div class="mb-3 form-check">
            <input type="checkbox" class="form-check-input" id="useGemini">
            <label class="form-check-label" for="useGemini">Use Gemini (if available)</label>
        </div>

        <div class="mb-3 form-check">
            <input type="checkbox" class="form-check-input" id="includeMetadata" checked>
            <label class="form-check-label" for="includeMetadata">Include Metadata</label>
        </div>

        <button class="btn btn-primary" onclick="generateRecords()">Generate Records</button>

        <div class="loading" id="loading">
            <div class="spinner-border text-primary" role="status">
                <span class="visually-hidden">Loading...</span>
            </div>
            <p class="mt-2">Generating records...</p>
        </div>

        <div id="result" class="result-box"></div>
    </div>

    <script>
        async function generateRecords() {
            const recordType = document.getElementById('recordType').value;
            const quantity = parseInt(document.getElementById('quantity').value);
            const useGemini = document.getElementById('useGemini').checked;
            const includeMetadata = document.getElementById('includeMetadata').checked;

            // Show loading
            document.getElementById('loading').style.display = 'block';
            document.getElementById('result').innerHTML = '';

            try {
                const response = await fetch('/generate', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                        'Accept': 'application/json'
                    },
                    body: JSON.stringify({
                        record_type: recordType,
                        quantity: quantity,
                        use_gemini: useGemini,
                        include_metadata: includeMetadata
                    })
                });

                const data = await response.json();

                // Format and display results
                let resultHtml = '<h3>Generated Records:</h3>';
                data.records.forEach(record => {
                    resultHtml += `
                        <div class="mb-4">
                            <strong>ID:</strong> ${record.id}<br>
                            <strong>Type:</strong> ${record.type}<br>
                            <strong>Source:</strong> ${record.source}<br>
                            <strong>Timestamp:</strong> ${record.timestamp}<br>
                            <strong>Text:</strong><br>
                            <pre>${record.text}</pre>
                        </div>
                    `;
                });
                document.getElementById('result').innerHTML = resultHtml;
            } catch (error) {
                document.getElementById('result').innerHTML = `<div class="alert alert-danger">Error: ${error.message}</div>`;
            } finally {
                document.getElementById('loading').style.display = 'none';
            }
        }
    </script>
</body>
</html>
streamlit_app.py
ADDED
@@ -0,0 +1,253 @@
"""
Synthex Medical Text Generator - MVP Streamlit App
Deploy this on Hugging Face Spaces for free hosting
"""

import streamlit as st
import json
import time
from datetime import datetime
import pandas as pd
import os
import sys
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add src directory to Python path
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

# Import the medical generator
from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY

# Page config
st.set_page_config(
    page_title="Synthex Medical Text Generator",
    page_icon="🏥",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .sub-header {
        font-size: 1.5rem;
        color: #666;
        text-align: center;
        margin-bottom: 3rem;
    }
    .record-container {
        background-color: #f8f9fa;
        padding: 1rem;
        border-radius: 0.5rem;
        border-left: 4px solid #1f77b4;
        margin: 1rem 0;
    }
    .stats-container {
        background-color: #e8f4fd;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state
if 'generated_records' not in st.session_state:
    st.session_state.generated_records = []
if 'total_generated' not in st.session_state:
    st.session_state.total_generated = 0
if 'generator' not in st.session_state:
    st.session_state.generator = None

# Header
st.markdown('<div class="main-header">🏥 Synthex Medical Text Generator</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Generate synthetic medical records for AI training and testing</div>', unsafe_allow_html=True)

# Sidebar
with st.sidebar:
    st.header("⚙️ Configuration")

    # API Key input (pre-filled with environment variable if available)
    gemini_api_key = st.text_input(
        "Gemini API Key",
        value=os.getenv('GEMINI_API_KEY', ''),
        type="password",
        help="Enter your Google Gemini API key for better generation quality"
    )

    # Record type selection
    record_type = st.selectbox(
        "Select Record Type",
        ["clinical_note", "discharge_summary", "lab_report", "prescription", "patient_intake"],
        format_func=lambda x: x.replace("_", " ").title()
    )

    # Quantity
    quantity = st.slider("Number of Records", 1, 20, 5)

    # Generation method
    use_gemini = st.checkbox(
        "Use Gemini API",
        value=bool(gemini_api_key),  # Only default to True if API key is available
        help="Uses Google Gemini API for better quality generation"
    )

    # Advanced options
    with st.expander("Advanced Options"):
        include_metadata = st.checkbox("Include Metadata", value=True)
        export_format = st.selectbox("Export Format", ["JSON", "CSV", "TXT"])

# Main content
col1, col2 = st.columns([2, 1])

with col1:
    st.header("📝 Generate Medical Records")

    # Generation button
    if st.button("🚀 Generate Records", type="primary", use_container_width=True):

        # Initialize generator if not already done
        if st.session_state.generator is None:
            try:
                with st.spinner("Initializing medical text generator..."):
                    st.session_state.generator = MedicalTextGenerator(gemini_api_key=gemini_api_key)
            except Exception as e:
                st.error(f"Error initializing generator: {str(e)}")
                st.stop()

        # Generate records
        progress_bar = st.progress(0)
        status_text = st.empty()

        generated_records = []

        for i in range(quantity):
            status_text.text(f"Generating record {i+1} of {quantity}...")
            progress_bar.progress((i + 1) / quantity)

            try:
                record = st.session_state.generator.generate_record(record_type, use_gemini=use_gemini)
                generated_records.append(record)

                # Rate limiting
                if use_gemini:
                    time.sleep(1)

            except Exception as e:
                logger.error(f"Failed to generate record {i+1}: {str(e)}")
                st.error(f"Failed to generate record {i+1}: {str(e)}")
                continue

        # Update session state
        if generated_records:
            st.session_state.generated_records.extend(generated_records)
            st.session_state.total_generated += len(generated_records)

            status_text.text("✅ Generation complete!")
            progress_bar.progress(1.0)

            st.success(f"Successfully generated {len(generated_records)} medical records!")

    # Display generated records
    if st.session_state.generated_records:
        st.header("📋 Generated Records")

        # Filters
        col_filter1, col_filter2 = st.columns(2)
        with col_filter1:
            filter_type = st.selectbox(
                "Filter by Type",
                ["All"] + list(set([r['type'] for r in st.session_state.generated_records]))
            )
        with col_filter2:
            records_per_page = st.selectbox("Records per page", [5, 10, 20, 50])

        # Filter records
        filtered_records = st.session_state.generated_records
        if filter_type != "All":
            filtered_records = [r for r in filtered_records if r['type'] == filter_type]

        # Pagination
        total_records = len(filtered_records)
        total_pages = (total_records - 1) // records_per_page + 1

        if total_pages > 1:
            page = st.selectbox("Page", range(1, total_pages + 1))
            start_idx = (page - 1) * records_per_page
            end_idx = start_idx + records_per_page
            page_records = filtered_records[start_idx:end_idx]
        else:
            page_records = filtered_records

        # Display records
        for i, record in enumerate(page_records):
            with st.expander(f"Record {record['id']} - {record['type'].replace('_', ' ').title()}"):
                if include_metadata:
                    col_meta1, col_meta2, col_meta3 = st.columns(3)
                    with col_meta1:
                        st.metric("Type", record['type'].replace('_', ' ').title())
                    with col_meta2:
                        st.metric("Generated", record['timestamp'])
                    with col_meta3:
                        st.metric("Source", record['source'])

                st.markdown('<div class="record-container">', unsafe_allow_html=True)
                st.text_area("Content", record['text'], height=200, key=f"record_{i}")
                st.markdown('</div>', unsafe_allow_html=True)

with col2:
    st.header("📊 Statistics")

    # Stats container
    st.markdown('<div class="stats-container">', unsafe_allow_html=True)

    # Total records
    st.metric("Total Records Generated", st.session_state.total_generated)

    # Record type distribution
    if st.session_state.generated_records:
        type_counts = pd.Series([r['type'] for r in st.session_state.generated_records]).value_counts()
        st.subheader("Record Type Distribution")
        st.bar_chart(type_counts)

    # Export options
    st.subheader("Export Data")
    if st.session_state.generated_records:
        if export_format == "JSON":
            json_str = json.dumps(st.session_state.generated_records, indent=2)
            st.download_button(
                "Download JSON",
                json_str,
                file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json"
            )
        elif export_format == "CSV":
            df = pd.DataFrame(st.session_state.generated_records)
            csv = df.to_csv(index=False)
            st.download_button(
                "Download CSV",
                csv,
                file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv"
            )
        elif export_format == "TXT":
            txt = "\n\n".join([f"Record {r['id']} ({r['type']}):\n{r['text']}" for r in st.session_state.generated_records])
            st.download_button(
                "Download TXT",
                txt,
                file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                mime="text/plain"
            )

    st.markdown('</div>', unsafe_allow_html=True)
templates/index.html
ADDED
@@ -0,0 +1,66 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Synthex - Medical Data Collection and Analysis</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f4f4f4;
        }
        .container {
            max-width: 800px;
            margin: 0 auto;
            background: white;
            padding: 20px;
            border-radius: 5px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        h1 {
            color: #333;
        }
        .button {
            display: inline-block;
            padding: 10px 20px;
            margin: 10px 0;
            background-color: #007bff;
            color: white;
            text-decoration: none;
            border-radius: 5px;
        }
        .button:hover {
            background-color: #0056b3;
        }
        .flash {
            padding: 10px;
            margin: 10px 0;
            border-radius: 5px;
        }
        .flash.success {
            background-color: #d4edda;
            color: #155724;
        }
        .flash.error {
            background-color: #f8d7da;
            color: #721c24;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Synthex - Medical Data Collection and Analysis</h1>
        {% with messages = get_flashed_messages(with_categories=true) %}
            {% if messages %}
                {% for category, message in messages %}
                    <div class="flash {{ category }}">{{ message }}</div>
                {% endfor %}
            {% endif %}
        {% endwith %}
        <a href="{{ url_for('collect_data') }}" class="button">Collect Data</a>
        <a href="{{ url_for('analyze_data') }}" class="button">Analyze Data</a>
    </div>
</body>
</html>
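This Jinja template renders flashed messages and links to collect_data and analyze_data view functions, so it presumes a small Flask app around it (presumably web_app.py in this repo, not shown here). A hypothetical sketch of the routes the template assumes; the URL paths and handler bodies are placeholders:

from flask import Flask, flash, redirect, render_template, url_for

app = Flask(__name__)
app.secret_key = "change-me"  # flash() requires a secret key; placeholder value

@app.route("/")
def index():
    return render_template("index.html")

# Hypothetical handlers matching the url_for() calls in the template
@app.route("/collect")
def collect_data():
    flash("Data collection started", "success")
    return redirect(url_for("index"))

@app.route("/analyze")
def analyze_data():
    flash("Analysis finished", "success")
    return redirect(url_for("index"))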
test_dataset.py
ADDED
@@ -0,0 +1,41 @@
+from datasets import load_dataset
+import json
+import os
+from pathlib import Path
+
+def test_medical_dataset():
+    try:
+        # Load a small sample of the medical questions dataset
+        dataset = load_dataset("medical_questions_pairs", split="train[:100]")
+        print(f"Successfully loaded {len(dataset)} samples from medical_questions_pairs")
+
+        # Print sample structure
+        print("\nSample structure:")
+        print(json.dumps(dataset[0], indent=2))
+
+        return True
+    except Exception as e:
+        print(f"Error loading dataset: {str(e)}")
+        return False
+
+def verify_data_directory():
+    data_dir = Path("data/raw")
+    if not data_dir.exists():
+        print(f"Creating data directory: {data_dir}")
+        data_dir.mkdir(parents=True, exist_ok=True)
+
+    # Check for JSON files
+    json_files = list(data_dir.glob("*.json"))
+    if json_files:
+        print(f"\nFound {len(json_files)} JSON files in data/raw:")
+        for file in json_files:
+            print(f"- {file.name}")
+    else:
+        print("\nNo JSON files found in data/raw directory")
+
+if __name__ == "__main__":
+    print("Testing Hugging Face dataset loading...")
+    test_medical_dataset()
+
+    print("\nVerifying data directory structure...")
+    verify_data_directory()
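The two helpers above never touch each other: the dataset sample is loaded but not saved, so verify_data_directory() typically reports an empty data/raw. A minimal sketch (not part of the commit) that bridges them; the output file name is a hypothetical example:

    # Hedged sketch: persist the sampled records so verify_data_directory()
    # has something to report. Rows of a datasets.Dataset iterate as dicts.
    import json
    from pathlib import Path
    from datasets import load_dataset

    dataset = load_dataset("medical_questions_pairs", split="train[:100]")
    out_dir = Path("data/raw")
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "medical_questions_sample.json", "w") as f:
        json.dump([dict(row) for row in dataset], f, indent=2)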
test_pubmed.py
ADDED
@@ -0,0 +1,78 @@
+import requests
+import json
+from bs4 import BeautifulSoup
+import logging
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def test_pubmed_search():
+    """Test PubMed search API"""
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    search_url = f"{base_url}esearch.fcgi"
+
+    # Test query
+    query = "clinical notes"
+
+    search_params = {
+        "db": "pubmed",
+        "term": query,
+        "retmax": 10,  # Just get 10 results for testing
+        "retmode": "json",
+        "sort": "relevance"
+    }
+
+    logger.info(f"Testing PubMed search with query: {query}")
+    logger.info(f"Search URL: {search_url}")
+    logger.info(f"Search params: {search_params}")
+
+    try:
+        response = requests.get(search_url, params=search_params)
+        response.raise_for_status()
+        search_results = response.json()
+
+        logger.info(f"Response status code: {response.status_code}")
+        logger.info(f"Response headers: {dict(response.headers)}")
+        logger.info(f"Search results: {json.dumps(search_results, indent=2)}")
+
+        if "esearchresult" in search_results:
+            id_list = search_results["esearchresult"]["idlist"]
+            logger.info(f"Found {len(id_list)} article IDs")
+
+            # Test fetching one article
+            if id_list:
+                test_id = id_list[0]
+                fetch_url = f"{base_url}efetch.fcgi"
+                fetch_params = {
+                    "db": "pubmed",
+                    "id": test_id,
+                    "retmode": "xml"
+                }
+
+                logger.info(f"\nTesting article fetch for ID: {test_id}")
+                logger.info(f"Fetch URL: {fetch_url}")
+                logger.info(f"Fetch params: {fetch_params}")
+
+                response = requests.get(fetch_url, params=fetch_params)
+                response.raise_for_status()
+
+                logger.info(f"Fetch response status code: {response.status_code}")
+                logger.info(f"Fetch response headers: {dict(response.headers)}")
+                logger.info(f"First 500 chars of response: {response.text[:500]}")
+
+                soup = BeautifulSoup(response.text, 'lxml-xml')  # XML parser keeps tag case; plain 'lxml' lowercases tags and breaks the finds below
+                article = soup.find('PubmedArticle')
+
+                if article:
+                    logger.info("\nArticle structure:")
+                    logger.info(f"Title: {article.find('ArticleTitle').get_text() if article.find('ArticleTitle') else 'Not found'}")
+                    logger.info(f"Abstract: {article.find('Abstract').get_text()[:200] + '...' if article.find('Abstract') else 'Not found'}")
+                else:
+                    logger.error("No PubmedArticle found in response")
+
+    except Exception as e:
+        logger.error(f"Error during test: {str(e)}", exc_info=True)
+
+if __name__ == "__main__":
+    test_pubmed_search()
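Since the efetch response is XML, the same fields can also be pulled out with the standard library alone, avoiding the bs4/lxml dependency. A minimal sketch (not part of the commit); xml_text stands for the response.text of the efetch call above:

    # Hedged sketch: extract title and abstract with xml.etree.ElementTree.
    import xml.etree.ElementTree as ET

    def extract_title_and_abstract(xml_text):
        root = ET.fromstring(xml_text)
        # findtext returns None when the element is missing
        title = root.findtext(".//ArticleTitle") or "Not found"
        abstract = root.findtext(".//AbstractText") or "Not found"
        return title, abstract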
web_app.py
ADDED
@@ -0,0 +1,31 @@
+from flask import Flask, render_template, redirect, url_for, flash
+import subprocess
+import os
+
+app = Flask(__name__)
+app.secret_key = 'your_secret_key'  # Required for flashing messages
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@app.route('/collect_data')
+def collect_data():
+    try:
+        subprocess.run(['python', 'setup_data.py'], check=True)
+        flash('Data collection completed successfully!', 'success')
+    except subprocess.CalledProcessError as e:
+        flash(f'Error during data collection: {str(e)}', 'error')
+    return redirect(url_for('index'))
+
+@app.route('/analyze_data')
+def analyze_data():
+    try:
+        subprocess.run(['python', 'analyze_data_quality.py'], check=True)
+        flash(f'Error during data analysis: {str(e)}', 'error') if False else flash('Data analysis completed successfully!', 'success')
+    except subprocess.CalledProcessError as e:
+        flash(f'Error during data analysis: {str(e)}', 'error')
+    return redirect(url_for('index'))
+
+if __name__ == '__main__':
+    app.run(debug=True)
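As a quick smoke test, these routes can be exercised without starting a server via Flask's built-in test client. A minimal sketch (not part of the commit); note that hitting /collect_data really does run setup_data.py through subprocess:

    # Hedged sketch: hit each route with Flask's test client.
    from web_app import app

    with app.test_client() as client:
        assert client.get('/').status_code == 200   # renders templates/index.html
        resp = client.get('/collect_data')          # flashes a message, then...
        assert resp.status_code == 302              # ...redirects back to index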