theaniketgiri commited on
Commit
32519eb
·
0 Parent(s):

Initial commit to Hugging Face Space

Browse files
.dockerignore ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ synthex_env/
25
+ venv/
26
+ ENV/
27
+
28
+ # IDE
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+
34
+ # Git
35
+ .git
36
+ .gitignore
37
+
38
+ # Data
39
+ data/raw/*
40
+ data/processed/*
41
+ data/synthetic/*
42
+ !data/raw/.gitkeep
43
+ !data/processed/.gitkeep
44
+ !data/synthetic/.gitkeep
45
+
46
+ # Logs
47
+ *.log
48
+
49
+ # Local development
50
+ .env
51
+ .env.local
52
+ .env.*.local
53
+
54
+ # Docker
55
+ Dockerfile
56
+ .dockerignore
57
+
58
+ # Misc
59
+ .DS_Store
60
+ Thumbs.db
.gitignore ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ ENV/
27
+ .env
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+
35
+ # Project specific
36
+ data/raw/
37
+ data/generated/
38
+ *.log
39
+ .DS_Store
40
+ .coverage
41
+ htmlcov/
42
+ .pytest_cache/
43
+
44
+ # Hugging Face
45
+ .huggingface/
46
+ .hf/
47
+
48
+ # Docker
49
+ .docker/
50
+ docker-compose.override.yml
51
+
52
+ # Security
53
+ *.pem
54
+ *.key
55
+ *.cert
56
+
57
+ # Large files
58
+ *.json
59
+ *.csv
60
+ *.xlsx
61
+ *.xls
62
+ *.db
63
+ *.sqlite
64
+ *.h5
65
+ *.pkl
66
+ *.model
67
+ *.bin
68
+ *.pt
69
+ *.pth
70
+ *.onnx
71
+
72
+ # Python
73
+ __pycache__/
74
+ *.py[cod]
75
+ *$py.class
76
+ *.so
77
+ .Python
78
+ build/
79
+ develop-eggs/
80
+ dist/
81
+ downloads/
82
+ eggs/
83
+ .eggs/
84
+ lib/
85
+ lib64/
86
+ parts/
87
+ sdist/
88
+ var/
89
+ wheels/
90
+ *.egg-info/
91
+ .installed.cfg
92
+ *.egg
93
+
94
+ # Virtual Environment
95
+ venv/
96
+ env/
97
+ ENV/
98
+ .env
99
+
100
+ # IDE
101
+ .idea/
102
+ .vscode/
103
+ *.swp
104
+ *.swo
105
+
106
+ # Project specific
107
+ data/generated/
108
+ *.log
109
+ .DS_Store
110
+ .coverage
111
+ htmlcov/
112
+ .pytest_cache/
113
+
114
+ # Hugging Face
115
+ .huggingface/
116
+ .hf/
117
+
118
+ # Docker
119
+ .docker/
120
+ docker-compose.override.yml
121
+
122
+ # Security
123
+ *.pem
124
+ *.key
125
+ *.cert
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python image
2
+ FROM python:3.9-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && apt-get install -y \
9
+ build-essential \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Copy requirements first to leverage Docker cache
13
+ COPY requirements.txt .
14
+ RUN pip install --no-cache-dir -r requirements.txt
15
+
16
+ # Copy the rest of the application
17
+ COPY . .
18
+
19
+ # Create necessary directories
20
+ RUN mkdir -p src/web
21
+
22
+ # Expose the port
23
+ EXPOSE 8000
24
+
25
+ # Set environment variables
26
+ ENV PYTHONPATH=/app
27
+ ENV PORT=8000
28
+
29
+ # Command to run the application
30
+ CMD ["uvicorn", "src.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Synthex AI - Commercial License
2
+
3
+ Copyright (c) 2024 Synthex AI
4
+
5
+ This software and associated documentation files (the "Software") are proprietary and confidential.
6
+ The Software is protected by copyright laws and international copyright treaties, as well as other
7
+ intellectual property laws and treaties.
8
+
9
+ TERMS AND CONDITIONS
10
+
11
+ 1. License Grant
12
+ This license grants you a limited, non-exclusive, non-transferable license to use the Software
13
+ solely for your internal business purposes, subject to the terms and conditions of this Agreement.
14
+
15
+ 2. Restrictions
16
+ You may not:
17
+ - Copy, modify, or create derivative works of the Software
18
+ - Reverse engineer, decompile, or disassemble the Software
19
+ - Remove or alter any proprietary notices or labels on the Software
20
+ - Use the Software for any illegal purpose
21
+ - Transfer, sublicense, or resell the Software
22
+
23
+ 3. Proprietary Rights
24
+ The Software and all copies, modifications, and derivative works are owned by Synthex AI and
25
+ are protected by copyright, trade secret, and other intellectual property laws.
26
+
27
+ 4. Confidentiality
28
+ You agree to maintain the confidentiality of the Software and not disclose it to any third party
29
+ without Synthex AI's prior written consent.
30
+
31
+ 5. Warranty Disclaimer
32
+ THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. SYNTHEX AI DISCLAIMS ALL
33
+ WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34
+ FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT.
35
+
36
+ 6. Limitation of Liability
37
+ IN NO EVENT SHALL SYNTHEX AI BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY ARISING FROM,
38
+ OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
39
+
40
+ 7. Termination
41
+ This license is effective until terminated. Your rights under this license will terminate
42
+ automatically without notice if you fail to comply with any of its terms.
43
+
44
+ 8. Governing Law
45
+ This Agreement shall be governed by and construed in accordance with the laws of the State of
46
+ Delaware, without regard to its conflict of law provisions.
47
+
48
+ 9. Contact Information
49
+ For licensing inquiries, please contact:
50
+ Synthex AI
51
52
+ Website: https://synthex.ai
53
+
54
+ By using the Software, you acknowledge that you have read this Agreement, understand it, and agree
55
+ to be bound by its terms and conditions.
README.md ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Synthex AI - Medical Text Generation Platform
2
+
3
+ ![Synthex AI](https://img.shields.io/badge/Synthex-AI-blue)
4
+ ![Version](https://img.shields.io/badge/version-1.0.0-green)
5
+ ![License](https://img.shields.io/badge/license-MIT-blue)
6
+
7
+ > Synthex AI is a cutting-edge platform that generates HIPAA-compliant synthetic medical records for healthcare AI development, testing, and research.
8
+
9
+ ## 🏢 Enterprise Solution
10
+
11
+ Synthex AI provides enterprise-grade synthetic medical data generation with:
12
+
13
+ - **HIPAA Compliance**: All generated data is synthetic and compliant with healthcare regulations
14
+ - **Enterprise Security**: SOC 2 Type II certified infrastructure
15
+ - **Custom Solutions**: Tailored generation for specific medical domains
16
+ - **API Access**: RESTful API for integration with existing systems
17
+ - **Dedicated Support**: 24/7 enterprise support and SLAs
18
+
19
+ ## 💼 Use Cases
20
+
21
+ ### Healthcare AI Development
22
+ - Train and test AI models without real patient data
23
+ - Generate diverse medical scenarios for model validation
24
+ - Create synthetic datasets for research and development
25
+
26
+ ### Medical Software Testing
27
+ - Test EHR systems with realistic synthetic data
28
+ - Validate clinical decision support systems
29
+ - QA medical software with diverse patient scenarios
30
+
31
+ ### Healthcare Research
32
+ - Conduct research with privacy-compliant data
33
+ - Generate synthetic datasets for medical studies
34
+ - Test hypotheses without patient privacy concerns
35
+
36
+ ## 🚀 Features
37
+
38
+ ### Core Features
39
+ - Multiple medical record types:
40
+ - Clinical Notes
41
+ - Discharge Summaries
42
+ - Lab Reports
43
+ - Prescriptions
44
+ - Patient Intake Forms
45
+ - Advanced generation methods:
46
+ - Hugging Face models (default)
47
+ - Google Gemini API (premium)
48
+ - Custom model integration (enterprise)
49
+ - Enterprise-grade UI/UX
50
+ - Multiple export formats (JSON, CSV, TXT)
51
+ - Batch generation capabilities
52
+ - API access (enterprise)
53
+
54
+ ### Enterprise Features
55
+ - Custom model training
56
+ - Domain-specific generation
57
+ - Advanced data validation
58
+ - Integration support
59
+ - Dedicated infrastructure
60
+ - Custom SLAs
61
+
62
+ ## 💰 Pricing
63
+
64
+ ### Free Tier
65
+ - Basic medical record generation
66
+ - Limited to 100 records/month
67
+ - Community support
68
+ - Basic templates
69
+
70
+ ### Pro Plan ($99/month)
71
+ - Up to 10,000 records/month
72
+ - Advanced generation features
73
+ - Priority support
74
+ - API access
75
+ - Custom templates
76
+
77
+ ### Enterprise Plan (Custom)
78
+ - Unlimited generation
79
+ - Custom model training
80
+ - Dedicated support
81
+ - Custom integrations
82
+ - SLA guarantees
83
+ - On-premise deployment
84
+
85
+ ## 🛠️ Technical Details
86
+
87
+ ### Architecture
88
+ ```
89
+ synthex/
90
+ ├── app.py # Main Streamlit application
91
+ ├── src/
92
+ │ ├── generation/ # Core generation logic
93
+ │ ├── api/ # REST API endpoints
94
+ │ ├── validation/ # Data validation
95
+ │ └── enterprise/ # Enterprise features
96
+ ├── data/
97
+ │ └── generated/ # Generated records storage
98
+ ├── tests/ # Test suite
99
+ ├── Dockerfile # Docker configuration
100
+ └── requirements.txt # Python dependencies
101
+ ```
102
+
103
+ ### API Reference
104
+
105
+ ```python
106
+ from synthex import SynthexClient
107
+
108
+ # Initialize client
109
+ client = SynthexClient(api_key="your_api_key")
110
+
111
+ # Generate records
112
+ records = client.generate_records(
113
+ record_type="clinical_note",
114
+ count=100,
115
+ options={
116
+ "include_metadata": True,
117
+ "custom_fields": ["patient_demographics", "vital_signs"]
118
+ }
119
+ )
120
+
121
+ # Export data
122
+ client.export_records(
123
+ records,
124
+ format="json",
125
+ destination="s3://your-bucket/path"
126
+ )
127
+ ```
128
+
129
+ ## 🔒 Security & Compliance
130
+
131
+ - HIPAA Compliance
132
+ - SOC 2 Type II Certification
133
+ - GDPR Compliance
134
+ - Data Encryption at Rest and in Transit
135
+ - Regular Security Audits
136
+ - Access Control and Audit Logging
137
+
138
+ ## 🤝 Enterprise Support
139
+
140
+ - 24/7 Technical Support
141
+ - Dedicated Account Manager
142
+ - Custom Integration Support
143
+ - Training and Onboarding
144
+ - Regular Updates and Maintenance
145
+ - Custom Development Services
146
+
147
+ ## 📞 Contact
148
+
149
+ ### Sales Inquiries
150
+ - Email: [email protected]
151
+ - Phone: +1 (555) 123-4567
152
+ - [Schedule a Demo](https://synthex.ai/demo)
153
+
154
+ ### Technical Support
155
+ - Email: [email protected]
156
+ - [Documentation](https://docs.synthex.ai)
157
+ - [API Reference](https://api.synthex.ai)
158
+
159
+ ## 🌟 Why Choose Synthex AI?
160
+
161
+ 1. **Enterprise-Ready**: Built for scale and security
162
+ 2. **Compliance-First**: HIPAA and GDPR compliant
163
+ 3. **Customizable**: Tailored to your needs
164
+ 4. **Support**: Enterprise-grade support
165
+ 5. **Innovation**: Cutting-edge AI technology
166
+
167
+ ## 🚀 Getting Started
168
+
169
+ ### Quick Start
170
+ ```bash
171
+ # Install Synthex CLI
172
+ pip install synthex
173
+
174
+ # Initialize client
175
+ synthex init
176
+
177
+ # Generate records
178
+ synthex generate --type clinical_note --count 10
179
+ ```
180
+
181
+ ### Docker Deployment
182
+ ```bash
183
+ # Pull image
184
+ docker pull synthex/synthex:latest
185
+
186
+ # Run container
187
+ docker run -p 8501:8501 synthex/synthex
188
+ ```
189
+
190
+ ## 📚 Documentation
191
+
192
+ - [User Guide](https://docs.synthex.ai/guide)
193
+ - [API Documentation](https://docs.synthex.ai/api)
194
+ - [Enterprise Guide](https://docs.synthex.ai/enterprise)
195
+ - [Security Whitepaper](https://docs.synthex.ai/security)
196
+
197
+ ## 🙏 Acknowledgments
198
+
199
+ - Built with [Streamlit](https://streamlit.io/)
200
+ - Powered by [Hugging Face](https://huggingface.co/)
201
+ - Enterprise features by [Google Cloud](https://cloud.google.com/)
202
+
203
+ ---
204
+
205
+ © 2024 Synthex AI. All rights reserved.
206
+
207
+ # Synthex Medical Text Generator
208
+
209
+ A synthetic medical text generator that creates realistic medical records using AI models. The application provides both a FastAPI backend and a Streamlit interface.
210
+
211
+ ## Features
212
+
213
+ - Generate various types of medical records:
214
+ - Clinical Notes
215
+ - Discharge Summaries
216
+ - Lab Reports
217
+ - Prescriptions
218
+ - Patient Intake Forms
219
+ - Support for multiple AI models:
220
+ - Hugging Face models (default)
221
+ - Google Gemini (optional)
222
+ - Two interfaces:
223
+ - FastAPI with HTML frontend
224
+ - Streamlit interface
225
+
226
+ ## API Endpoints
227
+
228
+ - `GET /`: HTML interface
229
+ - `GET /record-types`: List available record types
230
+ - `POST /generate`: Generate medical records
231
+ ```json
232
+ {
233
+ "record_type": "clinical_note",
234
+ "quantity": 1,
235
+ "use_gemini": false,
236
+ "include_metadata": true
237
+ }
238
+ ```
239
+
240
+ ## Deployment
241
+
242
+ ### Local Development
243
+
244
+ 1. Install dependencies:
245
+ ```bash
246
+ pip install -r requirements.txt
247
+ ```
248
+
249
+ 2. Run FastAPI server:
250
+ ```bash
251
+ uvicorn src.api.app:app --reload
252
+ ```
253
+
254
+ 3. Run Streamlit app (optional):
255
+ ```bash
256
+ streamlit run app.py
257
+ ```
258
+
259
+ ### Docker Deployment
260
+
261
+ 1. Build the Docker image:
262
+ ```bash
263
+ docker build -t synthex-medical-generator .
264
+ ```
265
+
266
+ 2. Run the container:
267
+ ```bash
268
+ docker run -p 8000:8000 synthex-medical-generator
269
+ ```
270
+
271
+ ### Hugging Face Spaces Deployment
272
+
273
+ 1. Create a new Space on Hugging Face
274
+ 2. Choose "Docker" as the SDK
275
+ 3. Push this repository to your Space
276
+ 4. The application will be automatically deployed
277
+
278
+ ## Environment Variables
279
+
280
+ - `GEMINI_API_KEY`: Google Gemini API key (optional)
281
+
282
+ ## License
283
+
284
+ MIT License
analyze_data_quality.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ import logging
5
+ from collections import defaultdict
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from typing import Dict, List, Any
9
+ import re
10
+
11
+ # Setup logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
class DataQualityAnalyzer:
    """Analyze text-quality statistics of JSON datasets under ``data_dir``.

    Produces per-dataset aggregates in ``self.stats``, a JSON report under
    ``<data_dir>/../reports/``, and bar/histogram plots under
    ``<data_dir>/../reports/plots/``.
    """

    def __init__(self, data_dir: str = "data/raw"):
        # Directory scanned for *.json dataset files.
        self.data_dir = Path(data_dir)
        # dataset name -> aggregate statistics dict
        self.stats = defaultdict(dict)
        # dataset name -> raw per-item abstract lengths; kept separately so
        # plot_metrics() can draw histograms (self.stats holds averages only).
        self._abstract_lengths = defaultdict(list)

    def load_dataset(self, file_path: Path) -> List[Dict]:
        """Load a dataset from a JSON file; return [] if it cannot be read."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            return []

    def analyze_text_quality(self, text: str) -> Dict[str, Any]:
        """Return basic quality metrics (length, word stats, char classes) for one text."""
        if not text:
            return {
                "length": 0,
                "word_count": 0,
                "avg_word_length": 0,
                "has_numbers": False,
                "has_special_chars": False
            }

        words = text.split()
        return {
            "length": len(text),
            "word_count": len(words),
            "avg_word_length": sum(len(w) for w in words) / len(words) if words else 0,
            "has_numbers": bool(re.search(r'\d', text)),
            # anything outside letters/digits/whitespace/basic punctuation
            "has_special_chars": bool(re.search(r'[^a-zA-Z0-9\s.,!?-]', text))
        }

    @staticmethod
    def _aggregate(metrics: List[Dict[str, Any]]) -> Dict[str, float]:
        """Average a non-empty list of per-item metric dicts into one summary dict."""
        n = len(metrics)
        return {
            "avg_length": sum(m["length"] for m in metrics) / n,
            "avg_word_count": sum(m["word_count"] for m in metrics) / n,
            "avg_word_length": sum(m["avg_word_length"] for m in metrics) / n,
            "has_numbers_ratio": sum(1 for m in metrics if m["has_numbers"]) / n,
            "has_special_chars_ratio": sum(1 for m in metrics if m["has_special_chars"]) / n,
        }

    def analyze_dataset(self, dataset_name: str, data: List[Dict]):
        """Compute and store statistics for a single dataset."""
        if not data:
            logger.warning(f"No data found in {dataset_name}")
            return

        # Basic stats
        self.stats[dataset_name]["total_samples"] = len(data)

        # Per-item text quality metrics
        title_metrics = [self.analyze_text_quality(item["title"])
                         for item in data if "title" in item]
        abstract_metrics = [self.analyze_text_quality(item["abstract"])
                            for item in data if "abstract" in item]

        # Aggregate metrics (shared helper removes the duplicated dict literals)
        if title_metrics:
            self.stats[dataset_name]["title"] = self._aggregate(title_metrics)
        if abstract_metrics:
            self.stats[dataset_name]["abstract"] = self._aggregate(abstract_metrics)
            # Raw lengths feed the histograms drawn in plot_metrics().
            self._abstract_lengths[dataset_name] = [m["length"] for m in abstract_metrics]

        # Union of all fields seen across items
        fields = set()
        for item in data:
            fields.update(item.keys())
        self.stats[dataset_name]["fields"] = list(fields)

        # Year distribution (if available)
        if "year" in fields:
            years = [item["year"] for item in data if "year" in item]
            self.stats[dataset_name]["year_distribution"] = pd.Series(years).value_counts().to_dict()

    def analyze_all_datasets(self):
        """Analyze every *.json file in the data directory."""
        for file_path in self.data_dir.glob("*.json"):
            dataset_name = file_path.stem
            logger.info(f"Analyzing dataset: {dataset_name}")
            data = self.load_dataset(file_path)
            self.analyze_dataset(dataset_name, data)

    def generate_report(self):
        """Write a JSON quality report next to the data directory and return it."""
        report = {
            "summary": {},
            "datasets": self.stats
        }

        # Overall summary; .get() guards against datasets skipped as empty.
        total_samples = sum(stats.get("total_samples", 0) for stats in self.stats.values())
        report["summary"]["total_samples"] = total_samples
        report["summary"]["total_datasets"] = len(self.stats)

        # Save report; parents=True because reports/ may not exist yet.
        report_file = self.data_dir.parent / "reports" / "data_quality_report.json"
        report_file.parent.mkdir(parents=True, exist_ok=True)

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        logger.info(f"Quality report saved to {report_file}")
        return report

    def plot_metrics(self):
        """Generate plots for key metrics under reports/plots/."""
        plots_dir = self.data_dir.parent / "reports" / "plots"
        plots_dir.mkdir(parents=True, exist_ok=True)

        # Sample distribution across datasets
        plt.figure(figsize=(10, 6))
        samples = {name: stats.get("total_samples", 0) for name, stats in self.stats.items()}
        plt.bar(samples.keys(), samples.values())
        plt.xticks(rotation=45)
        plt.title("Sample Distribution Across Datasets")
        plt.tight_layout()
        plt.savefig(plots_dir / "sample_distribution.png")
        plt.close()

        # Abstract length histograms.
        # FIX: the original iterated self.stats[...]["abstract"], which is an
        # aggregate dict of scalar averages — iterating it yields key strings,
        # so m["length"] raised TypeError. Histograms need the raw per-item
        # lengths collected in analyze_dataset().
        for dataset_name, lengths in self._abstract_lengths.items():
            if not lengths:
                continue
            plt.figure(figsize=(10, 6))
            plt.hist(lengths, bins=50)
            plt.title(f"Abstract Length Distribution - {dataset_name}")
            plt.xlabel("Length")
            plt.ylabel("Count")
            plt.tight_layout()
            plt.savefig(plots_dir / f"abstract_length_{dataset_name}.png")
            plt.close()
154
+
155
def main():
    """Run the full analysis pipeline and print a console summary."""
    analyzer = DataQualityAnalyzer()
    analyzer.analyze_all_datasets()
    summary_report = analyzer.generate_report()
    analyzer.plot_metrics()

    # Console summary of what was just written to the report file.
    print("\nData Quality Summary:")
    print(f"Total samples: {summary_report['summary']['total_samples']}")
    print(f"Total datasets: {summary_report['summary']['total_datasets']}")
    print("\nPer Dataset Summary:")
    for name, dataset_stats in summary_report["datasets"].items():
        print(f"\n{name}:")
        print(f"  Samples: {dataset_stats['total_samples']}")
        if "abstract" in dataset_stats:
            print(f"  Avg abstract length: {dataset_stats['abstract']['avg_length']:.1f}")
            print(f"  Avg words per abstract: {dataset_stats['abstract']['avg_word_count']:.1f}")

if __name__ == "__main__":
    main()
aniket.py ADDED
File without changes
api.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from typing import List, Optional
5
+ import uvicorn
6
+ import sys
7
+ import os
8
+
9
+ # Add src directory to Python path
10
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
11
+
12
+ # Import the medical generator
13
+ from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY
14
+
15
# FastAPI application instance; metadata shows up in the generated OpenAPI docs.
app = FastAPI(
    title="Synthex Medical Text Generator API",
    description="API for generating synthetic medical records",
    version="1.0.0"
)

# Add CORS middleware
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# disallowed by the CORS spec (browsers reject wildcard origins on
# credentialed requests) — confirm whether credentials are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Initialize the generator
# Shared, lazily-created instance: set by the startup hook, or on the first
# /generate request if startup initialization failed.
generator = None
32
+
33
class GenerationRequest(BaseModel):
    """Request body for POST /generate."""
    record_type: str  # one of the values returned by GET /record-types
    quantity: int = 1  # number of records to generate
    use_gemini: bool = False  # route generation through the Gemini API
    gemini_api_key: Optional[str] = None  # used only if the generator must be (re)initialized
    include_metadata: bool = True  # NOTE(review): accepted but not forwarded to generate_record — confirm intent
39
+
40
class GenerationResponse(BaseModel):
    """Response body for POST /generate."""
    records: List[dict]  # generated records, one dict per record
    total_generated: int  # equals len(records)
43
+
44
@app.on_event("startup")  # NOTE(review): on_event is deprecated in newer FastAPI; lifespan handlers replace it — confirm target version
async def startup_event():
    """Eagerly build the shared MedicalTextGenerator when the app starts.

    Failure is non-fatal: the error is printed and `generator` stays None,
    in which case /generate retries initialization lazily.
    """
    global generator
    try:
        generator = MedicalTextGenerator()
    except Exception as e:
        print(f"Error initializing generator: {str(e)}")
51
+
52
@app.get("/")
async def root():
    """Landing endpoint: returns a static welcome payload."""
    welcome = {"message": "Welcome to Synthex Medical Text Generator API"}
    return welcome
55
+
56
@app.post("/generate", response_model=GenerationResponse)
async def generate_records(request: GenerationRequest):
    """Generate ``request.quantity`` synthetic records of ``request.record_type``.

    Returns HTTP 500 if the generator cannot be initialized or if any
    record generation fails.
    """
    global generator

    # Lazy fallback: the startup hook may have failed or not run yet.
    if generator is None:
        try:
            generator = MedicalTextGenerator(gemini_api_key=request.gemini_api_key)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to initialize generator: {str(e)}")

    try:
        batch = [
            generator.generate_record(request.record_type, use_gemini=request.use_gemini)
            for _ in range(request.quantity)
        ]
        return GenerationResponse(records=batch, total_generated=len(batch))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
81
+
82
@app.get("/record-types")
async def get_record_types():
    """List the record types accepted by POST /generate."""
    supported = [
        "clinical_note",
        "discharge_summary",
        "lab_report",
        "prescription",
        "patient_intake"
    ]
    return {"record_types": supported}
93
+
94
# Run a local development server when executed directly (auto-reload enabled).
if __name__ == "__main__":
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
app.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generator - MVP Streamlit App
3
+ Deploy this on Hugging Face Spaces for free hosting
4
+ """
5
+
6
+ import streamlit as st
7
+ import json
8
+ import time
9
+ from datetime import datetime
10
+ import pandas as pd
11
+ import os
12
+ import sys
13
+ import logging
14
+
15
+ # Setup logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Add src directory to Python path
20
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
21
+
22
+ # Import the medical generator
23
+ from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY
24
+
25
+ # Page config
26
+ st.set_page_config(
27
+ page_title="Synthex Medical Text Generator",
28
+ page_icon="🏥",
29
+ layout="wide",
30
+ initial_sidebar_state="expanded"
31
+ )
32
+
33
+ # Custom CSS
34
+ st.markdown("""
35
+ <style>
36
+ /* Main container styling */
37
+ .main {
38
+ padding: 2rem;
39
+ background-color: #f8f9fa;
40
+ }
41
+
42
+ /* Header styling */
43
+ .main-header {
44
+ font-size: 2.5rem;
45
+ font-weight: bold;
46
+ color: #1f77b4;
47
+ text-align: center;
48
+ margin-bottom: 1rem;
49
+ padding: 1rem;
50
+ background: linear-gradient(135deg, #1f77b4 0%, #2c9cdb 100%);
51
+ color: white;
52
+ border-radius: 10px;
53
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
54
+ }
55
+
56
+ .sub-header {
57
+ font-size: 1.2rem;
58
+ color: #666;
59
+ text-align: center;
60
+ margin-bottom: 2rem;
61
+ padding: 0.5rem;
62
+ }
63
+
64
+ /* Card styling */
65
+ .record-container {
66
+ background-color: white;
67
+ padding: 1.5rem;
68
+ border-radius: 10px;
69
+ border-left: 4px solid #1f77b4;
70
+ margin: 1rem 0;
71
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
72
+ transition: transform 0.2s;
73
+ }
74
+
75
+ .record-container:hover {
76
+ transform: translateY(-2px);
77
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
78
+ }
79
+
80
+ /* Stats container styling */
81
+ .stats-container {
82
+ background-color: white;
83
+ padding: 1.5rem;
84
+ border-radius: 10px;
85
+ margin: 1rem 0;
86
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
87
+ }
88
+
89
+ /* Button styling */
90
+ .stButton>button {
91
+ width: 100%;
92
+ border-radius: 5px;
93
+ height: 3em;
94
+ font-weight: bold;
95
+ transition: all 0.3s;
96
+ }
97
+
98
+ .stButton>button:hover {
99
+ transform: translateY(-2px);
100
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
101
+ }
102
+
103
+ /* Metric styling */
104
+ .stMetric {
105
+ background-color: #f8f9fa;
106
+ padding: 1rem;
107
+ border-radius: 5px;
108
+ text-align: center;
109
+ }
110
+
111
+ /* Sidebar styling */
112
+ .sidebar .sidebar-content {
113
+ background-color: #f8f9fa;
114
+ }
115
+
116
+ /* Progress bar styling */
117
+ .stProgress > div > div {
118
+ background-color: #1f77b4;
119
+ }
120
+
121
+ /* Success message styling */
122
+ .stSuccess {
123
+ padding: 1rem;
124
+ border-radius: 5px;
125
+ background-color: #d4edda;
126
+ color: #155724;
127
+ margin: 1rem 0;
128
+ }
129
+
130
+ /* Error message styling */
131
+ .stError {
132
+ padding: 1rem;
133
+ border-radius: 5px;
134
+ background-color: #f8d7da;
135
+ color: #721c24;
136
+ margin: 1rem 0;
137
+ }
138
+
139
+ /* Expander styling */
140
+ .streamlit-expanderHeader {
141
+ font-size: 1.1rem;
142
+ font-weight: bold;
143
+ color: #1f77b4;
144
+ }
145
+
146
+ /* Text area styling */
147
+ .stTextArea textarea {
148
+ font-family: monospace;
149
+ font-size: 0.9rem;
150
+ line-height: 1.5;
151
+ }
152
+ </style>
153
+ """, unsafe_allow_html=True)
154
+
155
+ # Initialize session state
156
+ if 'generated_records' not in st.session_state:
157
+ st.session_state.generated_records = []
158
+ if 'total_generated' not in st.session_state:
159
+ st.session_state.total_generated = 0
160
+ if 'generator' not in st.session_state:
161
+ st.session_state.generator = None
162
+
163
+ # Header
164
+ st.markdown('<div class="main-header">🏥 Synthex Medical Text Generator</div>', unsafe_allow_html=True)
165
+ st.markdown('<div class="sub-header">Generate synthetic medical records for AI training and testing</div>', unsafe_allow_html=True)
166
+
167
+ # Add a status message area
168
+ status_area = st.empty()
169
+
170
+ # Sidebar
171
+ with st.sidebar:
172
+ st.markdown("### ⚙️ Configuration")
173
+
174
+ # API Key section
175
+ with st.expander("🔑 API Settings", expanded=False):
176
+ gemini_api_key = st.text_input(
177
+ "Gemini API Key",
178
+ value=os.getenv('GEMINI_API_KEY', ''),
179
+ type="password",
180
+ help="Enter your Google Gemini API key for better generation quality"
181
+ )
182
+
183
+ # Record settings
184
+ st.markdown("### 📝 Record Settings")
185
+ record_type = st.selectbox(
186
+ "Select Record Type",
187
+ ["clinical_note", "discharge_summary", "lab_report", "prescription", "patient_intake"],
188
+ format_func=lambda x: x.replace("_", " ").title()
189
+ )
190
+
191
+ quantity = st.slider("Number of Records", 1, 20, 5)
192
+
193
+ # Generation settings
194
+ st.markdown("### 🤖 Generation Settings")
195
+ use_gemini = st.checkbox(
196
+ "Use Gemini API",
197
+ value=False,
198
+ help="Uses Google Gemini API for better quality generation"
199
+ )
200
+
201
+ # Advanced options
202
+ with st.expander("⚡ Advanced Options"):
203
+ include_metadata = st.checkbox("Include Metadata", value=True)
204
+ export_format = st.selectbox("Export Format", ["JSON", "CSV", "TXT"])
205
+
206
+ # Main content with better organization
207
+ col1, col2 = st.columns([2, 1])
208
+
209
+ with col1:
210
+ st.markdown("### 📝 Generate Records")
211
+
212
+ # Generation button with better styling
213
+ if st.button("🚀 Generate Records", type="primary", use_container_width=True):
214
+ status_area.info("Initializing generator...")
215
+
216
+ # Initialize generator if not already done
217
+ if st.session_state.generator is None:
218
+ try:
219
+ with st.spinner("Initializing medical text generator..."):
220
+ st.session_state.generator = MedicalTextGenerator(gemini_api_key=gemini_api_key)
221
+ status_area.success("Generator initialized successfully!")
222
+ except Exception as e:
223
+ status_area.error(f"Error initializing generator: {str(e)}")
224
+ st.stop()
225
+
226
+ # Generate records with progress
227
+ progress_bar = st.progress(0)
228
+ status_text = st.empty()
229
+
230
+ generated_records = []
231
+
232
+ for i in range(quantity):
233
+ status_text.text(f"Generating record {i+1} of {quantity}...")
234
+ progress_bar.progress((i + 1) / quantity)
235
+
236
+ try:
237
+ record = st.session_state.generator.generate_record(record_type, use_gemini=use_gemini)
238
+ generated_records.append(record)
239
+
240
+ # Rate limiting
241
+ if use_gemini:
242
+ time.sleep(1)
243
+
244
+ except Exception as e:
245
+ logger.error(f"Failed to generate record {i+1}: {str(e)}")
246
+ status_area.error(f"Failed to generate record {i+1}: {str(e)}")
247
+ continue
248
+
249
+ # Update session state
250
+ if generated_records:
251
+ st.session_state.generated_records.extend(generated_records)
252
+ st.session_state.total_generated += len(generated_records)
253
+
254
+ status_text.text("✅ Generation complete!")
255
+ progress_bar.progress(1.0)
256
+
257
+ status_area.success(f"Successfully generated {len(generated_records)} medical records!")
258
+
259
+ # Display generated records with better organization
260
+ if st.session_state.generated_records:
261
+ st.markdown("### 📋 Generated Records")
262
+
263
+ # Filters with better layout
264
+ col_filter1, col_filter2 = st.columns(2)
265
+ with col_filter1:
266
+ filter_type = st.selectbox(
267
+ "Filter by Type",
268
+ ["All"] + list(set([r.get('type', 'Unknown') for r in st.session_state.generated_records]))
269
+ )
270
+ with col_filter2:
271
+ records_per_page = st.selectbox("Records per page", [5, 10, 20, 50])
272
+
273
+ # Filter records
274
+ filtered_records = st.session_state.generated_records
275
+ if filter_type != "All":
276
+ filtered_records = [r for r in filtered_records if r.get('type', 'Unknown') == filter_type]
277
+
278
+ # Pagination
279
+ total_records = len(filtered_records)
280
+ total_pages = (total_records - 1) // records_per_page + 1
281
+
282
+ if total_pages > 1:
283
+ page = st.selectbox("Page", range(1, total_pages + 1))
284
+ start_idx = (page - 1) * records_per_page
285
+ end_idx = start_idx + records_per_page
286
+ page_records = filtered_records[start_idx:end_idx]
287
+ else:
288
+ page_records = filtered_records
289
+
290
+ # Display records with better styling
291
+ for i, record in enumerate(page_records):
292
+ with st.expander(f"Record {record.get('id', 'Unknown')} - {record.get('type', 'Unknown').replace('_', ' ').title()}"):
293
+ if include_metadata:
294
+ col_meta1, col_meta2, col_meta3 = st.columns(3)
295
+ with col_meta1:
296
+ st.metric("Type", record.get('type', 'Unknown').replace('_', ' ').title())
297
+ with col_meta2:
298
+ st.metric("Generated", record.get('timestamp', 'N/A'))
299
+ with col_meta3:
300
+ st.metric("Source", record.get('source', 'Hugging Face'))
301
+
302
+ st.markdown('<div class="record-container">', unsafe_allow_html=True)
303
+ st.text_area("Content", record.get('text', 'No content available'), height=200, key=f"record_{i}")
304
+ st.markdown('</div>', unsafe_allow_html=True)
305
+
306
+ with col2:
307
+ st.markdown("### 📊 Statistics")
308
+
309
+ # Stats container with better styling
310
+ st.markdown('<div class="stats-container">', unsafe_allow_html=True)
311
+
312
+ # Total records
313
+ st.metric("Total Records Generated", st.session_state.total_generated)
314
+
315
+ # Record type distribution with better visualization
316
+ if st.session_state.generated_records:
317
+ type_counts = pd.Series([r.get('type', 'Unknown') for r in st.session_state.generated_records]).value_counts()
318
+ st.markdown("#### Record Type Distribution")
319
+ st.bar_chart(type_counts)
320
+
321
+ # Export options with better organization
322
+ st.markdown("#### 💾 Export Data")
323
+ if st.session_state.generated_records:
324
+ if export_format == "JSON":
325
+ json_str = json.dumps(st.session_state.generated_records, indent=2)
326
+ st.download_button(
327
+ "📥 Download JSON",
328
+ json_str,
329
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
330
+ mime="application/json",
331
+ use_container_width=True
332
+ )
333
+ elif export_format == "CSV":
334
+ df = pd.DataFrame(st.session_state.generated_records)
335
+ csv = df.to_csv(index=False)
336
+ st.download_button(
337
+ "📥 Download CSV",
338
+ csv,
339
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
340
+ mime="text/csv",
341
+ use_container_width=True
342
+ )
343
+ elif export_format == "TXT":
344
+ txt = "\n\n".join([f"Record {r.get('id', 'Unknown')} ({r.get('type', 'Unknown')}):\n{r.get('text', 'No content available')}" for r in st.session_state.generated_records])
345
+ st.download_button(
346
+ "📥 Download TXT",
347
+ txt,
348
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
349
+ mime="text/plain",
350
+ use_container_width=True
351
+ )
352
+
353
+ st.markdown('</div>', unsafe_allow_html=True)
354
+
355
+ # Add a footer
356
+ st.markdown("---")
357
+ st.markdown("""
358
+ <div style='text-align: center; color: #666;'>
359
+ <p>Built with ❤️ using Streamlit | Synthex Medical Text Generator</p>
360
+ </div>
361
+ """, unsafe_allow_html=True)
batch_generate.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Batch-generate 100 synthetic medical records and save them as JSON.

Run from the repository root so the relative output path and the
``src.generation`` import resolve.
"""

import os
import json
import random
import sys
import time
from pathlib import Path
from src.generation.medical_generator import MedicalTextGenerator

# Check for Gemini API key.
# NOTE(review): generation below runs with use_gemini=False, so the key is
# never actually consumed here -- confirm whether this hard requirement is
# intentional or a leftover.
if not os.getenv('GEMINI_API_KEY'):
    print("Please set the GEMINI_API_KEY environment variable:")
    print("Windows PowerShell: $env:GEMINI_API_KEY='your-api-key-here'")
    print("Windows CMD: set GEMINI_API_KEY=your-api-key-here")
    # Fix: use sys.exit instead of the site-supplied exit() builtin, which is
    # not guaranteed to exist when a script runs without the site module.
    sys.exit(1)

# Ensure the output directory exists
output_dir = Path("data/synthetic")
output_dir.mkdir(parents=True, exist_ok=True)

# Initialize the generator
generator = MedicalTextGenerator()

# Define supported record types (using the keys from the generator's templates)
record_types = ["clinical_note", "discharge_summary", "lab_report"]

# Generate 100 mixed records
records = []
for i in range(100):
    # Randomly select record type
    record_type = random.choice(record_types)

    # Generate record using Hugging Face
    try:
        record = generator.generate_record(record_type, use_gemini=False)
        print(f"Generated record {i+1}/100: {record_type}")

        # Append record details
        records.append({
            "id": i + 1,
            "type": record_type,
            "content": record,
            "generator": "Hugging Face",
            "generated_at": time.strftime("%Y-%m-%d %H:%M:%S")
        })

        # Respect rate limits (e.g., 4 seconds between calls)
        time.sleep(4)

    except Exception as e:
        # Best-effort batch: log the failure and keep generating.
        print(f"Error generating record {i+1}: {str(e)}")
        continue

# Save records to a JSON file.
# Fix: write with an explicit UTF-8 encoding so the output does not depend on
# the platform's default codec (medical text may contain non-ASCII characters).
output_file = output_dir / "synthetic_records.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2)

print(f"\nGenerated {len(records)} records and saved to {output_file}")
data/processed/.gitkeep ADDED
File without changes
data/reports/plots/sample_distribution.png ADDED
data/synthetic/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML libraries
2
+ torch==2.2.1
3
+ transformers==4.38.2
4
+ datasets>=2.12.0
5
+ huggingface_hub>=0.15.0
6
+
7
+ # Web framework
8
+ streamlit==1.32.0
9
+ gradio>=3.35.0
10
+ fastapi>=0.115.2
11
+ uvicorn>=0.24.0
12
+
13
+ # Data processing
14
+ pandas==2.2.1
15
+ numpy==1.26.4
16
+ requests>=2.31.0
17
+ beautifulsoup4>=4.12.0
18
+ lxml>=4.9.0
19
+
20
+ # Medical NLP
21
+ spacy>=3.6.0
22
+ scikit-learn>=1.3.0
23
+
24
+ # API integration
25
+ google-generativeai==0.3.2
26
+
27
+ # Utilities
28
+ python-dotenv==1.0.1
29
+ tqdm>=4.65.0
setup.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import subprocess
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ # Setup logging
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
def check_python_version():
    """Abort the process unless the interpreter is Python 3.11 or newer."""
    supported = sys.version_info >= (3, 11)
    if not supported:
        logger.error("Python 3.11 or higher is required")
        sys.exit(1)
    logger.info(f"Python version {sys.version_info.major}.{sys.version_info.minor} detected")
17
+
18
def create_virtual_environment():
    """Create the ``synthex_env`` virtual environment unless it already exists."""
    venv_name = "synthex_env"
    # Guard clause: nothing to do when the venv directory is already present.
    if os.path.exists(venv_name):
        logger.info(f"Virtual environment {venv_name} already exists")
        return
    logger.info(f"Creating virtual environment: {venv_name}")
    subprocess.run([sys.executable, "-m", "venv", venv_name], check=True)
26
+
27
def install_requirements():
    """Install the packages listed in requirements.txt.

    Fix: the original always installed into the interpreter running this
    script (``sys.executable``), so the packages never landed inside the
    ``synthex_env`` virtual environment that create_virtual_environment()
    just built. Prefer the venv's own interpreter when it exists, falling
    back to the current one otherwise.

    Raises:
        subprocess.CalledProcessError: if pip exits non-zero (check=True).
    """
    # Locate the venv interpreter (layout differs between Windows and POSIX).
    if os.name == "nt":
        venv_python = Path("synthex_env") / "Scripts" / "python.exe"
    else:
        venv_python = Path("synthex_env") / "bin" / "python"
    python = str(venv_python) if venv_python.exists() else sys.executable

    logger.info("Installing requirements...")
    subprocess.run([python, "-m", "pip", "install", "-r", "requirements.txt"], check=True)
31
+
32
def create_directories():
    """Create the data directories the pipeline writes into (idempotent)."""
    for directory in ("data/raw", "data/processed", "data/reports", "data/reports/plots"):
        Path(directory).mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory: {directory}")
43
+
44
def setup_environment():
    """Run the full bootstrap: version check, venv, dependencies, directories.

    Exits the process with status 1 on any failure.
    """
    try:
        logger.info("Starting environment setup...")

        # The four setup phases, in dependency order.
        for step in (check_python_version,
                     create_virtual_environment,
                     install_requirements,
                     create_directories):
            step()

        logger.info("Environment setup completed successfully!")
        logger.info("\nNext steps:")
        logger.info("1. Activate the virtual environment:")
        logger.info(" - Windows: synthex_env\\Scripts\\activate")
        logger.info(" - Unix/MacOS: source synthex_env/bin/activate")
        logger.info("2. Run data collection: python setup_data.py")
        logger.info("3. Analyze data quality: python analyze_data_quality.py")

    except subprocess.CalledProcessError as e:
        # A subprocess (venv creation or pip) reported failure.
        logger.error(f"Error during setup: {str(e)}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
        sys.exit(1)
75
+
76
# Script entry point: run the full environment bootstrap when executed directly.
if __name__ == "__main__":
    setup_environment()
setup_data.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+ import logging
5
+ import subprocess
6
+
7
+ # Add src directory to Python path
8
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
9
+
10
+ # Setup logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(levelname)s - %(message)s'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
def setup_data_directories():
    """Ensure the raw/processed/synthetic data folders exist, each with a .gitkeep."""
    for name in ("data/raw", "data/processed", "data/synthetic"):
        folder = Path(name)
        folder.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory: {name}")

        # Keep the otherwise-empty directory tracked by git.
        (folder / ".gitkeep").touch(exist_ok=True)
        logger.info(f"Created .gitkeep in {name}")
34
+
35
def main():
    """Create the data folders, then launch the collection script in a child process."""
    logger.info("Setting up data directories...")
    setup_data_directories()

    logger.info("Running data collection script via subprocess...")
    proc = subprocess.run([sys.executable, 'src/data_collection/data_collection.py'])

    if proc.returncode == 0:
        logger.info("Data collection completed successfully.")
    else:
        logger.error(f"Data collection script failed with exit code {proc.returncode}")
+
47
# Script entry point: bootstrap data directories and run the collection pipeline.
if __name__ == "__main__":
    main()
src/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generator
3
+ A tool for generating synthetic medical records
4
+ """
5
+
6
+ __version__ = "0.1.0"
src/api/app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.responses import FileResponse
5
+ from pydantic import BaseModel
6
+ from typing import List, Optional
7
+ import sys
8
+ import os
9
+ import logging
10
+
11
+ # Add src directory to Python path
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from generation.medical_generator import MedicalTextGenerator
15
+
16
+ # Setup logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
# FastAPI application object: also serves the static HTML UI (see read_root).
app = FastAPI(
    title="Synthex Medical Text Generator API",
    description="API for generating synthetic medical records",
    version="1.0.0"
)

# Add CORS middleware
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec (browsers will not honour a wildcard origin when
# credentials are enabled) -- replace with an explicit origin list before
# production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Mount static files (the directory path is relative to the working directory,
# so the server must be started from the repository root).
app.mount("/static", StaticFiles(directory="src/web"), name="static")

# Initialize generator: a single module-level instance shared by every request
# handler below.
generator = MedicalTextGenerator()
40
+
41
class GenerationRequest(BaseModel):
    """Request body for POST /generate."""

    record_type: str  # must be one of generator.templates' keys (validated in handler)
    quantity: int = 1  # number of records to generate; handler enforces 1..10
    use_gemini: bool = False  # route generation through the Gemini backend
    include_metadata: bool = True  # NOTE(review): not read by the /generate handler -- confirm intent
46
+
47
class MedicalRecord(BaseModel):
    """A single synthetic medical record as returned in a GenerationResponse."""

    id: str
    type: str  # record type key, e.g. "clinical_note"
    text: str  # the generated record body
    timestamp: str  # generation time; format set by the generator -- presumably ISO-like, verify
    source: str  # which backend produced the record
53
+
54
class GenerationResponse(BaseModel):
    """Response body for POST /generate."""

    records: List[MedicalRecord]
    total_generated: int  # equals len(records)
57
+
58
+ @app.get("/")
59
+ async def read_root():
60
+ """Serve the HTML interface"""
61
+ return FileResponse("src/web/index.html")
62
+
63
+ @app.get("/record-types")
64
+ async def get_record_types():
65
+ """Get available record types"""
66
+ return {"record_types": list(generator.templates.keys())}
67
+
68
+ @app.post("/generate", response_model=GenerationResponse)
69
+ async def generate_records(request: GenerationRequest):
70
+ """Generate synthetic medical records"""
71
+ try:
72
+ if request.record_type not in generator.templates:
73
+ raise HTTPException(status_code=400, detail=f"Invalid record type. Available types: {list(generator.templates.keys())}")
74
+
75
+ if request.quantity < 1 or request.quantity > 10:
76
+ raise HTTPException(status_code=400, detail="Quantity must be between 1 and 10")
77
+
78
+ records = generator.batch_generate(
79
+ record_type=request.record_type,
80
+ count=request.quantity,
81
+ use_gemini=request.use_gemini
82
+ )
83
+
84
+ return {
85
+ "records": records,
86
+ "total_generated": len(records)
87
+ }
88
+
89
+ except Exception as e:
90
+ logger.error(f"Error generating records: {str(e)}")
91
+ raise HTTPException(status_code=500, detail=str(e))
92
+
93
+ if __name__ == "__main__":
94
+ import uvicorn
95
+ uvicorn.run(app, host="0.0.0.0", port=8000)
src/data_collection/data_collection.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Medical Data Collection Pipeline for Synthex MVP
3
+ Collects medical text from free sources for training data
4
+ """
5
+
6
+ import requests
7
+ import pandas as pd
8
+ from datasets import load_dataset
9
+ import time
10
+ import json
11
+ from pathlib import Path
12
+ from typing import List, Dict, Any
13
+ import logging
14
+ import sys
15
+ from tqdm import tqdm
16
+ from bs4 import BeautifulSoup
17
+ import re
18
+ from datetime import datetime
19
+
20
+ # Setup logging
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format='%(asctime)s - %(levelname)s - %(message)s',
24
+ handlers=[
25
+ logging.StreamHandler(sys.stdout),
26
+ logging.FileHandler('data_collection.log')
27
+ ]
28
+ )
29
+ logger = logging.getLogger(__name__)
30
+
31
class MedicalDataCollector:
    """Collects medical text from Hugging Face datasets and PubMed.

    Side effects: writes per-source JSON files under ``output_dir``, a merged
    training CSV under ``<output_dir>/../processed`` and a run report under
    ``<output_dir>/../reports``. Collection statistics accumulate in
    ``self.stats`` and are serialised by :meth:`generate_report`.
    """

    def __init__(self, output_dir: str = "data/raw"):
        """Create the output directory and initialise the stats accumulator."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Running statistics; datetime values are converted to strings by
        # generate_report() before JSON serialisation.
        self.stats = {
            "total_samples": 0,
            "sources": {},
            "errors": [],
            "start_time": datetime.now()
        }
        logger.info(f"Initialized MedicalDataCollector with output directory: {self.output_dir}")

    def collect_huggingface_datasets(self) -> Dict[str, List]:
        """Collect medical datasets from Hugging Face Hub.

        Returns a mapping of dataset key -> list of processed items; each
        dataset is also written to ``<output_dir>/<key>.json``. Failures are
        logged into ``self.stats["errors"]`` and do not abort the run.
        """

        # Only include datasets that are known to exist and are medical-related.
        # A tuple entry means the dataset needs an explicit config name.
        datasets_to_collect = [
            "medical_questions_pairs",
            "medalpaca/medical_meadow_medical_flashcards",
            "gamino/wiki_medical_terms",
            ("pubmed_qa", "pqa_artificial")  # pubmed_qa requires a config
        ]

        collected_data = {}

        for dataset_entry in tqdm(datasets_to_collect, desc="Collecting Hugging Face datasets"):
            try:
                if isinstance(dataset_entry, tuple):
                    dataset_name, config = dataset_entry
                    logger.info(f"Loading dataset: {dataset_name} with config: {config}")
                    dataset = load_dataset(dataset_name, config, split="train")
                    dataset_key = f"{dataset_name}_{config}"
                else:
                    dataset_name = dataset_entry
                    logger.info(f"Loading dataset: {dataset_name}")
                    dataset = load_dataset(dataset_name, split="train")
                    dataset_key = dataset_name

                # Convert to list of dictionaries, dropping items that fail
                # validation/cleaning in _process_dataset_item.
                data_list = []
                for item in dataset:
                    processed_item = self._process_dataset_item(item)
                    if processed_item:
                        data_list.append(processed_item)

                if data_list:
                    collected_data[dataset_key] = data_list
                    self.stats["sources"][dataset_key] = len(data_list)
                    self.stats["total_samples"] += len(data_list)

                    # Save to file ('/' in dataset names is not path-safe).
                    output_file = self.output_dir / f"{dataset_key.replace('/', '_')}.json"
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(data_list, f, indent=2, ensure_ascii=False)

                    logger.info(f"Saved {len(data_list)} samples from {dataset_key} to {output_file}")
                else:
                    logger.warning(f"No valid data found in dataset: {dataset_key}")

                time.sleep(1)  # Be respectful to APIs

            except Exception as e:
                # Best-effort collection: record the failure and move on.
                error_msg = f"Failed to load {dataset_entry}: {str(e)}"
                logger.error(error_msg, exc_info=True)
                self.stats["errors"].append(error_msg)
                continue

        return collected_data

    def collect_pubmed_abstracts(self, queries: List[str] = None, max_results: int = 1000) -> List[Dict]:
        """Collect PubMed abstracts via API.

        Args:
            queries: search terms; defaults to a fixed list of clinical topics.
            max_results: per-query cap passed to the esearch endpoint.

        Returns the combined abstract list, also saved to
        ``<output_dir>/pubmed_abstracts.json`` when non-empty.
        """

        if queries is None:
            queries = [
                "clinical notes",
                "medical case reports",
                "patient discharge summaries",
                "medical laboratory reports",
                "medical imaging reports"
            ]

        all_abstracts = []

        for query in tqdm(queries, desc="Collecting PubMed abstracts"):
            try:
                abstracts = self._collect_pubmed_query(query, max_results)
                all_abstracts.extend(abstracts)
                self.stats["sources"]["pubmed_" + query.replace(" ", "_")] = len(abstracts)
                self.stats["total_samples"] += len(abstracts)

            except Exception as e:
                error_msg = f"Failed to collect PubMed abstracts for {query}: {str(e)}"
                logger.error(error_msg)
                self.stats["errors"].append(error_msg)
                continue

        # Save all abstracts
        if all_abstracts:
            output_file = self.output_dir / "pubmed_abstracts.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_abstracts, f, indent=2, ensure_ascii=False)

        return all_abstracts

    def _collect_pubmed_query(self, query: str, max_results: int) -> List[Dict]:
        """Collect PubMed abstracts for a specific query.

        Uses NCBI E-utilities: esearch for IDs, then efetch in batches of 100.
        Returns [] on search failure; fetch failures skip only their batch.
        """

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        search_url = f"{base_url}esearch.fcgi"

        search_params = {
            "db": "pubmed",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
            "sort": "relevance"
        }

        try:
            response = requests.get(search_url, params=search_params)
            response.raise_for_status()  # Raise exception for bad status codes
            search_results = response.json()

            # Check rate limits
            # NOTE(review): E-utilities does not document X-RateLimit-* headers;
            # these defaults ('3'/'0') may always apply -- verify against real responses.
            rate_limit = int(response.headers.get('X-RateLimit-Limit', '3'))
            rate_remaining = int(response.headers.get('X-RateLimit-Remaining', '0'))
            logger.info(f"Rate limit: {rate_remaining}/{rate_limit} requests remaining")

            if rate_remaining <= 1:
                logger.warning("Rate limit nearly reached, waiting 60 seconds")
                time.sleep(60)

        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch PubMed search results for query '{query}': {str(e)}")
            return []
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse PubMed search results for query '{query}': {str(e)}")
            return []

        if "esearchresult" not in search_results:
            logger.warning(f"No search results found for query '{query}'")
            return []

        id_list = search_results["esearchresult"]["idlist"]
        abstracts = []
        batch_size = 100

        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i+batch_size]
            ids_str = ",".join(batch_ids)

            fetch_url = f"{base_url}efetch.fcgi"
            fetch_params = {
                "db": "pubmed",
                "id": ids_str,
                "retmode": "xml"
            }

            try:
                response = requests.get(fetch_url, params=fetch_params)
                response.raise_for_status()

                # Check rate limits
                rate_limit = int(response.headers.get('X-RateLimit-Limit', '3'))
                rate_remaining = int(response.headers.get('X-RateLimit-Remaining', '0'))
                logger.info(f"Rate limit: {rate_remaining}/{rate_limit} requests remaining")

                if rate_remaining <= 1:
                    logger.warning("Rate limit nearly reached, waiting 60 seconds")
                    time.sleep(60)

                # Parse XML with proper features
                # NOTE(review): BeautifulSoup's second positional argument IS
                # `features`, so passing features="xml" as well raises
                # TypeError ("got multiple values"); the except below then
                # skips EVERY batch. Likely intended:
                # BeautifulSoup(response.text, 'xml'). Confirm and fix.
                soup = BeautifulSoup(response.text, 'lxml', features="xml")

            except requests.exceptions.RequestException as e:
                logger.error(f"Failed to fetch PubMed article batch {i//batch_size + 1}: {str(e)}")
                continue
            except Exception as e:
                logger.error(f"Failed to parse PubMed article batch {i//batch_size + 1}: {str(e)}")
                continue

            for article in soup.find_all('PubmedArticle'):
                try:
                    abstract = article.find('Abstract')
                    if abstract:
                        abstract_text = abstract.get_text().strip()
                        if len(abstract_text) > 100:  # Filter out very short abstracts
                            title = article.find('ArticleTitle')
                            if not title:
                                continue
                            title_text = title.get_text().strip()

                            pub_date = article.find('PubDate')
                            year = "Unknown"
                            if pub_date and pub_date.find('Year'):
                                year = pub_date.find('Year').get_text().strip()

                            abstracts.append({
                                "title": title_text,
                                "abstract": abstract_text,
                                "year": year,
                                "source": "pubmed",
                                "query": query
                            })
                except Exception as e:
                    logger.debug(f"Failed to process article in batch {i//batch_size + 1}: {str(e)}")
                    continue

            # Always wait between batches to respect rate limits
            time.sleep(1)

        logger.info(f"Collected {len(abstracts)} abstracts for query '{query}'")
        return abstracts

    def create_training_dataset(self) -> pd.DataFrame:
        """Combine all collected data into training dataset.

        Reads every ``*.json`` in ``output_dir``, cleans and type-tags each
        item, filters by length (100 < len < 5000), de-duplicates, and writes
        ``../processed/training_data.csv``. Also sets ``final_samples`` and
        ``text_types`` in ``self.stats``.
        """

        all_texts = []

        # Load all collected datasets
        for json_file in tqdm(list(self.output_dir.glob("*.json")), desc="Processing collected data"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Extract text content
                for item in data:
                    text_content = self._extract_text_content(item)
                    if text_content:
                        processed_text = self._clean_text(text_content)
                        if processed_text:
                            all_texts.append({
                                "text": processed_text,
                                "source": json_file.stem,
                                "length": len(processed_text),
                                "type": self._determine_text_type(processed_text)
                            })

            except Exception as e:
                error_msg = f"Failed to process {json_file}: {str(e)}"
                logger.error(error_msg)
                self.stats["errors"].append(error_msg)
                continue

        # Create DataFrame
        df = pd.DataFrame(all_texts)

        # Basic filtering
        df = df[df['length'] > 100]  # Remove very short texts
        df = df[df['length'] < 5000]  # Remove very long texts

        # Remove duplicates
        df = df.drop_duplicates(subset=['text'])

        # Save processed dataset
        output_file = self.output_dir.parent / "processed" / "training_data.csv"
        output_file.parent.mkdir(exist_ok=True)
        df.to_csv(output_file, index=False, encoding='utf-8')

        # Update stats
        self.stats["final_samples"] = len(df)
        self.stats["text_types"] = df['type'].value_counts().to_dict()

        logger.info(f"Created training dataset with {len(df)} samples")
        return df

    def _process_dataset_item(self, item: Dict) -> Dict:
        """Process and validate a dataset item.

        Returns a cleaned dict with "text"/"source"/"type" (plus any of
        title/question/answer/instruction found on the item), or None when
        the item has no usable text (< 100 chars) or processing fails.
        """
        try:
            # Extract text content
            text = self._extract_text_content(item)
            if not text or len(text) < 100:
                return None

            # Clean text
            cleaned_text = self._clean_text(text)
            if not cleaned_text:
                return None

            # Create processed item
            processed = {
                "text": cleaned_text,
                "source": "huggingface",
                "type": self._determine_text_type(cleaned_text)
            }

            # Add metadata if available
            for key in ['title', 'question', 'answer', 'instruction']:
                if key in item:
                    processed[key] = str(item[key])

            return processed

        except Exception:
            # Deliberately silent: a bad item is simply dropped.
            return None

    def _extract_text_content(self, item: Dict) -> str:
        """Extract relevant text content from dataset item.

        Tries well-known text fields first (first match wins); otherwise
        concatenates every string value longer than 20 characters.
        """

        # Common text fields in medical datasets
        text_fields = ['text', 'content', 'abstract', 'question', 'answer',
                       'instruction', 'output', 'input', 'context']

        for field in text_fields:
            if field in item and item[field]:
                return str(item[field])

        # Fallback: combine multiple fields
        combined_text = ""
        for key, value in item.items():
            if isinstance(value, str) and len(value) > 20:
                combined_text += f"{value} "

        return combined_text.strip()

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Strips unusual punctuation, collapses whitespace, and removes
        URLs and email addresses. Returns "" for falsy input.
        """
        if not text:
            return ""

        # Remove special characters and normalize whitespace
        text = re.sub(r'[^\w\s.,;:!?()-]', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        # Remove common noise
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        return text.strip()

    def _determine_text_type(self, text: str) -> str:
        """Determine the type of medical text by keyword matching.

        First matching category wins; anything unmatched defaults to
        'clinical_note'.
        """
        text = text.lower()

        if any(term in text for term in ['discharge', 'summary', 'discharge summary']):
            return 'discharge_summary'
        elif any(term in text for term in ['lab', 'laboratory', 'test results']):
            return 'lab_report'
        elif any(term in text for term in ['prescription', 'medication', 'drug']):
            return 'prescription'
        elif any(term in text for term in ['question', 'answer', 'qa']):
            return 'medical_qa'
        else:
            return 'clinical_note'

    def generate_report(self) -> Dict:
        """Generate a report of the data collection process.

        Mutates self.stats in place (datetimes -> strings, adds end_time and
        duration), writes it to ../reports/collection_report.json, and
        returns it.
        """
        # Convert all datetime objects to strings
        for k, v in self.stats.items():
            if isinstance(v, datetime):
                self.stats[k] = str(v)
        self.stats["end_time"] = str(datetime.now())
        if isinstance(self.stats["start_time"], datetime):
            self.stats["start_time"] = str(self.stats["start_time"])
        # Calculate duration as string (str(datetime) round-trips through
        # fromisoformat, so this normally succeeds).
        try:
            start_dt = datetime.fromisoformat(self.stats["start_time"])
            end_dt = datetime.fromisoformat(self.stats["end_time"])
            self.stats["duration"] = str(end_dt - start_dt)
        except Exception:
            self.stats["duration"] = "unknown"

        report_file = self.output_dir.parent / "reports" / "collection_report.json"
        report_file.parent.mkdir(exist_ok=True)

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2, ensure_ascii=False)

        return self.stats
401
+
402
def main():
    """Run data collection pipeline"""

    try:
        collector = MedicalDataCollector()

        # Phase 1: Hugging Face datasets.
        logger.info("Starting Hugging Face dataset collection...")
        collector.collect_huggingface_datasets()

        # Phase 2: PubMed abstracts.
        logger.info("Starting PubMed collection...")
        collector.collect_pubmed_abstracts()

        # Phase 3: merge, clean and filter into the training CSV.
        logger.info("Creating training dataset...")
        collector.create_training_dataset()

        # Persist and summarise the run.
        summary = collector.generate_report()

        logger.info("\nData Collection Summary:")
        logger.info(f"Total samples collected: {summary['total_samples']}")
        logger.info(f"Final training samples: {summary['final_samples']}")
        logger.info(f"Duration: {summary['duration']}")
        logger.info("\nText types distribution:")
        for kind, count in summary['text_types'].items():
            logger.info(f"- {kind}: {count}")

        if summary['errors']:
            logger.warning(f"\nEncountered {len(summary['errors'])} errors during collection")

    except Exception as e:
        logger.error(f"Data collection failed: {str(e)}", exc_info=True)
        sys.exit(1)
438
+
439
# Script entry point: run the full collection pipeline when executed directly.
if __name__ == "__main__":
    main()
src/generation/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generation Package
3
+ """
4
+
5
+ from .medical_generator import MedicalTextGenerator
6
+
7
+ __all__ = ['MedicalTextGenerator']
src/generation/medical_generator.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Basic Medical Text Generator for Synthex MVP
3
+ Uses Hugging Face models and Gemini API
4
+ """
5
+
6
+ import google.generativeai as genai
7
+ from transformers import pipeline
8
+ import random
9
+ import time
10
+ import json
11
+ from typing import List, Dict, Optional
12
+ import logging
13
+ from datetime import datetime
14
+ import os
15
+ import sys
16
+
17
+ # Setup logging with better formatting
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
21
+ handlers=[
22
+ logging.StreamHandler(sys.stdout)
23
+ ]
24
+ )
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # Get Gemini API key from environment variable
28
+ DEFAULT_GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', '')
29
+
30
class MedicalTextGenerator:
    """Generate synthetic, fully fictional medical records.

    Backends are tried per record in order of preference:
      1. Google Gemini API (when a key is configured and ``use_gemini=True``)
      2. Local Hugging Face text-generation pipeline (distilgpt2)
      3. Template filling with random fictional values (always available)

    The returned record dict reports which backend actually produced the text.
    """

    def __init__(self, gemini_api_key: Optional[str] = None):
        """Initialize the medical text generator.

        Args:
            gemini_api_key: Optional Gemini API key; falls back to the
                GEMINI_API_KEY environment variable when omitted.
        """
        self._log = logging.getLogger(__name__)

        # Explicit argument wins; otherwise read the environment (same
        # resolution as the module-level DEFAULT_GEMINI_API_KEY constant).
        self.gemini_api_key = gemini_api_key or os.getenv('GEMINI_API_KEY', '')
        if not self.gemini_api_key:
            self._log.warning("No Gemini API key provided. Using Hugging Face model only.")

        self.hf_model = None       # kept for backward compatibility with older callers
        self.gemini_model = None

        # Initialize generation backends (sets self.hf_generator / self.gemini_model).
        self._setup_models()

        # Record templates keyed by record type; the keys double as the
        # public list of supported record types.
        self.templates = {
            "clinical_note": self._get_clinical_note_template(),
            "discharge_summary": self._get_discharge_summary_template(),
            "lab_report": self._get_lab_report_template(),
            "prescription": self._get_prescription_template(),
            "patient_intake": self._get_patient_intake_template(),
        }

    def _setup_models(self):
        """Set up the Hugging Face pipeline and the Gemini client.

        Each backend that cannot be initialized is logged and left as None;
        generation then degrades gracefully to the next backend.
        """
        try:
            self._log.info("Loading Hugging Face medical model...")
            # Small model on CPU (device=-1) keeps this usable on free-tier hosts.
            self.hf_generator = pipeline(
                "text-generation",
                model="distilgpt2",
                max_length=512,
                do_sample=True,
                temperature=0.7,
                device=-1,        # force CPU usage to avoid CUDA issues
                truncation=True,  # silence tokenizer length warnings
            )
            self._log.info("Hugging Face model loaded successfully")
        except Exception as e:
            self._log.error(f"Failed to load Hugging Face model: {str(e)}")
            self.hf_generator = None
            self._log.info("Falling back to template-based generation")

        try:
            if self.gemini_api_key:
                genai.configure(api_key=self.gemini_api_key)
                # Log the models this key can reach, to aid debugging.
                for m in genai.list_models():
                    self._log.info(f"Available model: {m.name}")
                self.gemini_model = genai.GenerativeModel('gemini-pro')
                self._log.info("Gemini model loaded successfully")
        except Exception as e:
            self._log.error(f"Failed to load Gemini model: {str(e)}")
            self.gemini_model = None
            self._log.info("Gemini API will not be available")

    def generate_record(self, record_type: str, use_gemini: bool = False) -> Dict:
        """Generate one synthetic medical record.

        Args:
            record_type: One of the keys of ``self.templates``.
            use_gemini: Prefer the Gemini backend when it is available.

        Returns:
            Dict with ``id``, ``type``, ``text``, ``timestamp`` and ``source``.

        Raises:
            ValueError: Unknown ``record_type``.
            RuntimeError: Every generation backend failed.
        """
        if record_type not in self.templates:
            raise ValueError(f"Unknown record type: {record_type}")

        template = self.templates[record_type]
        content = None
        # BUGFIX: the original labeled "source" by which backend was *requested*
        # (e.g. "Hugging Face" even when the template fallback produced the
        # text). Track the backend that actually succeeded instead.
        source = None

        if use_gemini and self.gemini_model:
            try:
                content = self._generate_with_gemini(template)
                source = "Gemini"
                self._log.info("Successfully generated record using Gemini")
            except Exception as e:
                self._log.error(f"Gemini generation failed: {str(e)}")
                content = None

        if content is None and self.hf_generator:
            try:
                content = self._generate_with_huggingface(template)
                source = "Hugging Face"
                self._log.info("Successfully generated record using Hugging Face")
            except Exception as e:
                self._log.error(f"Hugging Face generation failed: {str(e)}")
                content = None

        if content is None:
            try:
                content = self._generate_with_template(template)
                source = "Template"
                self._log.info("Successfully generated record using template")
            except Exception as e:
                self._log.error(f"Template generation failed: {str(e)}")
                raise RuntimeError("All generation methods failed") from e

        return {
            "id": self._generate_id(),
            "type": record_type,
            "text": content,
            "timestamp": datetime.now().isoformat(),
            "source": source,
        }

    def _generate_with_gemini(self, template: str) -> str:
        """Generate record text with the Gemini API, guided by *template*.

        Any client/API error propagates to the caller, which falls back to
        the next backend.
        """
        prompt = f"""
Generate a realistic but completely fictional medical record using this template:

{template}

Requirements:
- Use fictional patient names and details
- Include medically accurate terminology
- Make it realistic but not based on any real patient
- Include specific medical details and measurements
- Follow standard medical documentation format
"""
        response = self.gemini_model.generate_content(prompt)
        return response.text

    def _generate_with_huggingface(self, template: str) -> str:
        """Generate text with the local HF pipeline, seeded by a filled template.

        Errors propagate to the caller so generate_record() can fall back to
        (and correctly label) the template backend.
        """
        seed_text = self._fill(template, self._fake_values())
        # Use the start of the filled template as the generation prompt.
        prompt = seed_text[:100] + "..."
        generated = self.hf_generator(
            prompt,
            max_length=400,
            num_return_sequences=1,
            pad_token_id=50256,  # GPT-2 EOS token reused as padding
            truncation=True,
        )
        return generated[0]['generated_text']

    def _generate_with_template(self, template: str) -> str:
        """Fallback: fill the template with random fictional values."""
        try:
            return self._fill(template, self._fake_values())
        except Exception as e:
            self._log.error(f"Template generation failed: {str(e)}")
            raise

    def _fake_values(self) -> Dict[str, object]:
        """Return a fresh mapping of template slot names to random fictional values.

        Shared by the Hugging Face and template backends (the original
        duplicated this dict in both methods).
        """
        patient_name = random.choice([
            "John Smith", "Jane Doe", "Robert Johnson", "Mary Wilson", "Emily Clark",
            "Michael Brown", "Linda Lee", "David Kim", "Sarah Patel", "James Chen",
        ])
        return {
            "patient_name": patient_name,
            "age": random.randint(18, 90),
            "gender": random.choice(["Male", "Female", "Other"]),
            "chief_complaint": random.choice([
                "chest pain", "shortness of breath", "abdominal pain", "headache",
                "fever", "fatigue", "dizziness", "back pain", "cough", "palpitations",
            ]),
            "blood_pressure": f"{random.randint(110, 160)}/{random.randint(60, 100)}",
            "heart_rate": random.randint(55, 120),
            "temperature": round(random.uniform(97.0, 104.0), 1),
            "diagnosis": random.choice([
                "Hypertension", "Type 2 Diabetes", "Pneumonia", "Migraine",
                "Gastroenteritis", "Anxiety", "Asthma", "COVID-19", "Anemia", "Hyperlipidemia",
            ]),
            "date": time.strftime("%Y-%m-%d"),
            "address": random.choice([
                "123 Main St", "456 Oak Ave", "789 Pine Rd", "101 Maple Dr", "202 Elm St",
            ]),
            "phone": f"({random.randint(200, 999)})-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            # BUGFIX: the original choice list was empty, so random.choice([])
            # raised IndexError on every template fill. Use fictional
            # RFC 2606 example.com addresses.
            "email": random.choice([
                "john.smith@example.com", "jane.doe@example.com",
                "patient01@example.com", "contact@example.com",
            ]),
            # BUGFIX: the templates previously embedded raw expressions such as
            # {random.randint(4,11)} that the .replace()-based filler could
            # never substitute, leaving literal "{random...}" in the output.
            # They are now real named slots, filled per record below.
            "wbc": random.randint(4, 11),
            "rbc": round(random.uniform(4.0, 5.5), 2),
            "hemoglobin": round(random.uniform(12.0, 16.0), 1),
            "platelets": random.randint(150, 450),
            "glucose": random.randint(70, 140),
            "bun": random.randint(7, 20),
            "creatinine": round(random.uniform(0.6, 1.2), 2),
            "medication": random.choice([
                "Amoxicillin", "Lisinopril", "Metformin", "Atorvastatin", "Albuterol",
            ]),
            "dose": random.choice(["1 tablet", "2 tablets", "1 capsule"]),
            "frequency": random.choice(["daily", "twice daily", "three times daily"]),
            "rx_quantity": random.randint(30, 90),
            "refills": random.randint(0, 3),
            "relationship": random.choice(["Spouse", "Parent", "Sibling"]),
            "emergency_contact_name": f"{random.choice(['Spouse', 'Parent', 'Sibling'])} {patient_name.split()[0]}",
            "insurance_provider": random.choice([
                "Blue Cross", "Aetna", "United Healthcare", "Cigna",
            ]),
            "policy_number": random.randint(100000000, 999999999),
            "group_number": random.randint(10000, 99999),
            "current_medications": random.choice(["None", "Aspirin", "Metformin", "Lisinopril"]),
            "allergies": random.choice(["None", "Penicillin", "Sulfa", "Peanuts"]),
        }

    @staticmethod
    def _fill(template: str, values: Dict[str, object]) -> str:
        """Substitute every ``{slot}`` in *template* with its value."""
        filled = template
        for key, value in values.items():
            filled = filled.replace(f"{{{key}}}", str(value))
        return filled

    def batch_generate(self, record_type: str, count: int = 10, use_gemini: bool = False) -> List[Dict]:
        """Generate *count* records, skipping (but logging) any failures.

        Returns:
            List of record dicts; may be shorter than *count* on errors.
        """
        records = []
        for i in range(count):
            try:
                records.append(self.generate_record(record_type, use_gemini))
                # Respect Gemini free-tier API limits.
                if use_gemini:
                    time.sleep(1)
                self._log.info(f"Generated record {i+1}/{count}")
            except Exception as e:
                self._log.error(f"Failed to generate record {i+1}: {str(e)}")
                continue
        return records

    def _generate_id(self) -> str:
        """Return a unique-ish record ID: SYN-<unix time>-<4 random digits>."""
        return f"SYN-{int(time.time())}-{random.randint(1000, 9999)}"

    def _get_clinical_note_template(self) -> str:
        """Template for a short outpatient clinical note."""
        return """
CLINICAL NOTE

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Chief Complaint:
{chief_complaint}

Vital Signs:
- Blood Pressure: {blood_pressure} mmHg
- Heart Rate: {heart_rate} bpm
- Temperature: {temperature}°F

Assessment:
{diagnosis}

Plan:
1. Follow-up in 2 weeks
2. Continue current medications
3. Monitor symptoms

Provider: Dr. Smith
"""

    def _get_discharge_summary_template(self) -> str:
        """Template for an inpatient discharge summary."""
        return """
DISCHARGE SUMMARY

Patient: {patient_name}
Age: {age}
Gender: {gender}
Admission Date: {date}
Discharge Date: {date}

Reason for Admission:
{chief_complaint}

Hospital Course:
Patient was admitted for {chief_complaint}. During hospitalization, patient was treated with appropriate medications and showed improvement.

Final Diagnosis:
{diagnosis}

Discharge Medications:
1. Medication A - 1 tablet daily
2. Medication B - 2 tablets twice daily

Follow-up:
- Primary Care Provider: Dr. Johnson
- Appointment: 2 weeks from discharge

Discharge Instructions:
1. Take medications as prescribed
2. Follow up with primary care provider
3. Call if symptoms worsen

Discharging Provider: Dr. Smith
"""

    def _get_lab_report_template(self) -> str:
        """Template for a basic CBC + metabolic panel lab report."""
        return """
LABORATORY REPORT

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Test Results:

Complete Blood Count (CBC):
- White Blood Cells: {wbc} K/uL
- Red Blood Cells: {rbc} M/uL
- Hemoglobin: {hemoglobin} g/dL
- Platelets: {platelets} K/uL

Basic Metabolic Panel:
- Glucose: {glucose} mg/dL
- BUN: {bun} mg/dL
- Creatinine: {creatinine} mg/dL

Interpretation:
Results are within normal limits.

Lab Director: Dr. Wilson
"""

    def _get_prescription_template(self) -> str:
        """Template for an outpatient prescription."""
        return """
PRESCRIPTION

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Prescription:
{diagnosis} - {medication}

Dosage: {dose} {frequency}

Quantity: {rx_quantity} tablets

Refills: {refills}

Prescribing Provider: Dr. Smith
DEA Number: AB1234567
"""

    def _get_patient_intake_template(self) -> str:
        """Template for a new-patient intake form."""
        return """
PATIENT INTAKE FORM

Personal Information:
Name: {patient_name}
Age: {age}
Gender: {gender}
Address: {address}
Phone: {phone}
Email: {email}

Emergency Contact:
Name: {emergency_contact_name}
Phone: {phone}
Relationship: {relationship}

Insurance Information:
Provider: {insurance_provider}
Policy Number: {policy_number}
Group Number: {group_number}

Medical History:
Chief Complaint: {chief_complaint}
Current Medications: {current_medications}
Allergies: {allergies}

Vital Signs:
Blood Pressure: {blood_pressure} mmHg
Heart Rate: {heart_rate} bpm
Temperature: {temperature}°F

Intake Date: {date}
Intake Provider: Dr. Smith
"""
429
+
430
def main():
    """Test the generator"""
    demo = MedicalTextGenerator()

    # Exercise every supported record type once.
    for kind in demo.templates:
        print(f"\nGenerating {kind}...")
        produced = demo.generate_record(kind)
        print(json.dumps(produced, indent=2))


if __name__ == "__main__":
    main()
src/streamlit_app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# Interactive controls for the spiral shape.
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

# Parametric spiral: radius grows linearly with the angle parameter.
indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

# Color encodes position along the spiral; point size is random jitter.
spiral_chart = (
    alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    )
)
st.altair_chart(spiral_chart)
src/web/index.html ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Synthex Medical Text Generator</title>
7
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
8
+ <style>
9
+ body {
10
+ padding: 20px;
11
+ background-color: #f8f9fa;
12
+ }
13
+ .container {
14
+ max-width: 800px;
15
+ background-color: white;
16
+ padding: 30px;
17
+ border-radius: 10px;
18
+ box-shadow: 0 0 10px rgba(0,0,0,0.1);
19
+ }
20
+ .result-box {
21
+ background-color: #f8f9fa;
22
+ padding: 15px;
23
+ border-radius: 5px;
24
+ margin-top: 20px;
25
+ white-space: pre-wrap;
26
+ }
27
+ .loading {
28
+ display: none;
29
+ text-align: center;
30
+ margin: 20px 0;
31
+ }
32
+ </style>
33
+ </head>
34
+ <body>
35
+ <div class="container">
36
+ <h1 class="mb-4">Synthex Medical Text Generator</h1>
37
+
38
+ <div class="mb-3">
39
+ <label for="recordType" class="form-label">Record Type</label>
40
+ <select class="form-select" id="recordType">
41
+ <option value="clinical_note">Clinical Note</option>
42
+ <option value="discharge_summary">Discharge Summary</option>
43
+ <option value="lab_report">Lab Report</option>
44
+ <option value="prescription">Prescription</option>
45
+ <option value="patient_intake">Patient Intake</option>
46
+ </select>
47
+ </div>
48
+
49
+ <div class="mb-3">
50
+ <label for="quantity" class="form-label">Quantity</label>
51
+ <input type="number" class="form-control" id="quantity" value="1" min="1" max="10">
52
+ </div>
53
+
54
+ <div class="mb-3 form-check">
55
+ <input type="checkbox" class="form-check-input" id="useGemini">
56
+ <label class="form-check-label" for="useGemini">Use Gemini (if available)</label>
57
+ </div>
58
+
59
+ <div class="mb-3 form-check">
60
+ <input type="checkbox" class="form-check-input" id="includeMetadata" checked>
61
+ <label class="form-check-label" for="includeMetadata">Include Metadata</label>
62
+ </div>
63
+
64
+ <button class="btn btn-primary" onclick="generateRecords()">Generate Records</button>
65
+
66
+ <div class="loading" id="loading">
67
+ <div class="spinner-border text-primary" role="status">
68
+ <span class="visually-hidden">Loading...</span>
69
+ </div>
70
+ <p class="mt-2">Generating records...</p>
71
+ </div>
72
+
73
+ <div id="result" class="result-box"></div>
74
+ </div>
75
+
76
+ <script>
77
+ async function generateRecords() {
78
+ const recordType = document.getElementById('recordType').value;
79
+ const quantity = parseInt(document.getElementById('quantity').value);
80
+ const useGemini = document.getElementById('useGemini').checked;
81
+ const includeMetadata = document.getElementById('includeMetadata').checked;
82
+
83
+ // Show loading
84
+ document.getElementById('loading').style.display = 'block';
85
+ document.getElementById('result').innerHTML = '';
86
+
87
+ try {
88
+ const response = await fetch('/generate', {
89
+ method: 'POST',
90
+ headers: {
91
+ 'Content-Type': 'application/json',
92
+ 'Accept': 'application/json'
93
+ },
94
+ body: JSON.stringify({
95
+ record_type: recordType,
96
+ quantity: quantity,
97
+ use_gemini: useGemini,
98
+ include_metadata: includeMetadata
99
+ })
100
+ });
101
+
102
+ const data = await response.json();
103
+
104
+ // Format and display results
105
+ let resultHtml = '<h3>Generated Records:</h3>';
106
+ data.records.forEach(record => {
107
+ resultHtml += `
108
+ <div class="mb-4">
109
+ <strong>ID:</strong> ${record.id}<br>
110
+ <strong>Type:</strong> ${record.type}<br>
111
+ <strong>Source:</strong> ${record.source}<br>
112
+ <strong>Timestamp:</strong> ${record.timestamp}<br>
113
+ <strong>Text:</strong><br>
114
+ <pre>${record.text}</pre>
115
+ </div>
116
+ `;
117
+ });
118
+ document.getElementById('result').innerHTML = resultHtml;
119
+ } catch (error) {
120
+ document.getElementById('result').innerHTML = `<div class="alert alert-danger">Error: ${error.message}</div>`;
121
+ } finally {
122
+ document.getElementById('loading').style.display = 'none';
123
+ }
124
+ }
125
+ </script>
126
+ </body>
127
+ </html>
streamlit_app.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generator - MVP Streamlit App
3
+ Deploy this on Hugging Face Spaces for free hosting
4
+ """
5
+
6
+ import streamlit as st
7
+ import json
8
+ import time
9
+ from datetime import datetime
10
+ import pandas as pd
11
+ import os
12
+ import sys
13
+ import logging
14
+
15
+ # Setup logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Add src directory to Python path
20
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
21
+
22
+ # Import the medical generator
23
+ from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY
24
+
25
+ # Page config
26
+ st.set_page_config(
27
+ page_title="Synthex Medical Text Generator",
28
+ page_icon="🏥",
29
+ layout="wide",
30
+ initial_sidebar_state="expanded"
31
+ )
32
+
33
+ # Custom CSS
34
+ st.markdown("""
35
+ <style>
36
+ .main-header {
37
+ font-size: 3rem;
38
+ font-weight: bold;
39
+ color: #1f77b4;
40
+ text-align: center;
41
+ margin-bottom: 2rem;
42
+ }
43
+ .sub-header {
44
+ font-size: 1.5rem;
45
+ color: #666;
46
+ text-align: center;
47
+ margin-bottom: 3rem;
48
+ }
49
+ .record-container {
50
+ background-color: #f8f9fa;
51
+ padding: 1rem;
52
+ border-radius: 0.5rem;
53
+ border-left: 4px solid #1f77b4;
54
+ margin: 1rem 0;
55
+ }
56
+ .stats-container {
57
+ background-color: #e8f4fd;
58
+ padding: 1rem;
59
+ border-radius: 0.5rem;
60
+ margin: 1rem 0;
61
+ }
62
+ </style>
63
+ """, unsafe_allow_html=True)
64
+
65
+ # Initialize session state
66
+ if 'generated_records' not in st.session_state:
67
+ st.session_state.generated_records = []
68
+ if 'total_generated' not in st.session_state:
69
+ st.session_state.total_generated = 0
70
+ if 'generator' not in st.session_state:
71
+ st.session_state.generator = None
72
+
73
+ # Header
74
+ st.markdown('<div class="main-header">🏥 Synthex Medical Text Generator</div>', unsafe_allow_html=True)
75
+ st.markdown('<div class="sub-header">Generate synthetic medical records for AI training and testing</div>', unsafe_allow_html=True)
76
+
77
+ # Sidebar
78
+ with st.sidebar:
79
+ st.header("⚙️ Configuration")
80
+
81
+ # API Key input (pre-filled with environment variable if available)
82
+ gemini_api_key = st.text_input(
83
+ "Gemini API Key",
84
+ value=os.getenv('GEMINI_API_KEY', ''),
85
+ type="password",
86
+ help="Enter your Google Gemini API key for better generation quality"
87
+ )
88
+
89
+ # Record type selection
90
+ record_type = st.selectbox(
91
+ "Select Record Type",
92
+ ["clinical_note", "discharge_summary", "lab_report", "prescription", "patient_intake"],
93
+ format_func=lambda x: x.replace("_", " ").title()
94
+ )
95
+
96
+ # Quantity
97
+ quantity = st.slider("Number of Records", 1, 20, 5)
98
+
99
+ # Generation method
100
+ use_gemini = st.checkbox(
101
+ "Use Gemini API",
102
+ value=bool(gemini_api_key), # Only default to True if API key is available
103
+ help="Uses Google Gemini API for better quality generation"
104
+ )
105
+
106
+ # Advanced options
107
+ with st.expander("Advanced Options"):
108
+ include_metadata = st.checkbox("Include Metadata", value=True)
109
+ export_format = st.selectbox("Export Format", ["JSON", "CSV", "TXT"])
110
+
111
+ # Main content
112
+ col1, col2 = st.columns([2, 1])
113
+
114
+ with col1:
115
+ st.header("📝 Generate Medical Records")
116
+
117
+ # Generation button
118
+ if st.button("🚀 Generate Records", type="primary", use_container_width=True):
119
+
120
+ # Initialize generator if not already done
121
+ if st.session_state.generator is None:
122
+ try:
123
+ with st.spinner("Initializing medical text generator..."):
124
+ st.session_state.generator = MedicalTextGenerator(gemini_api_key=gemini_api_key)
125
+ except Exception as e:
126
+ st.error(f"Error initializing generator: {str(e)}")
127
+ st.stop()
128
+
129
+ # Generate records
130
+ progress_bar = st.progress(0)
131
+ status_text = st.empty()
132
+
133
+ generated_records = []
134
+
135
+ for i in range(quantity):
136
+ status_text.text(f"Generating record {i+1} of {quantity}...")
137
+ progress_bar.progress((i + 1) / quantity)
138
+
139
+ try:
140
+ record = st.session_state.generator.generate_record(record_type, use_gemini=use_gemini)
141
+ generated_records.append(record)
142
+
143
+ # Rate limiting
144
+ if use_gemini:
145
+ time.sleep(1)
146
+
147
+ except Exception as e:
148
+ logger.error(f"Failed to generate record {i+1}: {str(e)}")
149
+ st.error(f"Failed to generate record {i+1}: {str(e)}")
150
+ continue
151
+
152
+ # Update session state
153
+ if generated_records:
154
+ st.session_state.generated_records.extend(generated_records)
155
+ st.session_state.total_generated += len(generated_records)
156
+
157
+ status_text.text("✅ Generation complete!")
158
+ progress_bar.progress(1.0)
159
+
160
+ st.success(f"Successfully generated {len(generated_records)} medical records!")
161
+
162
+ # Display generated records
163
+ if st.session_state.generated_records:
164
+ st.header("📋 Generated Records")
165
+
166
+ # Filters
167
+ col_filter1, col_filter2 = st.columns(2)
168
+ with col_filter1:
169
+ filter_type = st.selectbox(
170
+ "Filter by Type",
171
+ ["All"] + list(set([r['type'] for r in st.session_state.generated_records]))
172
+ )
173
+ with col_filter2:
174
+ records_per_page = st.selectbox("Records per page", [5, 10, 20, 50])
175
+
176
+ # Filter records
177
+ filtered_records = st.session_state.generated_records
178
+ if filter_type != "All":
179
+ filtered_records = [r for r in filtered_records if r['type'] == filter_type]
180
+
181
+ # Pagination
182
+ total_records = len(filtered_records)
183
+ total_pages = (total_records - 1) // records_per_page + 1
184
+
185
+ if total_pages > 1:
186
+ page = st.selectbox("Page", range(1, total_pages + 1))
187
+ start_idx = (page - 1) * records_per_page
188
+ end_idx = start_idx + records_per_page
189
+ page_records = filtered_records[start_idx:end_idx]
190
+ else:
191
+ page_records = filtered_records
192
+
193
+ # Display records
194
+ for i, record in enumerate(page_records):
195
+ with st.expander(f"Record {record['id']} - {record['type'].replace('_', ' ').title()}"):
196
+ if include_metadata:
197
+ col_meta1, col_meta2, col_meta3 = st.columns(3)
198
+ with col_meta1:
199
+ st.metric("Type", record['type'].replace('_', ' ').title())
200
+ with col_meta2:
201
+ st.metric("Generated", record['timestamp'])
202
+ with col_meta3:
203
+ st.metric("Source", record['source'])
204
+
205
+ st.markdown('<div class="record-container">', unsafe_allow_html=True)
206
+ st.text_area("Content", record['text'], height=200, key=f"record_{i}")
207
+ st.markdown('</div>', unsafe_allow_html=True)
208
+
209
+ with col2:
210
+ st.header("📊 Statistics")
211
+
212
+ # Stats container
213
+ st.markdown('<div class="stats-container">', unsafe_allow_html=True)
214
+
215
+ # Total records
216
+ st.metric("Total Records Generated", st.session_state.total_generated)
217
+
218
+ # Record type distribution
219
+ if st.session_state.generated_records:
220
+ type_counts = pd.Series([r['type'] for r in st.session_state.generated_records]).value_counts()
221
+ st.subheader("Record Type Distribution")
222
+ st.bar_chart(type_counts)
223
+
224
+ # Export options
225
+ st.subheader("Export Data")
226
+ if st.session_state.generated_records:
227
+ if export_format == "JSON":
228
+ json_str = json.dumps(st.session_state.generated_records, indent=2)
229
+ st.download_button(
230
+ "Download JSON",
231
+ json_str,
232
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
233
+ mime="application/json"
234
+ )
235
+ elif export_format == "CSV":
236
+ df = pd.DataFrame(st.session_state.generated_records)
237
+ csv = df.to_csv(index=False)
238
+ st.download_button(
239
+ "Download CSV",
240
+ csv,
241
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
242
+ mime="text/csv"
243
+ )
244
+ elif export_format == "TXT":
245
+ txt = "\n\n".join([f"Record {r['id']} ({r['type']}):\n{r['text']}" for r in st.session_state.generated_records])
246
+ st.download_button(
247
+ "Download TXT",
248
+ txt,
249
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
250
+ mime="text/plain"
251
+ )
252
+
253
+ st.markdown('</div>', unsafe_allow_html=True)
templates/index.html ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Synthex - Medical Data Collection and Analysis</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ margin: 0;
11
+ padding: 20px;
12
+ background-color: #f4f4f4;
13
+ }
14
+ .container {
15
+ max-width: 800px;
16
+ margin: 0 auto;
17
+ background: white;
18
+ padding: 20px;
19
+ border-radius: 5px;
20
+ box-shadow: 0 0 10px rgba(0,0,0,0.1);
21
+ }
22
+ h1 {
23
+ color: #333;
24
+ }
25
+ .button {
26
+ display: inline-block;
27
+ padding: 10px 20px;
28
+ margin: 10px 0;
29
+ background-color: #007bff;
30
+ color: white;
31
+ text-decoration: none;
32
+ border-radius: 5px;
33
+ }
34
+ .button:hover {
35
+ background-color: #0056b3;
36
+ }
37
+ .flash {
38
+ padding: 10px;
39
+ margin: 10px 0;
40
+ border-radius: 5px;
41
+ }
42
+ .flash.success {
43
+ background-color: #d4edda;
44
+ color: #155724;
45
+ }
46
+ .flash.error {
47
+ background-color: #f8d7da;
48
+ color: #721c24;
49
+ }
50
+ </style>
51
+ </head>
52
+ <body>
53
+ <div class="container">
54
+ <h1>Synthex - Medical Data Collection and Analysis</h1>
55
+ {% with messages = get_flashed_messages(with_categories=true) %}
56
+ {% if messages %}
57
+ {% for category, message in messages %}
58
+ <div class="flash {{ category }}">{{ message }}</div>
59
+ {% endfor %}
60
+ {% endif %}
61
+ {% endwith %}
62
+ <a href="{{ url_for('collect_data') }}" class="button">Collect Data</a>
63
+ <a href="{{ url_for('analyze_data') }}" class="button">Analyze Data</a>
64
+ </div>
65
+ </body>
66
+ </html>
test_dataset.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+
6
def test_medical_dataset():
    """Smoke-test loading the medical_questions_pairs dataset from the Hub.

    Loads a 100-row slice, prints one record so the field layout is visible,
    and returns True on success / False on any failure (errors are printed,
    not raised — this is a diagnostic script).
    """
    try:
        # A small split keeps the check fast while still exercising the Hub.
        sample = load_dataset("medical_questions_pairs", split="train[:100]")
        print(f"Successfully loaded {len(sample)} samples from medical_questions_pairs")

        # Show one record so the schema can be eyeballed.
        print("\nSample structure:")
        print(json.dumps(sample[0], indent=2))

        return True
    except Exception as err:
        print(f"Error loading dataset: {str(err)}")
        return False
20
+
21
def verify_data_directory(data_dir="data/raw"):
    """Ensure the raw-data directory exists and report the JSON files in it.

    Creates the directory (including parents) if missing, then lists the
    top-level ``*.json`` files it contains.

    Args:
        data_dir: Directory to check/create. Defaults to ``data/raw`` so
            existing zero-argument callers behave exactly as before.

    Returns:
        list[pathlib.Path]: The JSON files found (empty if none). The
        original returned None; returning the list is backward-compatible
        and makes the function testable.
    """
    data_dir = Path(data_dir)
    if not data_dir.exists():
        print(f"Creating data directory: {data_dir}")
        data_dir.mkdir(parents=True, exist_ok=True)

    # Non-recursive scan: only top-level *.json files count as raw data here.
    json_files = list(data_dir.glob("*.json"))
    if json_files:
        print(f"\nFound {len(json_files)} JSON files in {data_dir}:")
        for file in json_files:
            print(f"- {file.name}")
    else:
        print(f"\nNo JSON files found in {data_dir} directory")
    return json_files
35
+
36
def main():
    """Run the Hub-loading smoke test, then verify the local data layout."""
    print("Testing Hugging Face dataset loading...")
    test_medical_dataset()

    print("\nVerifying data directory structure...")
    verify_data_directory()


if __name__ == "__main__":
    main()
test_pubmed.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ from bs4 import BeautifulSoup
4
+ import logging
5
+
6
# Setup logging
# Configure the root logger once at import time so INFO-level messages
# from this script are emitted (default handler writes to stderr).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-scoped logger, standard convention
9
+
10
def test_pubmed_search():
    """Smoke-test the NCBI E-utilities PubMed search and fetch endpoints.

    Searches PubMed for a fixed query via esearch, then fetches the first
    returned article via efetch (XML) and logs its title and abstract.
    All failures are logged rather than raised — this is a diagnostic
    script, not a library function.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    search_url = f"{base_url}esearch.fcgi"

    # Test query
    query = "clinical notes"

    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": 10,  # Just get 10 results for testing
        "retmode": "json",
        "sort": "relevance"
    }

    logger.info(f"Testing PubMed search with query: {query}")
    logger.info(f"Search URL: {search_url}")
    logger.info(f"Search params: {search_params}")

    try:
        # FIX: always bound network calls — requests.get without a timeout
        # can block the script forever if NCBI stalls.
        response = requests.get(search_url, params=search_params, timeout=30)
        response.raise_for_status()
        search_results = response.json()

        logger.info(f"Response status code: {response.status_code}")
        logger.info(f"Response headers: {dict(response.headers)}")
        logger.info(f"Search results: {json.dumps(search_results, indent=2)}")

        if "esearchresult" in search_results:
            id_list = search_results["esearchresult"]["idlist"]
            logger.info(f"Found {len(id_list)} article IDs")

            # Test fetching one article
            if id_list:
                test_id = id_list[0]
                fetch_url = f"{base_url}efetch.fcgi"
                fetch_params = {
                    "db": "pubmed",
                    "id": test_id,
                    "retmode": "xml"
                }

                logger.info(f"\nTesting article fetch for ID: {test_id}")
                logger.info(f"Fetch URL: {fetch_url}")
                logger.info(f"Fetch params: {fetch_params}")

                response = requests.get(fetch_url, params=fetch_params, timeout=30)
                response.raise_for_status()

                logger.info(f"Fetch response status code: {response.status_code}")
                logger.info(f"Fetch response headers: {dict(response.headers)}")
                logger.info(f"First 500 chars of response: {response.text[:500]}")

                # FIX: efetch returns XML, but the 'lxml' feature selects the
                # HTML parser, which lowercases tag names — so case-sensitive
                # lookups like find('PubmedArticle') always returned None.
                # 'lxml-xml' selects lxml's XML parser, preserving tag case.
                soup = BeautifulSoup(response.text, 'lxml-xml')
                article = soup.find('PubmedArticle')

                if article:
                    logger.info("\nArticle structure:")
                    # Look each element up once instead of twice per line.
                    title = article.find('ArticleTitle')
                    abstract = article.find('Abstract')
                    logger.info(f"Title: {title.get_text() if title else 'Not found'}")
                    logger.info(f"Abstract: {abstract.get_text()[:200] + '...' if abstract else 'Not found'}")
                else:
                    logger.error("No PubmedArticle found in response")

    except Exception as e:
        logger.error(f"Error during test: {str(e)}", exc_info=True)
76
+
77
# Entry point: run the diagnostic only when executed directly, not on import.
if __name__ == "__main__":
    test_pubmed_search()
web_app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, redirect, url_for, flash
2
+ import subprocess
3
+ import os
4
+
5
app = Flask(__name__)
# SECURITY FIX: read the session-signing key from the environment instead of
# hard-coding it. The original literal is kept as the fallback so local
# development still works unchanged; set SECRET_KEY in production.
app.secret_key = os.environ.get("SECRET_KEY", "your_secret_key")  # Required for flashing messages
7
+
8
@app.route('/')
def index():
    """Render the landing page with the collect/analyze action buttons."""
    return render_template('index.html')
11
+
12
@app.route('/collect_data')
def collect_data():
    """Run setup_data.py as a subprocess, flash the outcome, go back home.

    NOTE(review): a state-changing action on a GET route is CSRF-prone;
    consider switching to POST if this app is exposed beyond local use.
    """
    import sys  # local import: only needed here to locate the interpreter
    try:
        # FIX: use the interpreter running this app rather than whatever
        # 'python' resolves to on PATH (may be absent or a different version).
        subprocess.run([sys.executable, 'setup_data.py'], check=True)
        flash('Data collection completed successfully!', 'success')
    except subprocess.CalledProcessError as e:
        flash(f'Error during data collection: {str(e)}', 'error')
    return redirect(url_for('index'))
20
+
21
@app.route('/analyze_data')
def analyze_data():
    """Run analyze_data_quality.py as a subprocess, flash the outcome, go home.

    NOTE(review): a state-changing action on a GET route is CSRF-prone;
    consider switching to POST if this app is exposed beyond local use.
    """
    import sys  # local import: only needed here to locate the interpreter
    try:
        # FIX: use the interpreter running this app rather than whatever
        # 'python' resolves to on PATH (may be absent or a different version).
        subprocess.run([sys.executable, 'analyze_data_quality.py'], check=True)
        flash('Data analysis completed successfully!', 'success')
    except subprocess.CalledProcessError as e:
        flash(f'Error during data analysis: {str(e)}', 'error')
    return redirect(url_for('index'))
29
+
30
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug debugger and reloader —
    # fine for local development, unsafe if this server is publicly reachable.
    app.run(debug=True)