theaniketgiri commited on
Commit
32519eb
·
0 Parent(s):

Initial commit to Hugging Face Space

Browse files
.dockerignore ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ synthex_env/
25
+ venv/
26
+ ENV/
27
+
28
+ # IDE
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+
34
+ # Git
35
+ .git
36
+ .gitignore
37
+
38
+ # Data
39
+ data/raw/*
40
+ data/processed/*
41
+ data/synthetic/*
42
+ !data/raw/.gitkeep
43
+ !data/processed/.gitkeep
44
+ !data/synthetic/.gitkeep
45
+
46
+ # Logs
47
+ *.log
48
+
49
+ # Local development
50
+ .env
51
+ .env.local
52
+ .env.*.local
53
+
54
+ # Docker
55
+ Dockerfile
56
+ .dockerignore
57
+
58
+ # Misc
59
+ .DS_Store
60
+ Thumbs.db
.gitignore ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ ENV/
27
+ .env
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+
35
+ # Project specific
36
+ data/raw/
37
+ data/generated/
38
+ *.log
39
+ .DS_Store
40
+ .coverage
41
+ htmlcov/
42
+ .pytest_cache/
43
+
44
+ # Hugging Face
45
+ .huggingface/
46
+ .hf/
47
+
48
+ # Docker
49
+ .docker/
50
+ docker-compose.override.yml
51
+
52
+ # Security
53
+ *.pem
54
+ *.key
55
+ *.cert
56
+
57
+ # Large files
58
+ *.json
59
+ *.csv
60
+ *.xlsx
61
+ *.xls
62
+ *.db
63
+ *.sqlite
64
+ *.h5
65
+ *.pkl
66
+ *.model
67
+ *.bin
68
+ *.pt
69
+ *.pth
70
+ *.onnx
71
+
72
+ # Python
73
+ __pycache__/
74
+ *.py[cod]
75
+ *$py.class
76
+ *.so
77
+ .Python
78
+ build/
79
+ develop-eggs/
80
+ dist/
81
+ downloads/
82
+ eggs/
83
+ .eggs/
84
+ lib/
85
+ lib64/
86
+ parts/
87
+ sdist/
88
+ var/
89
+ wheels/
90
+ *.egg-info/
91
+ .installed.cfg
92
+ *.egg
93
+
94
+ # Virtual Environment
95
+ venv/
96
+ env/
97
+ ENV/
98
+ .env
99
+
100
+ # IDE
101
+ .idea/
102
+ .vscode/
103
+ *.swp
104
+ *.swo
105
+
106
+ # Project specific
107
+ data/generated/
108
+ *.log
109
+ .DS_Store
110
+ .coverage
111
+ htmlcov/
112
+ .pytest_cache/
113
+
114
+ # Hugging Face
115
+ .huggingface/
116
+ .hf/
117
+
118
+ # Docker
119
+ .docker/
120
+ docker-compose.override.yml
121
+
122
+ # Security
123
+ *.pem
124
+ *.key
125
+ *.cert
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python image
2
+ FROM python:3.9-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && apt-get install -y \
9
+ build-essential \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Copy requirements first to leverage Docker cache
13
+ COPY requirements.txt .
14
+ RUN pip install --no-cache-dir -r requirements.txt
15
+
16
+ # Copy the rest of the application
17
+ COPY . .
18
+
19
+ # Create necessary directories
20
+ RUN mkdir -p src/web
21
+
22
+ # Expose the port
23
+ EXPOSE 8000
24
+
25
+ # Set environment variables
26
+ ENV PYTHONPATH=/app
27
+ ENV PORT=8000
28
+
29
+ # Command to run the application
30
+ CMD ["uvicorn", "src.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Synthex AI - Commercial License
2
+
3
+ Copyright (c) 2024 Synthex AI
4
+
5
+ This software and associated documentation files (the "Software") are proprietary and confidential.
6
+ The Software is protected by copyright laws and international copyright treaties, as well as other
7
+ intellectual property laws and treaties.
8
+
9
+ TERMS AND CONDITIONS
10
+
11
+ 1. License Grant
12
+ This license grants you a limited, non-exclusive, non-transferable license to use the Software
13
+ solely for your internal business purposes, subject to the terms and conditions of this Agreement.
14
+
15
+ 2. Restrictions
16
+ You may not:
17
+ - Copy, modify, or create derivative works of the Software
18
+ - Reverse engineer, decompile, or disassemble the Software
19
+ - Remove or alter any proprietary notices or labels on the Software
20
+ - Use the Software for any illegal purpose
21
+ - Transfer, sublicense, or resell the Software
22
+
23
+ 3. Proprietary Rights
24
+ The Software and all copies, modifications, and derivative works are owned by Synthex AI and
25
+ are protected by copyright, trade secret, and other intellectual property laws.
26
+
27
+ 4. Confidentiality
28
+ You agree to maintain the confidentiality of the Software and not disclose it to any third party
29
+ without Synthex AI's prior written consent.
30
+
31
+ 5. Warranty Disclaimer
32
+ THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. SYNTHEX AI DISCLAIMS ALL
33
+ WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34
+ FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT.
35
+
36
+ 6. Limitation of Liability
37
+ IN NO EVENT SHALL SYNTHEX AI BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY ARISING FROM,
38
+ OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
39
+
40
+ 7. Termination
41
+ This license is effective until terminated. Your rights under this license will terminate
42
+ automatically without notice if you fail to comply with any of its terms.
43
+
44
+ 8. Governing Law
45
+ This Agreement shall be governed by and construed in accordance with the laws of the State of
46
+ Delaware, without regard to its conflict of law provisions.
47
+
48
+ 9. Contact Information
49
+ For licensing inquiries, please contact:
50
+ Synthex AI
51
52
+ Website: https://synthex.ai
53
+
54
+ By using the Software, you acknowledge that you have read this Agreement, understand it, and agree
55
+ to be bound by its terms and conditions.
README.md ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Synthex AI - Medical Text Generation Platform
2
+
3
+ ![Synthex AI](https://img.shields.io/badge/Synthex-AI-blue)
4
+ ![Version](https://img.shields.io/badge/version-1.0.0-green)
5
+ ![License](https://img.shields.io/badge/license-MIT-blue)
6
+
7
+ > Synthex AI is a cutting-edge platform that generates HIPAA-compliant synthetic medical records for healthcare AI development, testing, and research.
8
+
9
+ ## 🏢 Enterprise Solution
10
+
11
+ Synthex AI provides enterprise-grade synthetic medical data generation with:
12
+
13
+ - **HIPAA Compliance**: All generated data is synthetic and compliant with healthcare regulations
14
+ - **Enterprise Security**: SOC 2 Type II certified infrastructure
15
+ - **Custom Solutions**: Tailored generation for specific medical domains
16
+ - **API Access**: RESTful API for integration with existing systems
17
+ - **Dedicated Support**: 24/7 enterprise support and SLAs
18
+
19
+ ## 💼 Use Cases
20
+
21
+ ### Healthcare AI Development
22
+ - Train and test AI models without real patient data
23
+ - Generate diverse medical scenarios for model validation
24
+ - Create synthetic datasets for research and development
25
+
26
+ ### Medical Software Testing
27
+ - Test EHR systems with realistic synthetic data
28
+ - Validate clinical decision support systems
29
+ - QA medical software with diverse patient scenarios
30
+
31
+ ### Healthcare Research
32
+ - Conduct research with privacy-compliant data
33
+ - Generate synthetic datasets for medical studies
34
+ - Test hypotheses without patient privacy concerns
35
+
36
+ ## 🚀 Features
37
+
38
+ ### Core Features
39
+ - Multiple medical record types:
40
+ - Clinical Notes
41
+ - Discharge Summaries
42
+ - Lab Reports
43
+ - Prescriptions
44
+ - Patient Intake Forms
45
+ - Advanced generation methods:
46
+ - Hugging Face models (default)
47
+ - Google Gemini API (premium)
48
+ - Custom model integration (enterprise)
49
+ - Enterprise-grade UI/UX
50
+ - Multiple export formats (JSON, CSV, TXT)
51
+ - Batch generation capabilities
52
+ - API access (enterprise)
53
+
54
+ ### Enterprise Features
55
+ - Custom model training
56
+ - Domain-specific generation
57
+ - Advanced data validation
58
+ - Integration support
59
+ - Dedicated infrastructure
60
+ - Custom SLAs
61
+
62
+ ## 💰 Pricing
63
+
64
+ ### Free Tier
65
+ - Basic medical record generation
66
+ - Limited to 100 records/month
67
+ - Community support
68
+ - Basic templates
69
+
70
+ ### Pro Plan ($99/month)
71
+ - Up to 10,000 records/month
72
+ - Advanced generation features
73
+ - Priority support
74
+ - API access
75
+ - Custom templates
76
+
77
+ ### Enterprise Plan (Custom)
78
+ - Unlimited generation
79
+ - Custom model training
80
+ - Dedicated support
81
+ - Custom integrations
82
+ - SLA guarantees
83
+ - On-premise deployment
84
+
85
+ ## 🛠️ Technical Details
86
+
87
+ ### Architecture
88
+ ```
89
+ synthex/
90
+ ├── app.py # Main Streamlit application
91
+ ├── src/
92
+ │ ├── generation/ # Core generation logic
93
+ │ ├── api/ # REST API endpoints
94
+ │ ├── validation/ # Data validation
95
+ │ └── enterprise/ # Enterprise features
96
+ ├── data/
97
+ │ └── generated/ # Generated records storage
98
+ ├── tests/ # Test suite
99
+ ├── Dockerfile # Docker configuration
100
+ └── requirements.txt # Python dependencies
101
+ ```
102
+
103
+ ### API Reference
104
+
105
+ ```python
106
+ from synthex import SynthexClient
107
+
108
+ # Initialize client
109
+ client = SynthexClient(api_key="your_api_key")
110
+
111
+ # Generate records
112
+ records = client.generate_records(
113
+ record_type="clinical_note",
114
+ count=100,
115
+ options={
116
+ "include_metadata": True,
117
+ "custom_fields": ["patient_demographics", "vital_signs"]
118
+ }
119
+ )
120
+
121
+ # Export data
122
+ client.export_records(
123
+ records,
124
+ format="json",
125
+ destination="s3://your-bucket/path"
126
+ )
127
+ ```
128
+
129
+ ## 🔒 Security & Compliance
130
+
131
+ - HIPAA Compliance
132
+ - SOC 2 Type II Certification
133
+ - GDPR Compliance
134
+ - Data Encryption at Rest and in Transit
135
+ - Regular Security Audits
136
+ - Access Control and Audit Logging
137
+
138
+ ## 🤝 Enterprise Support
139
+
140
+ - 24/7 Technical Support
141
+ - Dedicated Account Manager
142
+ - Custom Integration Support
143
+ - Training and Onboarding
144
+ - Regular Updates and Maintenance
145
+ - Custom Development Services
146
+
147
+ ## 📞 Contact
148
+
149
+ ### Sales Inquiries
150
+ - Email: [email protected]
151
+ - Phone: +1 (555) 123-4567
152
+ - [Schedule a Demo](https://synthex.ai/demo)
153
+
154
+ ### Technical Support
155
+ - Email: [email protected]
156
+ - [Documentation](https://docs.synthex.ai)
157
+ - [API Reference](https://api.synthex.ai)
158
+
159
+ ## 🌟 Why Choose Synthex AI?
160
+
161
+ 1. **Enterprise-Ready**: Built for scale and security
162
+ 2. **Compliance-First**: HIPAA and GDPR compliant
163
+ 3. **Customizable**: Tailored to your needs
164
+ 4. **Support**: Enterprise-grade support
165
+ 5. **Innovation**: Cutting-edge AI technology
166
+
167
+ ## 🚀 Getting Started
168
+
169
+ ### Quick Start
170
+ ```bash
171
+ # Install Synthex CLI
172
+ pip install synthex
173
+
174
+ # Initialize client
175
+ synthex init
176
+
177
+ # Generate records
178
+ synthex generate --type clinical_note --count 10
179
+ ```
180
+
181
+ ### Docker Deployment
182
+ ```bash
183
+ # Pull image
184
+ docker pull synthex/synthex:latest
185
+
186
+ # Run container
187
+ docker run -p 8501:8501 synthex/synthex
188
+ ```
189
+
190
+ ## 📚 Documentation
191
+
192
+ - [User Guide](https://docs.synthex.ai/guide)
193
+ - [API Documentation](https://docs.synthex.ai/api)
194
+ - [Enterprise Guide](https://docs.synthex.ai/enterprise)
195
+ - [Security Whitepaper](https://docs.synthex.ai/security)
196
+
197
+ ## 🙏 Acknowledgments
198
+
199
+ - Built with [Streamlit](https://streamlit.io/)
200
+ - Powered by [Hugging Face](https://huggingface.co/)
201
+ - Enterprise features by [Google Cloud](https://cloud.google.com/)
202
+
203
+ ---
204
+
205
+ © 2024 Synthex AI. All rights reserved.
206
+
207
+ # Synthex Medical Text Generator
208
+
209
+ A synthetic medical text generator that creates realistic medical records using AI models. The application provides both a FastAPI backend and a Streamlit interface.
210
+
211
+ ## Features
212
+
213
+ - Generate various types of medical records:
214
+ - Clinical Notes
215
+ - Discharge Summaries
216
+ - Lab Reports
217
+ - Prescriptions
218
+ - Patient Intake Forms
219
+ - Support for multiple AI models:
220
+ - Hugging Face models (default)
221
+ - Google Gemini (optional)
222
+ - Two interfaces:
223
+ - FastAPI with HTML frontend
224
+ - Streamlit interface
225
+
226
+ ## API Endpoints
227
+
228
+ - `GET /`: HTML interface
229
+ - `GET /record-types`: List available record types
230
+ - `POST /generate`: Generate medical records
231
+ ```json
232
+ {
233
+ "record_type": "clinical_note",
234
+ "quantity": 1,
235
+ "use_gemini": false,
236
+ "include_metadata": true
237
+ }
238
+ ```
239
+
240
+ ## Deployment
241
+
242
+ ### Local Development
243
+
244
+ 1. Install dependencies:
245
+ ```bash
246
+ pip install -r requirements.txt
247
+ ```
248
+
249
+ 2. Run FastAPI server:
250
+ ```bash
251
+ uvicorn src.api.app:app --reload
252
+ ```
253
+
254
+ 3. Run Streamlit app (optional):
255
+ ```bash
256
+ streamlit run app.py
257
+ ```
258
+
259
+ ### Docker Deployment
260
+
261
+ 1. Build the Docker image:
262
+ ```bash
263
+ docker build -t synthex-medical-generator .
264
+ ```
265
+
266
+ 2. Run the container:
267
+ ```bash
268
+ docker run -p 8000:8000 synthex-medical-generator
269
+ ```
270
+
271
+ ### Hugging Face Spaces Deployment
272
+
273
+ 1. Create a new Space on Hugging Face
274
+ 2. Choose "Docker" as the SDK
275
+ 3. Push this repository to your Space
276
+ 4. The application will be automatically deployed
277
+
278
+ ## Environment Variables
279
+
280
+ - `GEMINI_API_KEY`: Google Gemini API key (optional)
281
+
282
+ ## License
283
+
284
+ MIT License
analyze_data_quality.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ import logging
5
+ from collections import defaultdict
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from typing import Dict, List, Any
9
+ import re
10
+
11
+ # Setup logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
class DataQualityAnalyzer:
    """Analyze text-quality statistics of JSON datasets under ``data_dir``.

    Produces per-dataset aggregates in ``self.stats``, a JSON report under
    ``<data_dir>/../reports/``, and bar/histogram plots under
    ``<data_dir>/../reports/plots/``.
    """

    def __init__(self, data_dir: str = "data/raw"):
        # Directory scanned for *.json dataset files.
        self.data_dir = Path(data_dir)
        # dataset name -> aggregate statistics dict
        self.stats = defaultdict(dict)
        # dataset name -> raw per-item abstract lengths; kept separately so
        # plot_metrics() can draw histograms (self.stats holds averages only).
        self._abstract_lengths = defaultdict(list)

    def load_dataset(self, file_path: Path) -> List[Dict]:
        """Load a dataset from a JSON file; return [] if it cannot be read."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            return []

    def analyze_text_quality(self, text: str) -> Dict[str, Any]:
        """Return basic quality metrics (length, word stats, char classes) for one text."""
        if not text:
            return {
                "length": 0,
                "word_count": 0,
                "avg_word_length": 0,
                "has_numbers": False,
                "has_special_chars": False
            }

        words = text.split()
        return {
            "length": len(text),
            "word_count": len(words),
            "avg_word_length": sum(len(w) for w in words) / len(words) if words else 0,
            "has_numbers": bool(re.search(r'\d', text)),
            # anything outside letters/digits/whitespace/basic punctuation
            "has_special_chars": bool(re.search(r'[^a-zA-Z0-9\s.,!?-]', text))
        }

    @staticmethod
    def _aggregate(metrics: List[Dict[str, Any]]) -> Dict[str, float]:
        """Average a non-empty list of per-item metric dicts into one summary dict."""
        n = len(metrics)
        return {
            "avg_length": sum(m["length"] for m in metrics) / n,
            "avg_word_count": sum(m["word_count"] for m in metrics) / n,
            "avg_word_length": sum(m["avg_word_length"] for m in metrics) / n,
            "has_numbers_ratio": sum(1 for m in metrics if m["has_numbers"]) / n,
            "has_special_chars_ratio": sum(1 for m in metrics if m["has_special_chars"]) / n,
        }

    def analyze_dataset(self, dataset_name: str, data: List[Dict]):
        """Compute and store statistics for a single dataset."""
        if not data:
            logger.warning(f"No data found in {dataset_name}")
            return

        # Basic stats
        self.stats[dataset_name]["total_samples"] = len(data)

        # Per-item text quality metrics
        title_metrics = [self.analyze_text_quality(item["title"])
                         for item in data if "title" in item]
        abstract_metrics = [self.analyze_text_quality(item["abstract"])
                            for item in data if "abstract" in item]

        # Aggregate metrics (shared helper removes the duplicated dict literals)
        if title_metrics:
            self.stats[dataset_name]["title"] = self._aggregate(title_metrics)
        if abstract_metrics:
            self.stats[dataset_name]["abstract"] = self._aggregate(abstract_metrics)
            # Raw lengths feed the histograms drawn in plot_metrics().
            self._abstract_lengths[dataset_name] = [m["length"] for m in abstract_metrics]

        # Union of all fields seen across items
        fields = set()
        for item in data:
            fields.update(item.keys())
        self.stats[dataset_name]["fields"] = list(fields)

        # Year distribution (if available)
        if "year" in fields:
            years = [item["year"] for item in data if "year" in item]
            self.stats[dataset_name]["year_distribution"] = pd.Series(years).value_counts().to_dict()

    def analyze_all_datasets(self):
        """Analyze every *.json file in the data directory."""
        for file_path in self.data_dir.glob("*.json"):
            dataset_name = file_path.stem
            logger.info(f"Analyzing dataset: {dataset_name}")
            data = self.load_dataset(file_path)
            self.analyze_dataset(dataset_name, data)

    def generate_report(self):
        """Write a JSON quality report next to the data directory and return it."""
        report = {
            "summary": {},
            "datasets": self.stats
        }

        # Overall summary; .get() guards against datasets skipped as empty.
        total_samples = sum(stats.get("total_samples", 0) for stats in self.stats.values())
        report["summary"]["total_samples"] = total_samples
        report["summary"]["total_datasets"] = len(self.stats)

        # Save report; parents=True because reports/ may not exist yet.
        report_file = self.data_dir.parent / "reports" / "data_quality_report.json"
        report_file.parent.mkdir(parents=True, exist_ok=True)

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        logger.info(f"Quality report saved to {report_file}")
        return report

    def plot_metrics(self):
        """Generate plots for key metrics under reports/plots/."""
        plots_dir = self.data_dir.parent / "reports" / "plots"
        plots_dir.mkdir(parents=True, exist_ok=True)

        # Sample distribution across datasets
        plt.figure(figsize=(10, 6))
        samples = {name: stats.get("total_samples", 0) for name, stats in self.stats.items()}
        plt.bar(samples.keys(), samples.values())
        plt.xticks(rotation=45)
        plt.title("Sample Distribution Across Datasets")
        plt.tight_layout()
        plt.savefig(plots_dir / "sample_distribution.png")
        plt.close()

        # Abstract length histograms.
        # FIX: the original iterated self.stats[...]["abstract"], which is an
        # aggregate dict of scalar averages — iterating it yields key strings,
        # so m["length"] raised TypeError. Histograms need the raw per-item
        # lengths collected in analyze_dataset().
        for dataset_name, lengths in self._abstract_lengths.items():
            if not lengths:
                continue
            plt.figure(figsize=(10, 6))
            plt.hist(lengths, bins=50)
            plt.title(f"Abstract Length Distribution - {dataset_name}")
            plt.xlabel("Length")
            plt.ylabel("Count")
            plt.tight_layout()
            plt.savefig(plots_dir / f"abstract_length_{dataset_name}.png")
            plt.close()
154
+
155
def main():
    """Run the full analysis pipeline and print a console summary."""
    analyzer = DataQualityAnalyzer()
    analyzer.analyze_all_datasets()
    summary_report = analyzer.generate_report()
    analyzer.plot_metrics()

    # Console summary of what was just written to the report file.
    print("\nData Quality Summary:")
    print(f"Total samples: {summary_report['summary']['total_samples']}")
    print(f"Total datasets: {summary_report['summary']['total_datasets']}")
    print("\nPer Dataset Summary:")
    for name, dataset_stats in summary_report["datasets"].items():
        print(f"\n{name}:")
        print(f"  Samples: {dataset_stats['total_samples']}")
        if "abstract" in dataset_stats:
            print(f"  Avg abstract length: {dataset_stats['abstract']['avg_length']:.1f}")
            print(f"  Avg words per abstract: {dataset_stats['abstract']['avg_word_count']:.1f}")

if __name__ == "__main__":
    main()
aniket.py ADDED
File without changes
api.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from typing import List, Optional
5
+ import uvicorn
6
+ import sys
7
+ import os
8
+
9
+ # Add src directory to Python path
10
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
11
+
12
+ # Import the medical generator
13
+ from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY
14
+
15
# FastAPI application instance; metadata shows up in the generated OpenAPI docs.
app = FastAPI(
    title="Synthex Medical Text Generator API",
    description="API for generating synthetic medical records",
    version="1.0.0"
)

# Add CORS middleware
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# disallowed by the CORS spec (browsers reject wildcard origins on
# credentialed requests) — confirm whether credentials are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Initialize the generator
# Shared, lazily-created instance: set by the startup hook, or on the first
# /generate request if startup initialization failed.
generator = None
32
+
33
class GenerationRequest(BaseModel):
    """Request body for POST /generate."""
    record_type: str  # one of the values returned by GET /record-types
    quantity: int = 1  # number of records to generate
    use_gemini: bool = False  # route generation through the Gemini API
    gemini_api_key: Optional[str] = None  # used only if the generator must be (re)initialized
    include_metadata: bool = True  # NOTE(review): accepted but not forwarded to generate_record — confirm intent
39
+
40
class GenerationResponse(BaseModel):
    """Response body for POST /generate."""
    records: List[dict]  # generated records, one dict per record
    total_generated: int  # equals len(records)
43
+
44
@app.on_event("startup")  # NOTE(review): on_event is deprecated in newer FastAPI; lifespan handlers replace it — confirm target version
async def startup_event():
    """Eagerly build the shared MedicalTextGenerator when the app starts.

    Failure is non-fatal: the error is printed and `generator` stays None,
    in which case /generate retries initialization lazily.
    """
    global generator
    try:
        generator = MedicalTextGenerator()
    except Exception as e:
        print(f"Error initializing generator: {str(e)}")
51
+
52
@app.get("/")
async def root():
    """Landing endpoint: returns a static welcome payload."""
    welcome = {"message": "Welcome to Synthex Medical Text Generator API"}
    return welcome
55
+
56
@app.post("/generate", response_model=GenerationResponse)
async def generate_records(request: GenerationRequest):
    """Generate ``request.quantity`` synthetic records of ``request.record_type``.

    Returns HTTP 500 if the generator cannot be initialized or if any
    record generation fails.
    """
    global generator

    # Lazy fallback: the startup hook may have failed or not run yet.
    if generator is None:
        try:
            generator = MedicalTextGenerator(gemini_api_key=request.gemini_api_key)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to initialize generator: {str(e)}")

    try:
        batch = [
            generator.generate_record(request.record_type, use_gemini=request.use_gemini)
            for _ in range(request.quantity)
        ]
        return GenerationResponse(records=batch, total_generated=len(batch))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
81
+
82
@app.get("/record-types")
async def get_record_types():
    """List the record types accepted by POST /generate."""
    supported = [
        "clinical_note",
        "discharge_summary",
        "lab_report",
        "prescription",
        "patient_intake"
    ]
    return {"record_types": supported}
93
+
94
# Run a local development server when executed directly (auto-reload enabled).
if __name__ == "__main__":
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
app.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generator - MVP Streamlit App
3
+ Deploy this on Hugging Face Spaces for free hosting
4
+ """
5
+
6
+ import streamlit as st
7
+ import json
8
+ import time
9
+ from datetime import datetime
10
+ import pandas as pd
11
+ import os
12
+ import sys
13
+ import logging
14
+
15
+ # Setup logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Add src directory to Python path
20
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
21
+
22
+ # Import the medical generator
23
+ from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY
24
+
25
+ # Page config
26
+ st.set_page_config(
27
+ page_title="Synthex Medical Text Generator",
28
+ page_icon="🏥",
29
+ layout="wide",
30
+ initial_sidebar_state="expanded"
31
+ )
32
+
33
+ # Custom CSS
34
+ st.markdown("""
35
+ <style>
36
+ /* Main container styling */
37
+ .main {
38
+ padding: 2rem;
39
+ background-color: #f8f9fa;
40
+ }
41
+
42
+ /* Header styling */
43
+ .main-header {
44
+ font-size: 2.5rem;
45
+ font-weight: bold;
46
+ color: #1f77b4;
47
+ text-align: center;
48
+ margin-bottom: 1rem;
49
+ padding: 1rem;
50
+ background: linear-gradient(135deg, #1f77b4 0%, #2c9cdb 100%);
51
+ color: white;
52
+ border-radius: 10px;
53
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
54
+ }
55
+
56
+ .sub-header {
57
+ font-size: 1.2rem;
58
+ color: #666;
59
+ text-align: center;
60
+ margin-bottom: 2rem;
61
+ padding: 0.5rem;
62
+ }
63
+
64
+ /* Card styling */
65
+ .record-container {
66
+ background-color: white;
67
+ padding: 1.5rem;
68
+ border-radius: 10px;
69
+ border-left: 4px solid #1f77b4;
70
+ margin: 1rem 0;
71
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
72
+ transition: transform 0.2s;
73
+ }
74
+
75
+ .record-container:hover {
76
+ transform: translateY(-2px);
77
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
78
+ }
79
+
80
+ /* Stats container styling */
81
+ .stats-container {
82
+ background-color: white;
83
+ padding: 1.5rem;
84
+ border-radius: 10px;
85
+ margin: 1rem 0;
86
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
87
+ }
88
+
89
+ /* Button styling */
90
+ .stButton>button {
91
+ width: 100%;
92
+ border-radius: 5px;
93
+ height: 3em;
94
+ font-weight: bold;
95
+ transition: all 0.3s;
96
+ }
97
+
98
+ .stButton>button:hover {
99
+ transform: translateY(-2px);
100
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
101
+ }
102
+
103
+ /* Metric styling */
104
+ .stMetric {
105
+ background-color: #f8f9fa;
106
+ padding: 1rem;
107
+ border-radius: 5px;
108
+ text-align: center;
109
+ }
110
+
111
+ /* Sidebar styling */
112
+ .sidebar .sidebar-content {
113
+ background-color: #f8f9fa;
114
+ }
115
+
116
+ /* Progress bar styling */
117
+ .stProgress > div > div {
118
+ background-color: #1f77b4;
119
+ }
120
+
121
+ /* Success message styling */
122
+ .stSuccess {
123
+ padding: 1rem;
124
+ border-radius: 5px;
125
+ background-color: #d4edda;
126
+ color: #155724;
127
+ margin: 1rem 0;
128
+ }
129
+
130
+ /* Error message styling */
131
+ .stError {
132
+ padding: 1rem;
133
+ border-radius: 5px;
134
+ background-color: #f8d7da;
135
+ color: #721c24;
136
+ margin: 1rem 0;
137
+ }
138
+
139
+ /* Expander styling */
140
+ .streamlit-expanderHeader {
141
+ font-size: 1.1rem;
142
+ font-weight: bold;
143
+ color: #1f77b4;
144
+ }
145
+
146
+ /* Text area styling */
147
+ .stTextArea textarea {
148
+ font-family: monospace;
149
+ font-size: 0.9rem;
150
+ line-height: 1.5;
151
+ }
152
+ </style>
153
+ """, unsafe_allow_html=True)
154
+
155
+ # Initialize session state
156
+ if 'generated_records' not in st.session_state:
157
+ st.session_state.generated_records = []
158
+ if 'total_generated' not in st.session_state:
159
+ st.session_state.total_generated = 0
160
+ if 'generator' not in st.session_state:
161
+ st.session_state.generator = None
162
+
163
+ # Header
164
+ st.markdown('<div class="main-header">🏥 Synthex Medical Text Generator</div>', unsafe_allow_html=True)
165
+ st.markdown('<div class="sub-header">Generate synthetic medical records for AI training and testing</div>', unsafe_allow_html=True)
166
+
167
+ # Add a status message area
168
+ status_area = st.empty()
169
+
170
+ # Sidebar
171
+ with st.sidebar:
172
+ st.markdown("### ⚙️ Configuration")
173
+
174
+ # API Key section
175
+ with st.expander("🔑 API Settings", expanded=False):
176
+ gemini_api_key = st.text_input(
177
+ "Gemini API Key",
178
+ value=os.getenv('GEMINI_API_KEY', ''),
179
+ type="password",
180
+ help="Enter your Google Gemini API key for better generation quality"
181
+ )
182
+
183
+ # Record settings
184
+ st.markdown("### 📝 Record Settings")
185
+ record_type = st.selectbox(
186
+ "Select Record Type",
187
+ ["clinical_note", "discharge_summary", "lab_report", "prescription", "patient_intake"],
188
+ format_func=lambda x: x.replace("_", " ").title()
189
+ )
190
+
191
+ quantity = st.slider("Number of Records", 1, 20, 5)
192
+
193
+ # Generation settings
194
+ st.markdown("### 🤖 Generation Settings")
195
+ use_gemini = st.checkbox(
196
+ "Use Gemini API",
197
+ value=False,
198
+ help="Uses Google Gemini API for better quality generation"
199
+ )
200
+
201
+ # Advanced options
202
+ with st.expander("⚡ Advanced Options"):
203
+ include_metadata = st.checkbox("Include Metadata", value=True)
204
+ export_format = st.selectbox("Export Format", ["JSON", "CSV", "TXT"])
205
+
206
+ # Main content with better organization
207
+ col1, col2 = st.columns([2, 1])
208
+
209
+ with col1:
210
+ st.markdown("### 📝 Generate Records")
211
+
212
+ # Generation button with better styling
213
+ if st.button("🚀 Generate Records", type="primary", use_container_width=True):
214
+ status_area.info("Initializing generator...")
215
+
216
+ # Initialize generator if not already done
217
+ if st.session_state.generator is None:
218
+ try:
219
+ with st.spinner("Initializing medical text generator..."):
220
+ st.session_state.generator = MedicalTextGenerator(gemini_api_key=gemini_api_key)
221
+ status_area.success("Generator initialized successfully!")
222
+ except Exception as e:
223
+ status_area.error(f"Error initializing generator: {str(e)}")
224
+ st.stop()
225
+
226
+ # Generate records with progress
227
+ progress_bar = st.progress(0)
228
+ status_text = st.empty()
229
+
230
+ generated_records = []
231
+
232
+ for i in range(quantity):
233
+ status_text.text(f"Generating record {i+1} of {quantity}...")
234
+ progress_bar.progress((i + 1) / quantity)
235
+
236
+ try:
237
+ record = st.session_state.generator.generate_record(record_type, use_gemini=use_gemini)
238
+ generated_records.append(record)
239
+
240
+ # Rate limiting
241
+ if use_gemini:
242
+ time.sleep(1)
243
+
244
+ except Exception as e:
245
+ logger.error(f"Failed to generate record {i+1}: {str(e)}")
246
+ status_area.error(f"Failed to generate record {i+1}: {str(e)}")
247
+ continue
248
+
249
+ # Update session state
250
+ if generated_records:
251
+ st.session_state.generated_records.extend(generated_records)
252
+ st.session_state.total_generated += len(generated_records)
253
+
254
+ status_text.text("✅ Generation complete!")
255
+ progress_bar.progress(1.0)
256
+
257
+ status_area.success(f"Successfully generated {len(generated_records)} medical records!")
258
+
259
+ # Display generated records with better organization
260
+ if st.session_state.generated_records:
261
+ st.markdown("### 📋 Generated Records")
262
+
263
+ # Filters with better layout
264
+ col_filter1, col_filter2 = st.columns(2)
265
+ with col_filter1:
266
+ filter_type = st.selectbox(
267
+ "Filter by Type",
268
+ ["All"] + list(set([r.get('type', 'Unknown') for r in st.session_state.generated_records]))
269
+ )
270
+ with col_filter2:
271
+ records_per_page = st.selectbox("Records per page", [5, 10, 20, 50])
272
+
273
+ # Filter records
274
+ filtered_records = st.session_state.generated_records
275
+ if filter_type != "All":
276
+ filtered_records = [r for r in filtered_records if r.get('type', 'Unknown') == filter_type]
277
+
278
+ # Pagination
279
+ total_records = len(filtered_records)
280
+ total_pages = (total_records - 1) // records_per_page + 1
281
+
282
+ if total_pages > 1:
283
+ page = st.selectbox("Page", range(1, total_pages + 1))
284
+ start_idx = (page - 1) * records_per_page
285
+ end_idx = start_idx + records_per_page
286
+ page_records = filtered_records[start_idx:end_idx]
287
+ else:
288
+ page_records = filtered_records
289
+
290
+ # Display records with better styling
291
+ for i, record in enumerate(page_records):
292
+ with st.expander(f"Record {record.get('id', 'Unknown')} - {record.get('type', 'Unknown').replace('_', ' ').title()}"):
293
+ if include_metadata:
294
+ col_meta1, col_meta2, col_meta3 = st.columns(3)
295
+ with col_meta1:
296
+ st.metric("Type", record.get('type', 'Unknown').replace('_', ' ').title())
297
+ with col_meta2:
298
+ st.metric("Generated", record.get('timestamp', 'N/A'))
299
+ with col_meta3:
300
+ st.metric("Source", record.get('source', 'Hugging Face'))
301
+
302
+ st.markdown('<div class="record-container">', unsafe_allow_html=True)
303
+ st.text_area("Content", record.get('text', 'No content available'), height=200, key=f"record_{i}")
304
+ st.markdown('</div>', unsafe_allow_html=True)
305
+
306
+ with col2:
307
+ st.markdown("### 📊 Statistics")
308
+
309
+ # Stats container with better styling
310
+ st.markdown('<div class="stats-container">', unsafe_allow_html=True)
311
+
312
+ # Total records
313
+ st.metric("Total Records Generated", st.session_state.total_generated)
314
+
315
+ # Record type distribution with better visualization
316
+ if st.session_state.generated_records:
317
+ type_counts = pd.Series([r.get('type', 'Unknown') for r in st.session_state.generated_records]).value_counts()
318
+ st.markdown("#### Record Type Distribution")
319
+ st.bar_chart(type_counts)
320
+
321
+ # Export options with better organization
322
+ st.markdown("#### 💾 Export Data")
323
+ if st.session_state.generated_records:
324
+ if export_format == "JSON":
325
+ json_str = json.dumps(st.session_state.generated_records, indent=2)
326
+ st.download_button(
327
+ "📥 Download JSON",
328
+ json_str,
329
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
330
+ mime="application/json",
331
+ use_container_width=True
332
+ )
333
+ elif export_format == "CSV":
334
+ df = pd.DataFrame(st.session_state.generated_records)
335
+ csv = df.to_csv(index=False)
336
+ st.download_button(
337
+ "📥 Download CSV",
338
+ csv,
339
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
340
+ mime="text/csv",
341
+ use_container_width=True
342
+ )
343
+ elif export_format == "TXT":
344
+ txt = "\n\n".join([f"Record {r.get('id', 'Unknown')} ({r.get('type', 'Unknown')}):\n{r.get('text', 'No content available')}" for r in st.session_state.generated_records])
345
+ st.download_button(
346
+ "📥 Download TXT",
347
+ txt,
348
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
349
+ mime="text/plain",
350
+ use_container_width=True
351
+ )
352
+
353
+ st.markdown('</div>', unsafe_allow_html=True)
354
+
355
+ # Add a footer
356
+ st.markdown("---")
357
+ st.markdown("""
358
+ <div style='text-align: center; color: #666;'>
359
+ <p>Built with ❤️ using Streamlit | Synthex Medical Text Generator</p>
360
+ </div>
361
+ """, unsafe_allow_html=True)
batch_generate.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Batch-generate 100 synthetic medical records and save them as JSON.

Run from the repository root so the relative output path and the
``src.generation`` import resolve.
"""

import os
import json
import random
import sys
import time
from pathlib import Path
from src.generation.medical_generator import MedicalTextGenerator

# Check for Gemini API key.
# NOTE(review): generation below runs with use_gemini=False, so the key is
# never actually consumed here -- confirm whether this hard requirement is
# intentional or a leftover.
if not os.getenv('GEMINI_API_KEY'):
    print("Please set the GEMINI_API_KEY environment variable:")
    print("Windows PowerShell: $env:GEMINI_API_KEY='your-api-key-here'")
    print("Windows CMD: set GEMINI_API_KEY=your-api-key-here")
    # Fix: use sys.exit instead of the site-supplied exit() builtin, which is
    # not guaranteed to exist when a script runs without the site module.
    sys.exit(1)

# Ensure the output directory exists
output_dir = Path("data/synthetic")
output_dir.mkdir(parents=True, exist_ok=True)

# Initialize the generator
generator = MedicalTextGenerator()

# Define supported record types (using the keys from the generator's templates)
record_types = ["clinical_note", "discharge_summary", "lab_report"]

# Generate 100 mixed records
records = []
for i in range(100):
    # Randomly select record type
    record_type = random.choice(record_types)

    # Generate record using Hugging Face
    try:
        record = generator.generate_record(record_type, use_gemini=False)
        print(f"Generated record {i+1}/100: {record_type}")

        # Append record details
        records.append({
            "id": i + 1,
            "type": record_type,
            "content": record,
            "generator": "Hugging Face",
            "generated_at": time.strftime("%Y-%m-%d %H:%M:%S")
        })

        # Respect rate limits (e.g., 4 seconds between calls)
        time.sleep(4)

    except Exception as e:
        # Best-effort batch: log the failure and keep generating.
        print(f"Error generating record {i+1}: {str(e)}")
        continue

# Save records to a JSON file.
# Fix: write with an explicit UTF-8 encoding so the output does not depend on
# the platform's default codec (medical text may contain non-ASCII characters).
output_file = output_dir / "synthetic_records.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2)

print(f"\nGenerated {len(records)} records and saved to {output_file}")
data/processed/.gitkeep ADDED
File without changes
data/reports/plots/sample_distribution.png ADDED
data/synthetic/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML libraries
2
+ torch==2.2.1
3
+ transformers==4.38.2
4
+ datasets>=2.12.0
5
+ huggingface_hub>=0.15.0
6
+
7
+ # Web framework
8
+ streamlit==1.32.0
9
+ gradio>=3.35.0
10
+ fastapi>=0.115.2
11
+ uvicorn>=0.24.0
12
+
13
+ # Data processing
14
+ pandas==2.2.1
15
+ numpy==1.26.4
16
+ requests>=2.31.0
17
+ beautifulsoup4>=4.12.0
18
+ lxml>=4.9.0
19
+
20
+ # Medical NLP
21
+ spacy>=3.6.0
22
+ scikit-learn>=1.3.0
23
+
24
+ # API integration
25
+ google-generativeai==0.3.2
26
+
27
+ # Utilities
28
+ python-dotenv==1.0.1
29
+ tqdm>=4.65.0
setup.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import subprocess
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ # Setup logging
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
def check_python_version():
    """Abort the process unless the interpreter is Python 3.11 or newer."""
    supported = sys.version_info >= (3, 11)
    if not supported:
        logger.error("Python 3.11 or higher is required")
        sys.exit(1)
    logger.info(f"Python version {sys.version_info.major}.{sys.version_info.minor} detected")
17
+
18
def create_virtual_environment():
    """Create the ``synthex_env`` virtual environment unless it already exists."""
    venv_name = "synthex_env"
    # Guard clause: nothing to do when the venv directory is already present.
    if os.path.exists(venv_name):
        logger.info(f"Virtual environment {venv_name} already exists")
        return
    logger.info(f"Creating virtual environment: {venv_name}")
    subprocess.run([sys.executable, "-m", "venv", venv_name], check=True)
26
+
27
def install_requirements():
    """Install the packages listed in requirements.txt.

    Fix: the original always installed into the interpreter running this
    script (``sys.executable``), so the packages never landed inside the
    ``synthex_env`` virtual environment that create_virtual_environment()
    just built. Prefer the venv's own interpreter when it exists, falling
    back to the current one otherwise.

    Raises:
        subprocess.CalledProcessError: if pip exits non-zero (check=True).
    """
    # Locate the venv interpreter (layout differs between Windows and POSIX).
    if os.name == "nt":
        venv_python = Path("synthex_env") / "Scripts" / "python.exe"
    else:
        venv_python = Path("synthex_env") / "bin" / "python"
    python = str(venv_python) if venv_python.exists() else sys.executable

    logger.info("Installing requirements...")
    subprocess.run([python, "-m", "pip", "install", "-r", "requirements.txt"], check=True)
31
+
32
def create_directories():
    """Create the data directories the pipeline writes into (idempotent)."""
    for directory in ("data/raw", "data/processed", "data/reports", "data/reports/plots"):
        Path(directory).mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory: {directory}")
43
+
44
def setup_environment():
    """Run the full bootstrap: version check, venv, dependencies, directories.

    Exits the process with status 1 on any failure.
    """
    try:
        logger.info("Starting environment setup...")

        # The four setup phases, in dependency order.
        for step in (check_python_version,
                     create_virtual_environment,
                     install_requirements,
                     create_directories):
            step()

        logger.info("Environment setup completed successfully!")
        logger.info("\nNext steps:")
        logger.info("1. Activate the virtual environment:")
        logger.info(" - Windows: synthex_env\\Scripts\\activate")
        logger.info(" - Unix/MacOS: source synthex_env/bin/activate")
        logger.info("2. Run data collection: python setup_data.py")
        logger.info("3. Analyze data quality: python analyze_data_quality.py")

    except subprocess.CalledProcessError as e:
        # A subprocess (venv creation or pip) reported failure.
        logger.error(f"Error during setup: {str(e)}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
        sys.exit(1)
75
+
76
# Script entry point: run the full environment bootstrap when executed directly.
if __name__ == "__main__":
    setup_environment()
setup_data.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+ import logging
5
+ import subprocess
6
+
7
+ # Add src directory to Python path
8
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
9
+
10
+ # Setup logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(levelname)s - %(message)s'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
def setup_data_directories():
    """Ensure the raw/processed/synthetic data folders exist, each with a .gitkeep."""
    for name in ("data/raw", "data/processed", "data/synthetic"):
        folder = Path(name)
        folder.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory: {name}")

        # Keep the otherwise-empty directory tracked by git.
        (folder / ".gitkeep").touch(exist_ok=True)
        logger.info(f"Created .gitkeep in {name}")
34
+
35
def main():
    """Create the data folders, then launch the collection script in a child process."""
    logger.info("Setting up data directories...")
    setup_data_directories()

    logger.info("Running data collection script via subprocess...")
    proc = subprocess.run([sys.executable, 'src/data_collection/data_collection.py'])

    if proc.returncode == 0:
        logger.info("Data collection completed successfully.")
    else:
        logger.error(f"Data collection script failed with exit code {proc.returncode}")
+
47
# Script entry point: bootstrap data directories and run the collection pipeline.
if __name__ == "__main__":
    main()
src/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generator
3
+ A tool for generating synthetic medical records
4
+ """
5
+
6
+ __version__ = "0.1.0"
src/api/app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.responses import FileResponse
5
+ from pydantic import BaseModel
6
+ from typing import List, Optional
7
+ import sys
8
+ import os
9
+ import logging
10
+
11
+ # Add src directory to Python path
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from generation.medical_generator import MedicalTextGenerator
15
+
16
+ # Setup logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
# FastAPI application object: also serves the static HTML UI (see read_root).
app = FastAPI(
    title="Synthex Medical Text Generator API",
    description="API for generating synthetic medical records",
    version="1.0.0"
)

# Add CORS middleware
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec (browsers will not honour a wildcard origin when
# credentials are enabled) -- replace with an explicit origin list before
# production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Mount static files (the directory path is relative to the working directory,
# so the server must be started from the repository root).
app.mount("/static", StaticFiles(directory="src/web"), name="static")

# Initialize generator: a single module-level instance shared by every request
# handler below.
generator = MedicalTextGenerator()
40
+
41
class GenerationRequest(BaseModel):
    """Request body for POST /generate."""

    record_type: str  # must be one of generator.templates' keys (validated in handler)
    quantity: int = 1  # number of records to generate; handler enforces 1..10
    use_gemini: bool = False  # route generation through the Gemini backend
    include_metadata: bool = True  # NOTE(review): not read by the /generate handler -- confirm intent
46
+
47
class MedicalRecord(BaseModel):
    """A single synthetic medical record as returned in a GenerationResponse."""

    id: str
    type: str  # record type key, e.g. "clinical_note"
    text: str  # the generated record body
    timestamp: str  # generation time; format set by the generator -- presumably ISO-like, verify
    source: str  # which backend produced the record
53
+
54
class GenerationResponse(BaseModel):
    """Response body for POST /generate."""

    records: List[MedicalRecord]
    total_generated: int  # equals len(records)
57
+
58
+ @app.get("/")
59
+ async def read_root():
60
+ """Serve the HTML interface"""
61
+ return FileResponse("src/web/index.html")
62
+
63
+ @app.get("/record-types")
64
+ async def get_record_types():
65
+ """Get available record types"""
66
+ return {"record_types": list(generator.templates.keys())}
67
+
68
+ @app.post("/generate", response_model=GenerationResponse)
69
+ async def generate_records(request: GenerationRequest):
70
+ """Generate synthetic medical records"""
71
+ try:
72
+ if request.record_type not in generator.templates:
73
+ raise HTTPException(status_code=400, detail=f"Invalid record type. Available types: {list(generator.templates.keys())}")
74
+
75
+ if request.quantity < 1 or request.quantity > 10:
76
+ raise HTTPException(status_code=400, detail="Quantity must be between 1 and 10")
77
+
78
+ records = generator.batch_generate(
79
+ record_type=request.record_type,
80
+ count=request.quantity,
81
+ use_gemini=request.use_gemini
82
+ )
83
+
84
+ return {
85
+ "records": records,
86
+ "total_generated": len(records)
87
+ }
88
+
89
+ except Exception as e:
90
+ logger.error(f"Error generating records: {str(e)}")
91
+ raise HTTPException(status_code=500, detail=str(e))
92
+
93
+ if __name__ == "__main__":
94
+ import uvicorn
95
+ uvicorn.run(app, host="0.0.0.0", port=8000)
src/data_collection/data_collection.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Medical Data Collection Pipeline for Synthex MVP
3
+ Collects medical text from free sources for training data
4
+ """
5
+
6
+ import requests
7
+ import pandas as pd
8
+ from datasets import load_dataset
9
+ import time
10
+ import json
11
+ from pathlib import Path
12
+ from typing import List, Dict, Any
13
+ import logging
14
+ import sys
15
+ from tqdm import tqdm
16
+ from bs4 import BeautifulSoup
17
+ import re
18
+ from datetime import datetime
19
+
20
+ # Setup logging
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format='%(asctime)s - %(levelname)s - %(message)s',
24
+ handlers=[
25
+ logging.StreamHandler(sys.stdout),
26
+ logging.FileHandler('data_collection.log')
27
+ ]
28
+ )
29
+ logger = logging.getLogger(__name__)
30
+
31
class MedicalDataCollector:
    """Collects medical text from Hugging Face datasets and PubMed.

    Side effects: writes per-source JSON files under ``output_dir``, a merged
    training CSV under ``<output_dir>/../processed`` and a run report under
    ``<output_dir>/../reports``. Collection statistics accumulate in
    ``self.stats`` and are serialised by :meth:`generate_report`.
    """

    def __init__(self, output_dir: str = "data/raw"):
        """Create the output directory and initialise the stats accumulator."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Running statistics; datetime values are converted to strings by
        # generate_report() before JSON serialisation.
        self.stats = {
            "total_samples": 0,
            "sources": {},
            "errors": [],
            "start_time": datetime.now()
        }
        logger.info(f"Initialized MedicalDataCollector with output directory: {self.output_dir}")

    def collect_huggingface_datasets(self) -> Dict[str, List]:
        """Collect medical datasets from Hugging Face Hub.

        Returns a mapping of dataset key -> list of processed items; each
        dataset is also written to ``<output_dir>/<key>.json``. Failures are
        logged into ``self.stats["errors"]`` and do not abort the run.
        """

        # Only include datasets that are known to exist and are medical-related.
        # A tuple entry means the dataset needs an explicit config name.
        datasets_to_collect = [
            "medical_questions_pairs",
            "medalpaca/medical_meadow_medical_flashcards",
            "gamino/wiki_medical_terms",
            ("pubmed_qa", "pqa_artificial")  # pubmed_qa requires a config
        ]

        collected_data = {}

        for dataset_entry in tqdm(datasets_to_collect, desc="Collecting Hugging Face datasets"):
            try:
                if isinstance(dataset_entry, tuple):
                    dataset_name, config = dataset_entry
                    logger.info(f"Loading dataset: {dataset_name} with config: {config}")
                    dataset = load_dataset(dataset_name, config, split="train")
                    dataset_key = f"{dataset_name}_{config}"
                else:
                    dataset_name = dataset_entry
                    logger.info(f"Loading dataset: {dataset_name}")
                    dataset = load_dataset(dataset_name, split="train")
                    dataset_key = dataset_name

                # Convert to list of dictionaries, dropping items that fail
                # validation/cleaning in _process_dataset_item.
                data_list = []
                for item in dataset:
                    processed_item = self._process_dataset_item(item)
                    if processed_item:
                        data_list.append(processed_item)

                if data_list:
                    collected_data[dataset_key] = data_list
                    self.stats["sources"][dataset_key] = len(data_list)
                    self.stats["total_samples"] += len(data_list)

                    # Save to file ('/' in dataset names is not path-safe).
                    output_file = self.output_dir / f"{dataset_key.replace('/', '_')}.json"
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(data_list, f, indent=2, ensure_ascii=False)

                    logger.info(f"Saved {len(data_list)} samples from {dataset_key} to {output_file}")
                else:
                    logger.warning(f"No valid data found in dataset: {dataset_key}")

                time.sleep(1)  # Be respectful to APIs

            except Exception as e:
                # Best-effort collection: record the failure and move on.
                error_msg = f"Failed to load {dataset_entry}: {str(e)}"
                logger.error(error_msg, exc_info=True)
                self.stats["errors"].append(error_msg)
                continue

        return collected_data

    def collect_pubmed_abstracts(self, queries: List[str] = None, max_results: int = 1000) -> List[Dict]:
        """Collect PubMed abstracts via API.

        Args:
            queries: search terms; defaults to a fixed list of clinical topics.
            max_results: per-query cap passed to the esearch endpoint.

        Returns the combined abstract list, also saved to
        ``<output_dir>/pubmed_abstracts.json`` when non-empty.
        """

        if queries is None:
            queries = [
                "clinical notes",
                "medical case reports",
                "patient discharge summaries",
                "medical laboratory reports",
                "medical imaging reports"
            ]

        all_abstracts = []

        for query in tqdm(queries, desc="Collecting PubMed abstracts"):
            try:
                abstracts = self._collect_pubmed_query(query, max_results)
                all_abstracts.extend(abstracts)
                self.stats["sources"]["pubmed_" + query.replace(" ", "_")] = len(abstracts)
                self.stats["total_samples"] += len(abstracts)

            except Exception as e:
                error_msg = f"Failed to collect PubMed abstracts for {query}: {str(e)}"
                logger.error(error_msg)
                self.stats["errors"].append(error_msg)
                continue

        # Save all abstracts
        if all_abstracts:
            output_file = self.output_dir / "pubmed_abstracts.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_abstracts, f, indent=2, ensure_ascii=False)

        return all_abstracts

    def _collect_pubmed_query(self, query: str, max_results: int) -> List[Dict]:
        """Collect PubMed abstracts for a specific query.

        Uses NCBI E-utilities: esearch for IDs, then efetch in batches of 100.
        Returns [] on search failure; fetch failures skip only their batch.
        """

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        search_url = f"{base_url}esearch.fcgi"

        search_params = {
            "db": "pubmed",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
            "sort": "relevance"
        }

        try:
            response = requests.get(search_url, params=search_params)
            response.raise_for_status()  # Raise exception for bad status codes
            search_results = response.json()

            # Check rate limits
            # NOTE(review): E-utilities does not document X-RateLimit-* headers;
            # these defaults ('3'/'0') may always apply -- verify against real responses.
            rate_limit = int(response.headers.get('X-RateLimit-Limit', '3'))
            rate_remaining = int(response.headers.get('X-RateLimit-Remaining', '0'))
            logger.info(f"Rate limit: {rate_remaining}/{rate_limit} requests remaining")

            if rate_remaining <= 1:
                logger.warning("Rate limit nearly reached, waiting 60 seconds")
                time.sleep(60)

        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch PubMed search results for query '{query}': {str(e)}")
            return []
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse PubMed search results for query '{query}': {str(e)}")
            return []

        if "esearchresult" not in search_results:
            logger.warning(f"No search results found for query '{query}'")
            return []

        id_list = search_results["esearchresult"]["idlist"]
        abstracts = []
        batch_size = 100

        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i+batch_size]
            ids_str = ",".join(batch_ids)

            fetch_url = f"{base_url}efetch.fcgi"
            fetch_params = {
                "db": "pubmed",
                "id": ids_str,
                "retmode": "xml"
            }

            try:
                response = requests.get(fetch_url, params=fetch_params)
                response.raise_for_status()

                # Check rate limits
                rate_limit = int(response.headers.get('X-RateLimit-Limit', '3'))
                rate_remaining = int(response.headers.get('X-RateLimit-Remaining', '0'))
                logger.info(f"Rate limit: {rate_remaining}/{rate_limit} requests remaining")

                if rate_remaining <= 1:
                    logger.warning("Rate limit nearly reached, waiting 60 seconds")
                    time.sleep(60)

                # Parse XML with proper features
                # NOTE(review): BeautifulSoup's second positional argument IS
                # `features`, so passing features="xml" as well raises
                # TypeError ("got multiple values"); the except below then
                # skips EVERY batch. Likely intended:
                # BeautifulSoup(response.text, 'xml'). Confirm and fix.
                soup = BeautifulSoup(response.text, 'lxml', features="xml")

            except requests.exceptions.RequestException as e:
                logger.error(f"Failed to fetch PubMed article batch {i//batch_size + 1}: {str(e)}")
                continue
            except Exception as e:
                logger.error(f"Failed to parse PubMed article batch {i//batch_size + 1}: {str(e)}")
                continue

            for article in soup.find_all('PubmedArticle'):
                try:
                    abstract = article.find('Abstract')
                    if abstract:
                        abstract_text = abstract.get_text().strip()
                        if len(abstract_text) > 100:  # Filter out very short abstracts
                            title = article.find('ArticleTitle')
                            if not title:
                                continue
                            title_text = title.get_text().strip()

                            pub_date = article.find('PubDate')
                            year = "Unknown"
                            if pub_date and pub_date.find('Year'):
                                year = pub_date.find('Year').get_text().strip()

                            abstracts.append({
                                "title": title_text,
                                "abstract": abstract_text,
                                "year": year,
                                "source": "pubmed",
                                "query": query
                            })
                except Exception as e:
                    logger.debug(f"Failed to process article in batch {i//batch_size + 1}: {str(e)}")
                    continue

            # Always wait between batches to respect rate limits
            time.sleep(1)

        logger.info(f"Collected {len(abstracts)} abstracts for query '{query}'")
        return abstracts

    def create_training_dataset(self) -> pd.DataFrame:
        """Combine all collected data into training dataset.

        Reads every ``*.json`` in ``output_dir``, cleans and type-tags each
        item, filters by length (100 < len < 5000), de-duplicates, and writes
        ``../processed/training_data.csv``. Also sets ``final_samples`` and
        ``text_types`` in ``self.stats``.
        """

        all_texts = []

        # Load all collected datasets
        for json_file in tqdm(list(self.output_dir.glob("*.json")), desc="Processing collected data"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Extract text content
                for item in data:
                    text_content = self._extract_text_content(item)
                    if text_content:
                        processed_text = self._clean_text(text_content)
                        if processed_text:
                            all_texts.append({
                                "text": processed_text,
                                "source": json_file.stem,
                                "length": len(processed_text),
                                "type": self._determine_text_type(processed_text)
                            })

            except Exception as e:
                error_msg = f"Failed to process {json_file}: {str(e)}"
                logger.error(error_msg)
                self.stats["errors"].append(error_msg)
                continue

        # Create DataFrame
        df = pd.DataFrame(all_texts)

        # Basic filtering
        df = df[df['length'] > 100]  # Remove very short texts
        df = df[df['length'] < 5000]  # Remove very long texts

        # Remove duplicates
        df = df.drop_duplicates(subset=['text'])

        # Save processed dataset
        output_file = self.output_dir.parent / "processed" / "training_data.csv"
        output_file.parent.mkdir(exist_ok=True)
        df.to_csv(output_file, index=False, encoding='utf-8')

        # Update stats
        self.stats["final_samples"] = len(df)
        self.stats["text_types"] = df['type'].value_counts().to_dict()

        logger.info(f"Created training dataset with {len(df)} samples")
        return df

    def _process_dataset_item(self, item: Dict) -> Dict:
        """Process and validate a dataset item.

        Returns a cleaned dict with "text"/"source"/"type" (plus any of
        title/question/answer/instruction found on the item), or None when
        the item has no usable text (< 100 chars) or processing fails.
        """
        try:
            # Extract text content
            text = self._extract_text_content(item)
            if not text or len(text) < 100:
                return None

            # Clean text
            cleaned_text = self._clean_text(text)
            if not cleaned_text:
                return None

            # Create processed item
            processed = {
                "text": cleaned_text,
                "source": "huggingface",
                "type": self._determine_text_type(cleaned_text)
            }

            # Add metadata if available
            for key in ['title', 'question', 'answer', 'instruction']:
                if key in item:
                    processed[key] = str(item[key])

            return processed

        except Exception:
            # Deliberately silent: a bad item is simply dropped.
            return None

    def _extract_text_content(self, item: Dict) -> str:
        """Extract relevant text content from dataset item.

        Tries well-known text fields first (first match wins); otherwise
        concatenates every string value longer than 20 characters.
        """

        # Common text fields in medical datasets
        text_fields = ['text', 'content', 'abstract', 'question', 'answer',
                       'instruction', 'output', 'input', 'context']

        for field in text_fields:
            if field in item and item[field]:
                return str(item[field])

        # Fallback: combine multiple fields
        combined_text = ""
        for key, value in item.items():
            if isinstance(value, str) and len(value) > 20:
                combined_text += f"{value} "

        return combined_text.strip()

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Strips unusual punctuation, collapses whitespace, and removes
        URLs and email addresses. Returns "" for falsy input.
        """
        if not text:
            return ""

        # Remove special characters and normalize whitespace
        text = re.sub(r'[^\w\s.,;:!?()-]', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        # Remove common noise
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        return text.strip()

    def _determine_text_type(self, text: str) -> str:
        """Determine the type of medical text by keyword matching.

        First matching category wins; anything unmatched defaults to
        'clinical_note'.
        """
        text = text.lower()

        if any(term in text for term in ['discharge', 'summary', 'discharge summary']):
            return 'discharge_summary'
        elif any(term in text for term in ['lab', 'laboratory', 'test results']):
            return 'lab_report'
        elif any(term in text for term in ['prescription', 'medication', 'drug']):
            return 'prescription'
        elif any(term in text for term in ['question', 'answer', 'qa']):
            return 'medical_qa'
        else:
            return 'clinical_note'

    def generate_report(self) -> Dict:
        """Generate a report of the data collection process.

        Mutates self.stats in place (datetimes -> strings, adds end_time and
        duration), writes it to ../reports/collection_report.json, and
        returns it.
        """
        # Convert all datetime objects to strings
        for k, v in self.stats.items():
            if isinstance(v, datetime):
                self.stats[k] = str(v)
        self.stats["end_time"] = str(datetime.now())
        if isinstance(self.stats["start_time"], datetime):
            self.stats["start_time"] = str(self.stats["start_time"])
        # Calculate duration as string (str(datetime) round-trips through
        # fromisoformat, so this normally succeeds).
        try:
            start_dt = datetime.fromisoformat(self.stats["start_time"])
            end_dt = datetime.fromisoformat(self.stats["end_time"])
            self.stats["duration"] = str(end_dt - start_dt)
        except Exception:
            self.stats["duration"] = "unknown"

        report_file = self.output_dir.parent / "reports" / "collection_report.json"
        report_file.parent.mkdir(exist_ok=True)

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2, ensure_ascii=False)

        return self.stats
401
+
402
def main():
    """Run data collection pipeline"""

    try:
        collector = MedicalDataCollector()

        # Phase 1: Hugging Face datasets.
        logger.info("Starting Hugging Face dataset collection...")
        collector.collect_huggingface_datasets()

        # Phase 2: PubMed abstracts.
        logger.info("Starting PubMed collection...")
        collector.collect_pubmed_abstracts()

        # Phase 3: merge, clean and filter into the training CSV.
        logger.info("Creating training dataset...")
        collector.create_training_dataset()

        # Persist and summarise the run.
        summary = collector.generate_report()

        logger.info("\nData Collection Summary:")
        logger.info(f"Total samples collected: {summary['total_samples']}")
        logger.info(f"Final training samples: {summary['final_samples']}")
        logger.info(f"Duration: {summary['duration']}")
        logger.info("\nText types distribution:")
        for kind, count in summary['text_types'].items():
            logger.info(f"- {kind}: {count}")

        if summary['errors']:
            logger.warning(f"\nEncountered {len(summary['errors'])} errors during collection")

    except Exception as e:
        logger.error(f"Data collection failed: {str(e)}", exc_info=True)
        sys.exit(1)
438
+
439
# Script entry point: run the full collection pipeline when executed directly.
if __name__ == "__main__":
    main()
src/generation/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generation Package
3
+ """
4
+
5
+ from .medical_generator import MedicalTextGenerator
6
+
7
+ __all__ = ['MedicalTextGenerator']
src/generation/medical_generator.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Basic Medical Text Generator for Synthex MVP
3
+ Uses Hugging Face models and Gemini API
4
+ """
5
+
6
+ import google.generativeai as genai
7
+ from transformers import pipeline
8
+ import random
9
+ import time
10
+ import json
11
+ from typing import List, Dict, Optional
12
+ import logging
13
+ from datetime import datetime
14
+ import os
15
+ import sys
16
+
17
+ # Setup logging with better formatting
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
21
+ handlers=[
22
+ logging.StreamHandler(sys.stdout)
23
+ ]
24
+ )
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # Get Gemini API key from environment variable
28
+ DEFAULT_GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', '')
29
+
30
class MedicalTextGenerator:
    """Generate synthetic, fully fictional medical records.

    Backends are tried per record in order of preference:
      1. Google Gemini API (when a key is configured and ``use_gemini=True``)
      2. Local Hugging Face text-generation pipeline (distilgpt2)
      3. Template filling with random fictional values (always available)

    The returned record dict reports which backend actually produced the text.
    """

    def __init__(self, gemini_api_key: Optional[str] = None):
        """Initialize the medical text generator.

        Args:
            gemini_api_key: Optional Gemini API key; falls back to the
                GEMINI_API_KEY environment variable when omitted.
        """
        self._log = logging.getLogger(__name__)

        # Explicit argument wins; otherwise read the environment (same
        # resolution as the module-level DEFAULT_GEMINI_API_KEY constant).
        self.gemini_api_key = gemini_api_key or os.getenv('GEMINI_API_KEY', '')
        if not self.gemini_api_key:
            self._log.warning("No Gemini API key provided. Using Hugging Face model only.")

        self.hf_model = None       # kept for backward compatibility with older callers
        self.gemini_model = None

        # Initialize generation backends (sets self.hf_generator / self.gemini_model).
        self._setup_models()

        # Record templates keyed by record type; the keys double as the
        # public list of supported record types.
        self.templates = {
            "clinical_note": self._get_clinical_note_template(),
            "discharge_summary": self._get_discharge_summary_template(),
            "lab_report": self._get_lab_report_template(),
            "prescription": self._get_prescription_template(),
            "patient_intake": self._get_patient_intake_template(),
        }

    def _setup_models(self):
        """Set up the Hugging Face pipeline and the Gemini client.

        Each backend that cannot be initialized is logged and left as None;
        generation then degrades gracefully to the next backend.
        """
        try:
            self._log.info("Loading Hugging Face medical model...")
            # Small model on CPU (device=-1) keeps this usable on free-tier hosts.
            self.hf_generator = pipeline(
                "text-generation",
                model="distilgpt2",
                max_length=512,
                do_sample=True,
                temperature=0.7,
                device=-1,        # force CPU usage to avoid CUDA issues
                truncation=True,  # silence tokenizer length warnings
            )
            self._log.info("Hugging Face model loaded successfully")
        except Exception as e:
            self._log.error(f"Failed to load Hugging Face model: {str(e)}")
            self.hf_generator = None
            self._log.info("Falling back to template-based generation")

        try:
            if self.gemini_api_key:
                genai.configure(api_key=self.gemini_api_key)
                # Log the models this key can reach, to aid debugging.
                for m in genai.list_models():
                    self._log.info(f"Available model: {m.name}")
                self.gemini_model = genai.GenerativeModel('gemini-pro')
                self._log.info("Gemini model loaded successfully")
        except Exception as e:
            self._log.error(f"Failed to load Gemini model: {str(e)}")
            self.gemini_model = None
            self._log.info("Gemini API will not be available")

    def generate_record(self, record_type: str, use_gemini: bool = False) -> Dict:
        """Generate one synthetic medical record.

        Args:
            record_type: One of the keys of ``self.templates``.
            use_gemini: Prefer the Gemini backend when it is available.

        Returns:
            Dict with ``id``, ``type``, ``text``, ``timestamp`` and ``source``.

        Raises:
            ValueError: Unknown ``record_type``.
            RuntimeError: Every generation backend failed.
        """
        if record_type not in self.templates:
            raise ValueError(f"Unknown record type: {record_type}")

        template = self.templates[record_type]
        content = None
        # BUGFIX: the original labeled "source" by which backend was *requested*
        # (e.g. "Hugging Face" even when the template fallback produced the
        # text). Track the backend that actually succeeded instead.
        source = None

        if use_gemini and self.gemini_model:
            try:
                content = self._generate_with_gemini(template)
                source = "Gemini"
                self._log.info("Successfully generated record using Gemini")
            except Exception as e:
                self._log.error(f"Gemini generation failed: {str(e)}")
                content = None

        if content is None and self.hf_generator:
            try:
                content = self._generate_with_huggingface(template)
                source = "Hugging Face"
                self._log.info("Successfully generated record using Hugging Face")
            except Exception as e:
                self._log.error(f"Hugging Face generation failed: {str(e)}")
                content = None

        if content is None:
            try:
                content = self._generate_with_template(template)
                source = "Template"
                self._log.info("Successfully generated record using template")
            except Exception as e:
                self._log.error(f"Template generation failed: {str(e)}")
                raise RuntimeError("All generation methods failed") from e

        return {
            "id": self._generate_id(),
            "type": record_type,
            "text": content,
            "timestamp": datetime.now().isoformat(),
            "source": source,
        }

    def _generate_with_gemini(self, template: str) -> str:
        """Generate record text with the Gemini API, guided by *template*.

        Any client/API error propagates to the caller, which falls back to
        the next backend.
        """
        prompt = f"""
Generate a realistic but completely fictional medical record using this template:

{template}

Requirements:
- Use fictional patient names and details
- Include medically accurate terminology
- Make it realistic but not based on any real patient
- Include specific medical details and measurements
- Follow standard medical documentation format
"""
        response = self.gemini_model.generate_content(prompt)
        return response.text

    def _generate_with_huggingface(self, template: str) -> str:
        """Generate text with the local HF pipeline, seeded by a filled template.

        Errors propagate to the caller so generate_record() can fall back to
        (and correctly label) the template backend.
        """
        seed_text = self._fill(template, self._fake_values())
        # Use the start of the filled template as the generation prompt.
        prompt = seed_text[:100] + "..."
        generated = self.hf_generator(
            prompt,
            max_length=400,
            num_return_sequences=1,
            pad_token_id=50256,  # GPT-2 EOS token reused as padding
            truncation=True,
        )
        return generated[0]['generated_text']

    def _generate_with_template(self, template: str) -> str:
        """Fallback: fill the template with random fictional values."""
        try:
            return self._fill(template, self._fake_values())
        except Exception as e:
            self._log.error(f"Template generation failed: {str(e)}")
            raise

    def _fake_values(self) -> Dict[str, object]:
        """Return a fresh mapping of template slot names to random fictional values.

        Shared by the Hugging Face and template backends (the original
        duplicated this dict in both methods).
        """
        patient_name = random.choice([
            "John Smith", "Jane Doe", "Robert Johnson", "Mary Wilson", "Emily Clark",
            "Michael Brown", "Linda Lee", "David Kim", "Sarah Patel", "James Chen",
        ])
        return {
            "patient_name": patient_name,
            "age": random.randint(18, 90),
            "gender": random.choice(["Male", "Female", "Other"]),
            "chief_complaint": random.choice([
                "chest pain", "shortness of breath", "abdominal pain", "headache",
                "fever", "fatigue", "dizziness", "back pain", "cough", "palpitations",
            ]),
            "blood_pressure": f"{random.randint(110, 160)}/{random.randint(60, 100)}",
            "heart_rate": random.randint(55, 120),
            "temperature": round(random.uniform(97.0, 104.0), 1),
            "diagnosis": random.choice([
                "Hypertension", "Type 2 Diabetes", "Pneumonia", "Migraine",
                "Gastroenteritis", "Anxiety", "Asthma", "COVID-19", "Anemia", "Hyperlipidemia",
            ]),
            "date": time.strftime("%Y-%m-%d"),
            "address": random.choice([
                "123 Main St", "456 Oak Ave", "789 Pine Rd", "101 Maple Dr", "202 Elm St",
            ]),
            "phone": f"({random.randint(200, 999)})-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            # BUGFIX: the original choice list was empty, so random.choice([])
            # raised IndexError on every template fill. Use fictional
            # RFC 2606 example.com addresses.
            "email": random.choice([
                "john.smith@example.com", "jane.doe@example.com",
                "patient01@example.com", "contact@example.com",
            ]),
            # BUGFIX: the templates previously embedded raw expressions such as
            # {random.randint(4,11)} that the .replace()-based filler could
            # never substitute, leaving literal "{random...}" in the output.
            # They are now real named slots, filled per record below.
            "wbc": random.randint(4, 11),
            "rbc": round(random.uniform(4.0, 5.5), 2),
            "hemoglobin": round(random.uniform(12.0, 16.0), 1),
            "platelets": random.randint(150, 450),
            "glucose": random.randint(70, 140),
            "bun": random.randint(7, 20),
            "creatinine": round(random.uniform(0.6, 1.2), 2),
            "medication": random.choice([
                "Amoxicillin", "Lisinopril", "Metformin", "Atorvastatin", "Albuterol",
            ]),
            "dose": random.choice(["1 tablet", "2 tablets", "1 capsule"]),
            "frequency": random.choice(["daily", "twice daily", "three times daily"]),
            "rx_quantity": random.randint(30, 90),
            "refills": random.randint(0, 3),
            "relationship": random.choice(["Spouse", "Parent", "Sibling"]),
            "emergency_contact_name": f"{random.choice(['Spouse', 'Parent', 'Sibling'])} {patient_name.split()[0]}",
            "insurance_provider": random.choice([
                "Blue Cross", "Aetna", "United Healthcare", "Cigna",
            ]),
            "policy_number": random.randint(100000000, 999999999),
            "group_number": random.randint(10000, 99999),
            "current_medications": random.choice(["None", "Aspirin", "Metformin", "Lisinopril"]),
            "allergies": random.choice(["None", "Penicillin", "Sulfa", "Peanuts"]),
        }

    @staticmethod
    def _fill(template: str, values: Dict[str, object]) -> str:
        """Substitute every ``{slot}`` in *template* with its value."""
        filled = template
        for key, value in values.items():
            filled = filled.replace(f"{{{key}}}", str(value))
        return filled

    def batch_generate(self, record_type: str, count: int = 10, use_gemini: bool = False) -> List[Dict]:
        """Generate *count* records, skipping (but logging) any failures.

        Returns:
            List of record dicts; may be shorter than *count* on errors.
        """
        records = []
        for i in range(count):
            try:
                records.append(self.generate_record(record_type, use_gemini))
                # Respect Gemini free-tier API limits.
                if use_gemini:
                    time.sleep(1)
                self._log.info(f"Generated record {i+1}/{count}")
            except Exception as e:
                self._log.error(f"Failed to generate record {i+1}: {str(e)}")
                continue
        return records

    def _generate_id(self) -> str:
        """Return a unique-ish record ID: SYN-<unix time>-<4 random digits>."""
        return f"SYN-{int(time.time())}-{random.randint(1000, 9999)}"

    def _get_clinical_note_template(self) -> str:
        """Template for a short outpatient clinical note."""
        return """
CLINICAL NOTE

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Chief Complaint:
{chief_complaint}

Vital Signs:
- Blood Pressure: {blood_pressure} mmHg
- Heart Rate: {heart_rate} bpm
- Temperature: {temperature}°F

Assessment:
{diagnosis}

Plan:
1. Follow-up in 2 weeks
2. Continue current medications
3. Monitor symptoms

Provider: Dr. Smith
"""

    def _get_discharge_summary_template(self) -> str:
        """Template for an inpatient discharge summary."""
        return """
DISCHARGE SUMMARY

Patient: {patient_name}
Age: {age}
Gender: {gender}
Admission Date: {date}
Discharge Date: {date}

Reason for Admission:
{chief_complaint}

Hospital Course:
Patient was admitted for {chief_complaint}. During hospitalization, patient was treated with appropriate medications and showed improvement.

Final Diagnosis:
{diagnosis}

Discharge Medications:
1. Medication A - 1 tablet daily
2. Medication B - 2 tablets twice daily

Follow-up:
- Primary Care Provider: Dr. Johnson
- Appointment: 2 weeks from discharge

Discharge Instructions:
1. Take medications as prescribed
2. Follow up with primary care provider
3. Call if symptoms worsen

Discharging Provider: Dr. Smith
"""

    def _get_lab_report_template(self) -> str:
        """Template for a basic CBC + metabolic panel lab report."""
        return """
LABORATORY REPORT

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Test Results:

Complete Blood Count (CBC):
- White Blood Cells: {wbc} K/uL
- Red Blood Cells: {rbc} M/uL
- Hemoglobin: {hemoglobin} g/dL
- Platelets: {platelets} K/uL

Basic Metabolic Panel:
- Glucose: {glucose} mg/dL
- BUN: {bun} mg/dL
- Creatinine: {creatinine} mg/dL

Interpretation:
Results are within normal limits.

Lab Director: Dr. Wilson
"""

    def _get_prescription_template(self) -> str:
        """Template for an outpatient prescription."""
        return """
PRESCRIPTION

Patient: {patient_name}
Age: {age}
Gender: {gender}
Date: {date}

Prescription:
{diagnosis} - {medication}

Dosage: {dose} {frequency}

Quantity: {rx_quantity} tablets

Refills: {refills}

Prescribing Provider: Dr. Smith
DEA Number: AB1234567
"""

    def _get_patient_intake_template(self) -> str:
        """Template for a new-patient intake form."""
        return """
PATIENT INTAKE FORM

Personal Information:
Name: {patient_name}
Age: {age}
Gender: {gender}
Address: {address}
Phone: {phone}
Email: {email}

Emergency Contact:
Name: {emergency_contact_name}
Phone: {phone}
Relationship: {relationship}

Insurance Information:
Provider: {insurance_provider}
Policy Number: {policy_number}
Group Number: {group_number}

Medical History:
Chief Complaint: {chief_complaint}
Current Medications: {current_medications}
Allergies: {allergies}

Vital Signs:
Blood Pressure: {blood_pressure} mmHg
Heart Rate: {heart_rate} bpm
Temperature: {temperature}°F

Intake Date: {date}
Intake Provider: Dr. Smith
"""
429
+
430
def main():
    """Test the generator"""
    demo = MedicalTextGenerator()

    # Exercise every supported record type once.
    for kind in demo.templates:
        print(f"\nGenerating {kind}...")
        produced = demo.generate_record(kind)
        print(json.dumps(produced, indent=2))


if __name__ == "__main__":
    main()
src/streamlit_app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# Interactive controls for the spiral shape.
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

# Parametric spiral: radius grows linearly with the angle parameter.
indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

# Color encodes position along the spiral; point size is random jitter.
spiral_chart = (
    alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    )
)
st.altair_chart(spiral_chart)
src/web/index.html ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Synthex Medical Text Generator</title>
7
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
8
+ <style>
9
+ body {
10
+ padding: 20px;
11
+ background-color: #f8f9fa;
12
+ }
13
+ .container {
14
+ max-width: 800px;
15
+ background-color: white;
16
+ padding: 30px;
17
+ border-radius: 10px;
18
+ box-shadow: 0 0 10px rgba(0,0,0,0.1);
19
+ }
20
+ .result-box {
21
+ background-color: #f8f9fa;
22
+ padding: 15px;
23
+ border-radius: 5px;
24
+ margin-top: 20px;
25
+ white-space: pre-wrap;
26
+ }
27
+ .loading {
28
+ display: none;
29
+ text-align: center;
30
+ margin: 20px 0;
31
+ }
32
+ </style>
33
+ </head>
34
+ <body>
35
+ <div class="container">
36
+ <h1 class="mb-4">Synthex Medical Text Generator</h1>
37
+
38
+ <div class="mb-3">
39
+ <label for="recordType" class="form-label">Record Type</label>
40
+ <select class="form-select" id="recordType">
41
+ <option value="clinical_note">Clinical Note</option>
42
+ <option value="discharge_summary">Discharge Summary</option>
43
+ <option value="lab_report">Lab Report</option>
44
+ <option value="prescription">Prescription</option>
45
+ <option value="patient_intake">Patient Intake</option>
46
+ </select>
47
+ </div>
48
+
49
+ <div class="mb-3">
50
+ <label for="quantity" class="form-label">Quantity</label>
51
+ <input type="number" class="form-control" id="quantity" value="1" min="1" max="10">
52
+ </div>
53
+
54
+ <div class="mb-3 form-check">
55
+ <input type="checkbox" class="form-check-input" id="useGemini">
56
+ <label class="form-check-label" for="useGemini">Use Gemini (if available)</label>
57
+ </div>
58
+
59
+ <div class="mb-3 form-check">
60
+ <input type="checkbox" class="form-check-input" id="includeMetadata" checked>
61
+ <label class="form-check-label" for="includeMetadata">Include Metadata</label>
62
+ </div>
63
+
64
+ <button class="btn btn-primary" onclick="generateRecords()">Generate Records</button>
65
+
66
+ <div class="loading" id="loading">
67
+ <div class="spinner-border text-primary" role="status">
68
+ <span class="visually-hidden">Loading...</span>
69
+ </div>
70
+ <p class="mt-2">Generating records...</p>
71
+ </div>
72
+
73
+ <div id="result" class="result-box"></div>
74
+ </div>
75
+
76
+ <script>
77
+ async function generateRecords() {
78
+ const recordType = document.getElementById('recordType').value;
79
+ const quantity = parseInt(document.getElementById('quantity').value);
80
+ const useGemini = document.getElementById('useGemini').checked;
81
+ const includeMetadata = document.getElementById('includeMetadata').checked;
82
+
83
+ // Show loading
84
+ document.getElementById('loading').style.display = 'block';
85
+ document.getElementById('result').innerHTML = '';
86
+
87
+ try {
88
+ const response = await fetch('/generate', {
89
+ method: 'POST',
90
+ headers: {
91
+ 'Content-Type': 'application/json',
92
+ 'Accept': 'application/json'
93
+ },
94
+ body: JSON.stringify({
95
+ record_type: recordType,
96
+ quantity: quantity,
97
+ use_gemini: useGemini,
98
+ include_metadata: includeMetadata
99
+ })
100
+ });
101
+
102
+ const data = await response.json();
103
+
104
+ // Format and display results
105
+ let resultHtml = '<h3>Generated Records:</h3>';
106
+ data.records.forEach(record => {
107
+ resultHtml += `
108
+ <div class="mb-4">
109
+ <strong>ID:</strong> ${record.id}<br>
110
+ <strong>Type:</strong> ${record.type}<br>
111
+ <strong>Source:</strong> ${record.source}<br>
112
+ <strong>Timestamp:</strong> ${record.timestamp}<br>
113
+ <strong>Text:</strong><br>
114
+ <pre>${record.text}</pre>
115
+ </div>
116
+ `;
117
+ });
118
+ document.getElementById('result').innerHTML = resultHtml;
119
+ } catch (error) {
120
+ document.getElementById('result').innerHTML = `<div class="alert alert-danger">Error: ${error.message}</div>`;
121
+ } finally {
122
+ document.getElementById('loading').style.display = 'none';
123
+ }
124
+ }
125
+ </script>
126
+ </body>
127
+ </html>
streamlit_app.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthex Medical Text Generator - MVP Streamlit App
3
+ Deploy this on Hugging Face Spaces for free hosting
4
+ """
5
+
6
+ import streamlit as st
7
+ import json
8
+ import time
9
+ from datetime import datetime
10
+ import pandas as pd
11
+ import os
12
+ import sys
13
+ import logging
14
+
15
+ # Setup logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Add src directory to Python path
20
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
21
+
22
+ # Import the medical generator
23
+ from src.generation.medical_generator import MedicalTextGenerator, DEFAULT_GEMINI_API_KEY
24
+
25
+ # Page config
26
+ st.set_page_config(
27
+ page_title="Synthex Medical Text Generator",
28
+ page_icon="🏥",
29
+ layout="wide",
30
+ initial_sidebar_state="expanded"
31
+ )
32
+
33
+ # Custom CSS
34
+ st.markdown("""
35
+ <style>
36
+ .main-header {
37
+ font-size: 3rem;
38
+ font-weight: bold;
39
+ color: #1f77b4;
40
+ text-align: center;
41
+ margin-bottom: 2rem;
42
+ }
43
+ .sub-header {
44
+ font-size: 1.5rem;
45
+ color: #666;
46
+ text-align: center;
47
+ margin-bottom: 3rem;
48
+ }
49
+ .record-container {
50
+ background-color: #f8f9fa;
51
+ padding: 1rem;
52
+ border-radius: 0.5rem;
53
+ border-left: 4px solid #1f77b4;
54
+ margin: 1rem 0;
55
+ }
56
+ .stats-container {
57
+ background-color: #e8f4fd;
58
+ padding: 1rem;
59
+ border-radius: 0.5rem;
60
+ margin: 1rem 0;
61
+ }
62
+ </style>
63
+ """, unsafe_allow_html=True)
64
+
65
+ # Initialize session state
66
+ if 'generated_records' not in st.session_state:
67
+ st.session_state.generated_records = []
68
+ if 'total_generated' not in st.session_state:
69
+ st.session_state.total_generated = 0
70
+ if 'generator' not in st.session_state:
71
+ st.session_state.generator = None
72
+
73
+ # Header
74
+ st.markdown('<div class="main-header">🏥 Synthex Medical Text Generator</div>', unsafe_allow_html=True)
75
+ st.markdown('<div class="sub-header">Generate synthetic medical records for AI training and testing</div>', unsafe_allow_html=True)
76
+
77
+ # Sidebar
78
+ with st.sidebar:
79
+ st.header("⚙️ Configuration")
80
+
81
+ # API Key input (pre-filled with environment variable if available)
82
+ gemini_api_key = st.text_input(
83
+ "Gemini API Key",
84
+ value=os.getenv('GEMINI_API_KEY', ''),
85
+ type="password",
86
+ help="Enter your Google Gemini API key for better generation quality"
87
+ )
88
+
89
+ # Record type selection
90
+ record_type = st.selectbox(
91
+ "Select Record Type",
92
+ ["clinical_note", "discharge_summary", "lab_report", "prescription", "patient_intake"],
93
+ format_func=lambda x: x.replace("_", " ").title()
94
+ )
95
+
96
+ # Quantity
97
+ quantity = st.slider("Number of Records", 1, 20, 5)
98
+
99
+ # Generation method
100
+ use_gemini = st.checkbox(
101
+ "Use Gemini API",
102
+ value=bool(gemini_api_key), # Only default to True if API key is available
103
+ help="Uses Google Gemini API for better quality generation"
104
+ )
105
+
106
+ # Advanced options
107
+ with st.expander("Advanced Options"):
108
+ include_metadata = st.checkbox("Include Metadata", value=True)
109
+ export_format = st.selectbox("Export Format", ["JSON", "CSV", "TXT"])
110
+
111
+ # Main content
112
+ col1, col2 = st.columns([2, 1])
113
+
114
+ with col1:
115
+ st.header("📝 Generate Medical Records")
116
+
117
+ # Generation button
118
+ if st.button("🚀 Generate Records", type="primary", use_container_width=True):
119
+
120
+ # Initialize generator if not already done
121
+ if st.session_state.generator is None:
122
+ try:
123
+ with st.spinner("Initializing medical text generator..."):
124
+ st.session_state.generator = MedicalTextGenerator(gemini_api_key=gemini_api_key)
125
+ except Exception as e:
126
+ st.error(f"Error initializing generator: {str(e)}")
127
+ st.stop()
128
+
129
+ # Generate records
130
+ progress_bar = st.progress(0)
131
+ status_text = st.empty()
132
+
133
+ generated_records = []
134
+
135
+ for i in range(quantity):
136
+ status_text.text(f"Generating record {i+1} of {quantity}...")
137
+ progress_bar.progress((i + 1) / quantity)
138
+
139
+ try:
140
+ record = st.session_state.generator.generate_record(record_type, use_gemini=use_gemini)
141
+ generated_records.append(record)
142
+
143
+ # Rate limiting
144
+ if use_gemini:
145
+ time.sleep(1)
146
+
147
+ except Exception as e:
148
+ logger.error(f"Failed to generate record {i+1}: {str(e)}")
149
+ st.error(f"Failed to generate record {i+1}: {str(e)}")
150
+ continue
151
+
152
+ # Update session state
153
+ if generated_records:
154
+ st.session_state.generated_records.extend(generated_records)
155
+ st.session_state.total_generated += len(generated_records)
156
+
157
+ status_text.text("✅ Generation complete!")
158
+ progress_bar.progress(1.0)
159
+
160
+ st.success(f"Successfully generated {len(generated_records)} medical records!")
161
+
162
+ # Display generated records
163
+ if st.session_state.generated_records:
164
+ st.header("📋 Generated Records")
165
+
166
+ # Filters
167
+ col_filter1, col_filter2 = st.columns(2)
168
+ with col_filter1:
169
+ filter_type = st.selectbox(
170
+ "Filter by Type",
171
+ ["All"] + list(set([r['type'] for r in st.session_state.generated_records]))
172
+ )
173
+ with col_filter2:
174
+ records_per_page = st.selectbox("Records per page", [5, 10, 20, 50])
175
+
176
+ # Filter records
177
+ filtered_records = st.session_state.generated_records
178
+ if filter_type != "All":
179
+ filtered_records = [r for r in filtered_records if r['type'] == filter_type]
180
+
181
+ # Pagination
182
+ total_records = len(filtered_records)
183
+ total_pages = (total_records - 1) // records_per_page + 1
184
+
185
+ if total_pages > 1:
186
+ page = st.selectbox("Page", range(1, total_pages + 1))
187
+ start_idx = (page - 1) * records_per_page
188
+ end_idx = start_idx + records_per_page
189
+ page_records = filtered_records[start_idx:end_idx]
190
+ else:
191
+ page_records = filtered_records
192
+
193
+ # Display records
194
+ for i, record in enumerate(page_records):
195
+ with st.expander(f"Record {record['id']} - {record['type'].replace('_', ' ').title()}"):
196
+ if include_metadata:
197
+ col_meta1, col_meta2, col_meta3 = st.columns(3)
198
+ with col_meta1:
199
+ st.metric("Type", record['type'].replace('_', ' ').title())
200
+ with col_meta2:
201
+ st.metric("Generated", record['timestamp'])
202
+ with col_meta3:
203
+ st.metric("Source", record['source'])
204
+
205
+ st.markdown('<div class="record-container">', unsafe_allow_html=True)
206
+ st.text_area("Content", record['text'], height=200, key=f"record_{i}")
207
+ st.markdown('</div>', unsafe_allow_html=True)
208
+
209
+ with col2:
210
+ st.header("📊 Statistics")
211
+
212
+ # Stats container
213
+ st.markdown('<div class="stats-container">', unsafe_allow_html=True)
214
+
215
+ # Total records
216
+ st.metric("Total Records Generated", st.session_state.total_generated)
217
+
218
+ # Record type distribution
219
+ if st.session_state.generated_records:
220
+ type_counts = pd.Series([r['type'] for r in st.session_state.generated_records]).value_counts()
221
+ st.subheader("Record Type Distribution")
222
+ st.bar_chart(type_counts)
223
+
224
+ # Export options
225
+ st.subheader("Export Data")
226
+ if st.session_state.generated_records:
227
+ if export_format == "JSON":
228
+ json_str = json.dumps(st.session_state.generated_records, indent=2)
229
+ st.download_button(
230
+ "Download JSON",
231
+ json_str,
232
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
233
+ mime="application/json"
234
+ )
235
+ elif export_format == "CSV":
236
+ df = pd.DataFrame(st.session_state.generated_records)
237
+ csv = df.to_csv(index=False)
238
+ st.download_button(
239
+ "Download CSV",
240
+ csv,
241
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
242
+ mime="text/csv"
243
+ )
244
+ elif export_format == "TXT":
245
+ txt = "\n\n".join([f"Record {r['id']} ({r['type']}):\n{r['text']}" for r in st.session_state.generated_records])
246
+ st.download_button(
247
+ "Download TXT",
248
+ txt,
249
+ file_name=f"medical_records_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
250
+ mime="text/plain"
251
+ )
252
+
253
+ st.markdown('</div>', unsafe_allow_html=True)
templates/index.html ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Synthex - Medical Data Collection and Analysis</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ margin: 0;
11
+ padding: 20px;
12
+ background-color: #f4f4f4;
13
+ }
14
+ .container {
15
+ max-width: 800px;
16
+ margin: 0 auto;
17
+ background: white;
18
+ padding: 20px;
19
+ border-radius: 5px;
20
+ box-shadow: 0 0 10px rgba(0,0,0,0.1);
21
+ }
22
+ h1 {
23
+ color: #333;
24
+ }
25
+ .button {
26
+ display: inline-block;
27
+ padding: 10px 20px;
28
+ margin: 10px 0;
29
+ background-color: #007bff;
30
+ color: white;
31
+ text-decoration: none;
32
+ border-radius: 5px;
33
+ }
34
+ .button:hover {
35
+ background-color: #0056b3;
36
+ }
37
+ .flash {
38
+ padding: 10px;
39
+ margin: 10px 0;
40
+ border-radius: 5px;
41
+ }
42
+ .flash.success {
43
+ background-color: #d4edda;
44
+ color: #155724;
45
+ }
46
+ .flash.error {
47
+ background-color: #f8d7da;
48
+ color: #721c24;
49
+ }
50
+ </style>
51
+ </head>
52
+ <body>
53
+ <div class="container">
54
+ <h1>Synthex - Medical Data Collection and Analysis</h1>
55
+ {% with messages = get_flashed_messages(with_categories=true) %}
56
+ {% if messages %}
57
+ {% for category, message in messages %}
58
+ <div class="flash {{ category }}">{{ message }}</div>
59
+ {% endfor %}
60
+ {% endif %}
61
+ {% endwith %}
62
+ <a href="{{ url_for('collect_data') }}" class="button">Collect Data</a>
63
+ <a href="{{ url_for('analyze_data') }}" class="button">Analyze Data</a>
64
+ </div>
65
+ </body>
66
+ </html>
test_dataset.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+
6
def test_medical_dataset():
    """Smoke-test loading the medical_questions_pairs dataset from the Hub.

    Loads a 100-row slice, prints one record so the field layout is visible,
    and returns True on success / False on any failure (errors are printed,
    not raised — this is a diagnostic script).
    """
    try:
        # A small split keeps the check fast while still exercising the Hub.
        sample = load_dataset("medical_questions_pairs", split="train[:100]")
        print(f"Successfully loaded {len(sample)} samples from medical_questions_pairs")

        # Show one record so the schema can be eyeballed.
        print("\nSample structure:")
        print(json.dumps(sample[0], indent=2))

        return True
    except Exception as err:
        print(f"Error loading dataset: {str(err)}")
        return False
20
+
21
def verify_data_directory(data_dir="data/raw"):
    """Ensure the raw-data directory exists and report the JSON files in it.

    Creates the directory (including parents) if missing, then lists the
    top-level ``*.json`` files it contains.

    Args:
        data_dir: Directory to check/create. Defaults to ``data/raw`` so
            existing zero-argument callers behave exactly as before.

    Returns:
        list[pathlib.Path]: The JSON files found (empty if none). The
        original returned None; returning the list is backward-compatible
        and makes the function testable.
    """
    data_dir = Path(data_dir)
    if not data_dir.exists():
        print(f"Creating data directory: {data_dir}")
        data_dir.mkdir(parents=True, exist_ok=True)

    # Non-recursive scan: only top-level *.json files count as raw data here.
    json_files = list(data_dir.glob("*.json"))
    if json_files:
        print(f"\nFound {len(json_files)} JSON files in {data_dir}:")
        for file in json_files:
            print(f"- {file.name}")
    else:
        print(f"\nNo JSON files found in {data_dir} directory")
    return json_files
35
+
36
def main():
    """Run the Hub-loading smoke test, then verify the local data layout."""
    print("Testing Hugging Face dataset loading...")
    test_medical_dataset()

    print("\nVerifying data directory structure...")
    verify_data_directory()


if __name__ == "__main__":
    main()
test_pubmed.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ from bs4 import BeautifulSoup
4
+ import logging
5
+
6
# Setup logging
# Configure the root logger once at import time so INFO-level messages
# from this script are emitted (default handler writes to stderr).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-scoped logger, standard convention
9
+
10
def test_pubmed_search():
    """Smoke-test the NCBI E-utilities PubMed search and fetch endpoints.

    Searches PubMed for a fixed query via esearch, then fetches the first
    returned article via efetch (XML) and logs its title and abstract.
    All failures are logged rather than raised — this is a diagnostic
    script, not a library function.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    search_url = f"{base_url}esearch.fcgi"

    # Test query
    query = "clinical notes"

    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": 10,  # Just get 10 results for testing
        "retmode": "json",
        "sort": "relevance"
    }

    logger.info(f"Testing PubMed search with query: {query}")
    logger.info(f"Search URL: {search_url}")
    logger.info(f"Search params: {search_params}")

    try:
        # FIX: always bound network calls — requests.get without a timeout
        # can block the script forever if NCBI stalls.
        response = requests.get(search_url, params=search_params, timeout=30)
        response.raise_for_status()
        search_results = response.json()

        logger.info(f"Response status code: {response.status_code}")
        logger.info(f"Response headers: {dict(response.headers)}")
        logger.info(f"Search results: {json.dumps(search_results, indent=2)}")

        if "esearchresult" in search_results:
            id_list = search_results["esearchresult"]["idlist"]
            logger.info(f"Found {len(id_list)} article IDs")

            # Test fetching one article
            if id_list:
                test_id = id_list[0]
                fetch_url = f"{base_url}efetch.fcgi"
                fetch_params = {
                    "db": "pubmed",
                    "id": test_id,
                    "retmode": "xml"
                }

                logger.info(f"\nTesting article fetch for ID: {test_id}")
                logger.info(f"Fetch URL: {fetch_url}")
                logger.info(f"Fetch params: {fetch_params}")

                response = requests.get(fetch_url, params=fetch_params, timeout=30)
                response.raise_for_status()

                logger.info(f"Fetch response status code: {response.status_code}")
                logger.info(f"Fetch response headers: {dict(response.headers)}")
                logger.info(f"First 500 chars of response: {response.text[:500]}")

                # FIX: efetch returns XML, but the 'lxml' feature selects the
                # HTML parser, which lowercases tag names — so case-sensitive
                # lookups like find('PubmedArticle') always returned None.
                # 'lxml-xml' selects lxml's XML parser, preserving tag case.
                soup = BeautifulSoup(response.text, 'lxml-xml')
                article = soup.find('PubmedArticle')

                if article:
                    logger.info("\nArticle structure:")
                    # Look each element up once instead of twice per line.
                    title = article.find('ArticleTitle')
                    abstract = article.find('Abstract')
                    logger.info(f"Title: {title.get_text() if title else 'Not found'}")
                    logger.info(f"Abstract: {abstract.get_text()[:200] + '...' if abstract else 'Not found'}")
                else:
                    logger.error("No PubmedArticle found in response")

    except Exception as e:
        logger.error(f"Error during test: {str(e)}", exc_info=True)
76
+
77
# Entry point: run the diagnostic only when executed directly, not on import.
if __name__ == "__main__":
    test_pubmed_search()
web_app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, redirect, url_for, flash
2
+ import subprocess
3
+ import os
4
+
5
app = Flask(__name__)
# SECURITY FIX: read the session-signing key from the environment instead of
# hard-coding it. The original literal is kept as the fallback so local
# development still works unchanged; set SECRET_KEY in production.
app.secret_key = os.environ.get("SECRET_KEY", "your_secret_key")  # Required for flashing messages
7
+
8
@app.route('/')
def index():
    """Render the landing page with the collect/analyze action buttons."""
    return render_template('index.html')
11
+
12
@app.route('/collect_data')
def collect_data():
    """Run setup_data.py as a subprocess, flash the outcome, go back home.

    NOTE(review): a state-changing action on a GET route is CSRF-prone;
    consider switching to POST if this app is exposed beyond local use.
    """
    import sys  # local import: only needed here to locate the interpreter
    try:
        # FIX: use the interpreter running this app rather than whatever
        # 'python' resolves to on PATH (may be absent or a different version).
        subprocess.run([sys.executable, 'setup_data.py'], check=True)
        flash('Data collection completed successfully!', 'success')
    except subprocess.CalledProcessError as e:
        flash(f'Error during data collection: {str(e)}', 'error')
    return redirect(url_for('index'))
20
+
21
@app.route('/analyze_data')
def analyze_data():
    """Run analyze_data_quality.py as a subprocess, flash the outcome, go home.

    NOTE(review): a state-changing action on a GET route is CSRF-prone;
    consider switching to POST if this app is exposed beyond local use.
    """
    import sys  # local import: only needed here to locate the interpreter
    try:
        # FIX: use the interpreter running this app rather than whatever
        # 'python' resolves to on PATH (may be absent or a different version).
        subprocess.run([sys.executable, 'analyze_data_quality.py'], check=True)
        flash('Data analysis completed successfully!', 'success')
    except subprocess.CalledProcessError as e:
        flash(f'Error during data analysis: {str(e)}', 'error')
    return redirect(url_for('index'))
29
+
30
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug debugger and reloader —
    # fine for local development, unsafe if this server is publicly reachable.
    app.run(debug=True)