|
import numpy as np |
|
import pandas as pd |
|
import gradio as gr |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.ensemble import RandomForestClassifier |
|
from sklearn.metrics import accuracy_score, classification_report |
|
import joblib |
|
|
|
|
|
def generate_realistic_genetic_data(n_samples=1000, seed=42):
    """Build a synthetic table of genetic/biological markers with a binary label.

    Ten marker columns are drawn independently from fixed distributions. A
    risk score is then computed as a weighted sum of threshold indicators on
    eight of them, and ``disease`` is 1 where that score exceeds 0.5.
    ``oxidative_stress`` and ``immune_response`` are generated but do not
    contribute to the label.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the private random generator. The default (42)
            reproduces the values previously obtained via
            ``np.random.seed(42)``.

    Returns:
        pandas.DataFrame with the ten marker columns followed by ``disease``.
    """
    # A private legacy RandomState yields the same stream as seeding the
    # global generator, without clobbering NumPy's process-wide RNG state.
    rng = np.random.RandomState(seed)

    # NOTE: draw order determines the values; keep the columns in this order.
    genetic_data = {
        'BRCA1_mutation': rng.choice([0, 1], size=n_samples, p=[0.95, 0.05]),
        'P53_variation': rng.normal(0.5, 0.1, n_samples),
        'APOE_allele': rng.choice([2, 3, 4], size=n_samples, p=[0.1, 0.7, 0.2]),
        'DNA_methylation': rng.beta(2, 5, n_samples),
        'telomere_length': rng.normal(6000, 1000, n_samples),
        'CYP2D6_activity': rng.gamma(2, 2, n_samples),
        'inflammatory_markers': rng.exponential(2, n_samples),
        'glucose_metabolism': rng.normal(100, 15, n_samples),
        'oxidative_stress': rng.gamma(3, 1, n_samples),
        'immune_response': rng.normal(0.7, 0.1, n_samples),
    }

    df = pd.DataFrame(genetic_data)

    # Weighted sum of boolean indicators; the weights add up to 1.0.
    # The term order is kept fixed so floating-point rounding near the
    # 0.5 cut-off is reproducible.
    disease_probability = (
        0.3 * genetic_data['BRCA1_mutation'] +
        0.2 * (genetic_data['P53_variation'] > 0.7) +
        0.15 * (genetic_data['APOE_allele'] == 4) +
        0.1 * (genetic_data['DNA_methylation'] > 0.6) +
        0.05 * (genetic_data['telomere_length'] < 5000) +
        0.1 * (genetic_data['CYP2D6_activity'] > 5) +
        0.05 * (genetic_data['inflammatory_markers'] > 3) +
        0.05 * (genetic_data['glucose_metabolism'] > 120)
    )

    df['disease'] = (disease_probability > 0.5).astype(int)

    return df
|
|
|
|
|
def preprocess_data(data):
    """Separate the label from the features and standardize the features.

    Args:
        data: DataFrame holding a ``disease`` column; every other column is
            treated as a feature.

    Returns:
        Tuple ``(X_scaled, y, scaler)``: the scaled feature matrix, the
        ``disease`` column, and the ``StandardScaler`` that was fitted on
        the features of ``data``.
    """
    target = data['disease']
    features = data.drop(columns='disease')

    fitted_scaler = StandardScaler()
    standardized = fitted_scaler.fit_transform(features)

    return standardized, target, fitted_scaler
|
|
|
|
|
def train_and_evaluate_model():
    """Generate synthetic data, train a random forest, and print an evaluation.

    Steps: build 1500 synthetic rows, hold out 20% as a test set, fit the
    scaler on the training rows only, train a ``RandomForestClassifier``,
    then print accuracy, a classification report and feature importances.

    Returns:
        Tuple ``(model, scaler)``: the fitted classifier and the scaler
        fitted on the training portion of the data.
    """
    print("Generating synthetic genetic data...")
    data = generate_realistic_genetic_data(1500)
    print("\nData Sample:")
    print(data.head())
    print("\nData Statistics:")
    print(data.describe())

    # Keep the class ratio equal in both splits. Stratification needs at
    # least two rows per class, so fall back to a plain split otherwise.
    class_counts = data['disease'].value_counts()
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
    train_df, test_df = train_test_split(
        data,
        test_size=0.2,
        random_state=42,
        stratify=data['disease'] if can_stratify else None,
    )

    # Split BEFORE scaling: the scaler must only see training rows, otherwise
    # test-set statistics leak into preprocessing.
    X_train, y_train, scaler = preprocess_data(train_df)
    X_test = scaler.transform(test_df.drop('disease', axis=1))
    y_test = test_df['disease']

    print("\nTraining Random Forest model...")
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("\nModel Evaluation:")
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    # zero_division=0 reports 0 instead of warning when a class is never
    # predicted.
    print(classification_report(y_test, y_pred, zero_division=0))

    feature_importance = pd.DataFrame({
        'feature': data.drop('disease', axis=1).columns,
        'importance': model.feature_importances_
    })
    print("\nFeature Importance:")
    print(feature_importance.sort_values('importance', ascending=False))

    return model, scaler
|
|
|
|
|
def predict_disease(
    brca1_mutation, p53_variation, apoe_allele, dna_methylation,
    telomere_length, cyp2d6_activity, inflammatory_markers,
    glucose_metabolism, oxidative_stress, immune_response
):
    """Return class probabilities for one set of marker values.

    Uses the module-level ``scaler`` and ``model`` objects. The arguments
    are passed to the scaler positionally, in the order listed in the
    signature.

    Args:
        brca1_mutation, p53_variation, apoe_allele, dna_methylation,
        telomere_length, cyp2d6_activity, inflammatory_markers,
        glucose_metabolism, oxidative_stress, immune_response:
            Numeric marker values.

    Returns:
        dict with float probabilities under the keys ``"No Disease"`` and
        ``"Disease"``.

    Raises:
        ValueError: If any value is missing (``None``), not numeric, or not
            finite.
    """
    inputs = {
        'brca1_mutation': brca1_mutation,
        'p53_variation': p53_variation,
        'apoe_allele': apoe_allele,
        'dna_methylation': dna_methylation,
        'telomere_length': telomere_length,
        'cyp2d6_activity': cyp2d6_activity,
        'inflammatory_markers': inflammatory_markers,
        'glucose_metabolism': glucose_metabolism,
        'oxidative_stress': oxidative_stress,
        'immune_response': immune_response,
    }

    # Validate before touching the scaler/model so bad input fails with a
    # clear message instead of an obscure error from deeper in the stack.
    missing = [name for name, value in inputs.items() if value is None]
    if missing:
        raise ValueError(f"Missing value(s) for: {', '.join(missing)}")
    try:
        row = np.array(list(inputs.values()), dtype=float).reshape(1, -1)
    except (TypeError, ValueError) as err:
        raise ValueError("All inputs must be numeric") from err
    if not np.isfinite(row).all():
        raise ValueError("All inputs must be finite numbers")

    # If the scaler was fitted on a DataFrame, hand it the same column
    # labels to avoid a feature-name mismatch warning on every call.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    features = row if fitted_names is None else pd.DataFrame(row, columns=fitted_names)
    scaled_data = scaler.transform(features)

    prediction = model.predict_proba(scaled_data)[0]

    # Map probabilities through model.classes_ rather than assuming two
    # columns in a fixed order; a class absent from training gets 0.0.
    by_class = {
        int(label): float(prob)
        for label, prob in zip(model.classes_, prediction)
    }
    return {
        "No Disease": by_class.get(0, 0.0),
        "Disease": by_class.get(1, 0.0)
    }
|
|
|
|
|
# Train once at start-up. predict_disease looks up `model` and `scaler` as
# module-level names, so they must be bound before the interface is used.
print("Training model and preparing interface...")
model, scaler = train_and_evaluate_model()

# (label, default value) for each numeric input, in the same order as the
# parameters of predict_disease.
_INPUT_FIELDS = [
    ("BRCA1 Mutation (0 or 1)", 0),
    ("P53 Variation (typically 0.3-0.7)", 0.5),
    ("APOE Allele (2, 3, or 4)", 3),
    ("DNA Methylation (0-1)", 0.4),
    ("Telomere Length (typically 4000-8000)", 6000),
    ("CYP2D6 Activity (typically 0-10)", 4),
    ("Inflammatory Markers (typically 0-10)", 2),
    ("Glucose Metabolism (typically 70-130)", 100),
    ("Oxidative Stress (typically 0-10)", 3),
    ("Immune Response (typically 0.5-0.9)", 0.7),
]

# Sample input rows handed to the interface as `examples`; one value per
# input field, in field order.
_EXAMPLE_ROWS = [
    [1, 0.8, 4, 0.7, 4800, 6, 4, 125, 5, 0.6],
    [0, 0.4, 3, 0.3, 6500, 3, 1, 95, 2, 0.8],
    [0, 0.6, 3, 0.5, 5500, 4, 2, 110, 3, 0.7],
]

_DESCRIPTION = (
    "This system predicts genetic disease risk based on various genetic markers and biological indicators.\n"
    "Please input values within the suggested ranges for accurate predictions."
)

iface = gr.Interface(
    fn=predict_disease,
    inputs=[gr.Number(label=label, value=default) for label, default in _INPUT_FIELDS],
    outputs=gr.Label(label="Disease Prediction"),
    title="Genetic Disease Prediction System",
    description=_DESCRIPTION,
    examples=_EXAMPLE_ROWS,
)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)