Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Path of the input CSV that the script loads with pandas further down.
file_path = '/mnt/data/heart.csv'
|
3 |
+
|
4 |
+
# Import necessary libraries
|
5 |
+
import pandas as pd
|
6 |
+
import seaborn as sns
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
from sklearn.model_selection import train_test_split
|
9 |
+
from sklearn.metrics import confusion_matrix, roc_curve, auc
|
10 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
11 |
+
import xgboost as xgb
|
12 |
+
import joblib
|
13 |
+
|
14 |
+
# Step 1: Data Cleaning and Encoding
# Load the raw table from the configured CSV path.
data = pd.read_csv(file_path)

# Handle missing values by filling each numeric column with its own median.
# numeric_only=True restricts the median to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as the frame contains an object
# column (which the encoding loop below expects may exist).
# NOTE(review): missing values in object columns are left untouched here --
# confirm whether they need separate imputation before label encoding.
data = data.fillna(data.median(numeric_only=True))

# Encode categorical variables: every object-dtype column is replaced by
# integer codes, and the fitted encoder is kept per column so the mapping
# can be looked up or inverted later.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le
|
27 |
+
|
28 |
+
# Step 2: Plotting the Dependency Matrix
# Compute the pairwise correlation of all columns in the encoded frame and
# draw it as an annotated heatmap on its own 12x8 figure.
plt.figure(figsize=(12, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
|
34 |
+
|
35 |
+
# Step 3: Supervised Learning Model for Prediction using XGBoost
# Define features and target
X = data.drop('target', axis=1)  # Assuming 'target' is the target variable
y = data['target']

# Split the data: 80% train / 20% test, seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the data. The scaler is fitted on the training split only and
# then applied to the test split, so test statistics never influence it.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model with the library's default hyper-parameters.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Make predictions: hard labels for the confusion matrix, and the second
# probability column as the score used for the ROC curve.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]
|
55 |
+
|
56 |
+
# Step 4: Evaluation Using Confusion Matrix and Plotting ROC Curve
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Start a fresh figure, as the ROC section below does. On a non-interactive
# backend plt.show() leaves the previous figure current, so without this the
# heatmap would be drawn over whatever plot was shown last.
plt.figure()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
# Dashed diagonal: reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
|
78 |
+
|
79 |
+
# Save the model using joblib
# NOTE(review): only the classifier is written out; the fitted scaler and
# label encoders are not persisted -- confirm whether inference code needs them.
model_filename = '/mnt/data/xgboost_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")
|