Roberta2024 committed on
Commit
79de571
·
verified ·
1 Parent(s): 6fb6c47

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Train and evaluate an XGBoost classifier on a tabular CSV dataset.
#
# Pipeline: load the CSV -> fill missing values -> label-encode categorical
# columns -> plot the correlation matrix -> train XGBoost on standardized
# features -> plot the confusion matrix and ROC curve -> persist the model.

# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
import joblib

# Input / output locations and the name of the label column
file_path = '/mnt/data/heart.csv'
model_filename = '/mnt/data/xgboost_model.joblib'
preprocessing_filename = '/mnt/data/xgboost_preprocessing.joblib'
target_column = 'target'

# Step 1: Data Cleaning and Encoding
# Load data
data = pd.read_csv(file_path)

# Fail early with a clear message instead of a bare KeyError further down.
if target_column not in data.columns:
    raise KeyError(
        f"Expected a '{target_column}' column in {file_path}; "
        f"found columns: {list(data.columns)}"
    )

# Handle missing values in numeric columns by filling with the column median.
# numeric_only=True is required: without it, recent pandas versions raise a
# TypeError as soon as the frame contains a non-numeric (object) column.
data = data.fillna(data.median(numeric_only=True))

# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    # The median fill above skips object columns, so fill their gaps with the
    # most frequent value before encoding.
    most_frequent = data[column].mode()
    if not most_frequent.empty:
        data[column] = data[column].fillna(most_frequent.iloc[0])

    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Step 2: Plotting the Correlation Matrix
plt.figure(figsize=(12, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Step 3: Supervised Learning Model for Prediction using XGBoost
# Define features and target
X = data.drop(target_column, axis=1)
y = data[target_column]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Step 4: Evaluation Using Confusion Matrix and Plotting ROC Curve
# Confusion Matrix
# Start a fresh figure: on non-interactive backends plt.show() does not close
# the previous figure, so the heatmap would otherwise be drawn on top of the
# correlation matrix.
plt.figure()
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Save the model using joblib
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

# The model was trained on label-encoded, standardized features, so it cannot
# be applied to raw data without the fitted transformers. Persist them in a
# separate file so the model file itself keeps its original contents.
joblib.dump(
    {
        'scaler': scaler,
        'label_encoders': label_encoders,
        'feature_names': list(X.columns),
    },
    preprocessing_filename,
)
print(f"Preprocessing artifacts saved to {preprocessing_filename}")