# NOTE: non-code page header captured with this script: "Spaces: Sleeping Sleeping"
# Path of the CSV file the script works on.
file_path = '/mnt/data/heart.csv'
# Import necessary libraries
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Step 1: Data Cleaning and Encoding
# Load the dataset from the CSV at `file_path`.
data = pd.read_csv(file_path)

# Fill missing numeric values with each column's median.
# `numeric_only=True` matters: on pandas >= 2.0, DataFrame.median() raises
# TypeError as soon as the frame contains a text column, and the encoding
# loop below exists precisely because such columns may be present.
# NOTE: gaps in text columns are not imputed here; they reach the encoder as-is.
data = data.fillna(data.median(numeric_only=True))

# Replace every object-dtype column with integer codes. Each fitted encoder
# is kept, keyed by column name, so the mapping can be reused or inverted.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le
# Step 2: Plotting the Correlation ("Dependency") Matrix
# Pairwise correlations between the columns of `data`, shown as an annotated
# heatmap on a 12x8-inch figure.
correlation_matrix = data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title(label='Correlation Matrix')
plt.show()
# Step 3: Supervised Learning Model for Prediction using XGBoost
# Separate the label column from the predictor columns.
# NOTE(review): assumes the label column is named 'target' — confirm against the CSV.
y = data['target']
X = data.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both the training and the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit an XGBoost classifier with the library's default settings.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Class predictions for the held-out rows, plus the predicted probability
# taken from column 1 of predict_proba.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]
# Step 4: Evaluation Using Confusion Matrix and Plotting ROC Curve
# Confusion Matrix: counts of actual vs. predicted classes on the test rows.
conf_matrix = confusion_matrix(y_test, y_pred)
# Start a fresh figure, exactly as the ROC plot below does. Without it the
# heatmap is drawn on whatever axes are still current; on backends where
# plt.show() returns without closing the figure, that is the previous plot.
plt.figure()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve: computed from the predicted probabilities rather than the hard
# class predictions, with the area under the curve shown in the legend.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
# Upper limit slightly above 1 so a curve touching TPR = 1 is not clipped by the frame.
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
# Save the model using joblib
model_filename = '/mnt/data/xgboost_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

# The classifier was trained on label-encoded, standardized features, so it
# cannot score raw rows on its own. Persist the fitted scaler and encoders in
# a companion file; the model file above keeps its original content and path.
preprocessing_filename = '/mnt/data/xgboost_preprocessing.joblib'
joblib.dump(
    {'scaler': scaler, 'label_encoders': label_encoders},
    preprocessing_filename,
)
print(f"Preprocessing objects saved to {preprocessing_filename}")