# Train and evaluate an XGBoost classifier on the heart dataset.
#
# Pipeline: load the CSV -> impute missing values -> label-encode categorical
# columns -> plot the correlation matrix -> split / standardize -> fit
# XGBoost -> plot the confusion matrix and ROC curve -> persist the model and
# the preprocessing objects needed to reuse it.

# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
import joblib

# File path
file_path = '/mnt/data/heart.csv'

# Name of the label column (the script assumes the CSV provides it).
target_column = 'target'

# Step 1: Data Cleaning and Encoding
# Load data
data = pd.read_csv(file_path)

# Fail fast, before any plotting or training, if the label column is absent.
if target_column not in data.columns:
    raise KeyError(
        f"Expected a '{target_column}' column in {file_path}; "
        f"found columns: {list(data.columns)}"
    )

# Handle missing values in numeric columns by filling with the column median.
# numeric_only=True is required: on pandas >= 2.0 DataFrame.median() raises a
# TypeError if the frame still contains non-numeric (object) columns.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split below — confirm this is acceptable for the evaluation.
data = data.fillna(data.median(numeric_only=True))

# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    # The median fill above does not touch object columns, and LabelEncoder
    # cannot fit a column that mixes strings with NaN, so impute the most
    # frequent value first (falling back to a placeholder if the column is
    # entirely empty).
    if data[column].isna().any():
        modes = data[column].mode(dropna=True)
        fill_value = modes.iloc[0] if not modes.empty else 'missing'
        data[column] = data[column].fillna(fill_value)

    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le  # kept so the encoding can be reapplied later

# Step 2: Plotting the Dependency Matrix
plt.figure(figsize=(12, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Step 3: Supervised Learning Model for Prediction using XGBoost
# Define features and target
X = data.drop(target_column, axis=1)
y = data[target_column]
feature_columns = list(X.columns)  # column order the scaler/model are fit on

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the data (fit on the training split only, then reuse on test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
# Column 1 is the probability of the second class; the ROC code below
# therefore assumes a binary target.
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Step 4: Evaluation Using Confusion Matrix and Plotting ROC Curve
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Open a fresh figure explicitly: on backends where plt.show() does not close
# the current figure, the heatmap would otherwise be drawn on top of the
# correlation matrix axes.
plt.figure()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'ROC curve (area = {roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Save the model using joblib
model_filename = '/mnt/data/xgboost_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

# The model was trained on label-encoded, standardized features, so it is only
# usable later together with the fitted encoders and scaler. Persist them in a
# separate file so the model artifact above keeps its original path/content.
preprocessing_filename = '/mnt/data/xgboost_preprocessing.joblib'
joblib.dump(
    {
        'scaler': scaler,
        'label_encoders': label_encoders,
        'feature_columns': feature_columns,
    },
    preprocessing_filename,
)
print(f"Preprocessing objects saved to {preprocessing_filename}")