vikasdeep committed on
Commit
ff6113c
·
verified ·
1 Parent(s): 66c2e0a

Upload model.py

Browse files
Files changed (1) hide show
  1. model.py +190 -0
model.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import Libraries
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import os
7
+ import joblib
8
+
# Load Dataset
# The CSV path is relative, so it is resolved against the current working directory.
df = pd.read_csv("hospital_readmissions.csv")

# Basic Info
df.info()
# A bare `df.describe()` expression is silently discarded when this file is
# run as a script, so print the summary explicitly.
print(df.describe())

# Missing Values
print(df.isnull().sum())

# Readmission Distribution
# NOTE(review): 'readmitted' is plotted here before it is label-encoded, so the
# tick labels are the raw CSV values -- confirm the "0=No, 1=Yes" legend in the
# axis label matches what is actually shown.
sns.countplot(x='readmitted', data=df)
plt.title('Readmitted Class Distribution')
plt.xlabel('Readmitted (0=No, 1=Yes)')
plt.ylabel('Count')
plt.show()

# Histograms for Numeric Features
numeric_features = ['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications']
df[numeric_features].hist(figsize=(10,8), bins=15)
plt.suptitle('Distribution of Numeric Features')
plt.show()
31
+
# Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder

label_encoders_file = "label_encoders.pkl"
label_encoders_2_file = "label_encoders_2.pkl"

# Columns handled by each encoder set.
categorical_cols = ['age', 'glucose_test', 'A1Ctest', 'change', 'diabetes_med', 'readmitted']
categorical_cols_2 = ['medical_specialty', 'diag_1', 'diag_2', 'diag_3']

# Load or Fit Label Encoders
if os.path.exists(label_encoders_file) and os.path.exists(label_encoders_2_file):
    print("Loading existing label encoders...")
    # NOTE(review): joblib.load unpickles arbitrary objects; only load encoder
    # files produced by a trusted run of this script.
    label_encoders = joblib.load(label_encoders_file)
    label_encoders_2 = joblib.load(label_encoders_2_file)

    # Apply the previously fitted encoders. Without this step the categorical
    # columns stay as raw strings and the models below cannot consume them.
    # LabelEncoder.transform raises ValueError if the data contains a label
    # that was not present when the encoder was fitted.
    for encoders in (label_encoders, label_encoders_2):
        for col, le in encoders.items():
            df[col] = le.transform(df[col].astype(str))
else:
    print("Fitting new label encoders...")
    label_encoders = {col: LabelEncoder() for col in categorical_cols}

    for col, le in label_encoders.items():
        df[col] = le.fit_transform(df[col].astype(str))

    label_encoders_2 = {col: LabelEncoder() for col in categorical_cols_2}

    for col2, le in label_encoders_2.items():
        df[col2] = le.fit_transform(df[col2].astype(str))

    # Persist the fitted encoders so later runs reuse the same
    # label -> integer mapping.
    joblib.dump(label_encoders, label_encoders_file)
    joblib.dump(label_encoders_2, label_encoders_2_file)
    print("Label encoders saved.")
60
+
# Feature Engineering (Interaction Terms)
from sklearn.preprocessing import PolynomialFeatures

# Numeric columns whose pairwise products are added as extra features.
interaction_base_cols = ['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications']

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction_terms = poly.fit_transform(df[interaction_base_cols])
interaction_df = pd.DataFrame(
    interaction_terms,
    columns=poly.get_feature_names_out(interaction_base_cols),
    # Reuse df's index so the column-wise concat below aligns row-for-row even
    # if df no longer carries a default RangeIndex.
    index=df.index,
)

# With interaction_only=True and include_bias=False the transformer also emits
# the degree-1 inputs unchanged; drop them so df does not end up with
# duplicate-named columns.
interaction_df = interaction_df.drop(columns=interaction_base_cols)
df = pd.concat([df, interaction_df], axis=1)
69
+
# Splitting the Data
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
y = df['readmitted']
X = df.drop(columns='readmitted')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Remove duplicate columns
# Both frames come from the same X, so a single mask (keeping the first
# occurrence of each column name) applies to both.
unique_columns_mask = ~X_train.columns.duplicated()
X_train = X_train.loc[:, unique_columns_mask]
X_test = X_test.loc[:, unique_columns_mask]


# Persist the training column names and order alongside the models.
feature_columns_file = "feature_columns.pkl"
joblib.dump(list(X_train.columns), feature_columns_file)
print(f"Feature columns saved as {feature_columns_file}")
86
+
# Define Model Filenames
rf_model_file = "rf_tuned_model.pkl"
xgb_model_file = "xgb_model.pkl"
lgbm_model_file = "lgbm_model.pkl"


def _load_or_train(model_file, model_name, build_model, X, y):
    """Return a fitted model, loading it from disk when a saved copy exists.

    Args:
        model_file: Path of the joblib file used to cache the fitted model.
        model_name: Human-readable name used in the progress messages.
        build_model: Zero-argument callable returning an unfitted estimator;
            only called when no cached file is found.
        X: Training features passed to ``fit``.
        y: Training target passed to ``fit``.

    Returns:
        The estimator loaded from ``model_file`` or freshly fitted on (X, y).
    """
    if os.path.exists(model_file):
        print(f"Loading existing {model_name} model from {model_file}...")
        # NOTE(review): a cached model is returned as-is; it is not checked
        # against the current feature columns or hyperparameters. Delete the
        # file to force retraining. joblib.load unpickles arbitrary objects,
        # so only load files produced by a trusted run.
        return joblib.load(model_file)

    print(f"Training {model_name} model...")
    model = build_model()
    model.fit(X, y)
    joblib.dump(model, model_file)
    print(f"{model_name} model saved as {model_file}")
    return model


# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rf_model = _load_or_train(
    rf_model_file,
    "Random Forest",
    lambda: RandomForestClassifier(
        bootstrap=True,
        max_depth=10,
        min_samples_leaf=4,
        min_samples_split=5,
        n_estimators=200,
        random_state=42,
    ),
    X_train,
    y_train,
)

# XGBoost Classifier
from xgboost import XGBClassifier

xgb_model = _load_or_train(
    xgb_model_file,
    "XGBoost",
    lambda: XGBClassifier(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
    X_train,
    y_train,
)

# LightGBM Classifier
from lightgbm import LGBMClassifier

lgbm_model = _load_or_train(
    lgbm_model_file,
    "LightGBM",
    lambda: LGBMClassifier(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
    X_train,
    y_train,
)
151
+
# Evaluation
from sklearn.metrics import classification_report, roc_auc_score

# Predictions
# For each model keep the hard class predictions and the probability of the
# second class (column 1 of predict_proba), which is used for ROC-AUC.
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

y_pred_lgbm = lgbm_model.predict(X_test)
y_proba_lgbm = lgbm_model.predict_proba(X_test)[:, 1]

# (display name, class predictions, positive-class probabilities) per model,
# in the order they are reported and plotted.
model_outputs = [
    ("Random Forest", y_pred_rf, y_proba_rf),
    ("XGBoost", y_pred_xgb, y_proba_xgb),
    ("LightGBM", y_pred_lgbm, y_proba_lgbm),
]

# Print the per-model reports and collect each ROC-AUC for the summary table.
auc_scores = []
for model_label, predictions, probabilities in model_outputs:
    auc = roc_auc_score(y_test, probabilities)
    auc_scores.append(auc)
    print(f"{model_label} Classification Report:\n", classification_report(y_test, predictions))
    print(f"{model_label} ROC-AUC Score:", auc)

# Compare Models
results = {
    "Model": [model_label for model_label, _, _ in model_outputs],
    "ROC-AUC Score": auc_scores,
}

results_df = pd.DataFrame(results)
print("\nModel Performance Summary:")
print(results_df)

# Plot Model Comparison
sns.barplot(data=results_df, x="Model", y="ROC-AUC Score", palette="viridis")
plt.title("Model ROC-AUC Comparison")
plt.ylabel("ROC-AUC Score")
plt.show()