diivien committed on
Commit
b208fb5
·
1 Parent(s): a70f389

edit feature selection

Browse files
Files changed (1) hide show
  1. Model Building.ipynb +297 -29
Model Building.ipynb CHANGED
@@ -1068,19 +1068,287 @@
1068
  },
1069
  {
1070
  "cell_type": "code",
1071
- "execution_count": 9,
1072
  "id": "d66743ec",
1073
  "metadata": {},
1074
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1075
  "source": [
1076
  "from feature_engine.selection import DropCorrelatedFeatures\n",
1077
  "\n",
1078
- "fes = DropCorrelatedFeatures(threshold=0.3)"
 
 
1079
  ]
1080
  },
1081
  {
1082
  "cell_type": "code",
1083
- "execution_count": 10,
1084
  "id": "4797c378",
1085
  "metadata": {},
1086
  "outputs": [],
@@ -1096,7 +1364,7 @@
1096
  },
1097
  {
1098
  "cell_type": "code",
1099
- "execution_count": 11,
1100
  "id": "6c8a6fd9",
1101
  "metadata": {},
1102
  "outputs": [],
@@ -1137,7 +1405,7 @@
1137
  },
1138
  {
1139
  "cell_type": "code",
1140
- "execution_count": 12,
1141
  "id": "077ca8bd",
1142
  "metadata": {},
1143
  "outputs": [
@@ -1151,11 +1419,11 @@
1151
  "Average precision: 0.606 (+/- 0.005)\n",
1152
  "F1 score: 0.596 (+/- 0.005)\n",
1153
  "Balanced accuracy: 0.754 (+/- 0.004)\n",
1154
- "Average precision: 0.628 (+/- 0.005)\n",
1155
- "F1 score: 0.528 (+/- 0.004)\n",
1156
- "Balanced accuracy: 0.684 (+/- 0.002)\n",
1157
- "Average precision: 0.609 (+/- 0.005)\n",
1158
- "F1 score: 0.596 (+/- 0.005)\n",
1159
  "Balanced accuracy: 0.754 (+/- 0.004)\n",
1160
  "Average precision: 0.627 (+/- 0.004)\n",
1161
  "F1 score: 0.608 (+/- 0.006)\n",
@@ -1238,7 +1506,7 @@
1238
  },
1239
  {
1240
  "cell_type": "code",
1241
- "execution_count": 13,
1242
  "id": "e4c87875",
1243
  "metadata": {},
1244
  "outputs": [
@@ -1301,21 +1569,21 @@
1301
  " </tr>\n",
1302
  " <tr>\n",
1303
  " <th>both</th>\n",
1304
- " <td>0.608667</td>\n",
1305
- " <td>0.753689</td>\n",
1306
- " <td>0.596222</td>\n",
1307
- " <td>0.004643</td>\n",
1308
- " <td>0.003505</td>\n",
1309
- " <td>0.004734</td>\n",
1310
  " </tr>\n",
1311
  " <tr>\n",
1312
  " <th>feature_selection</th>\n",
1313
- " <td>0.627933</td>\n",
1314
- " <td>0.684463</td>\n",
1315
- " <td>0.528416</td>\n",
1316
- " <td>0.005101</td>\n",
1317
- " <td>0.002420</td>\n",
1318
- " <td>0.004158</td>\n",
1319
  " </tr>\n",
1320
  " <tr>\n",
1321
  " <th>oversampling</th>\n",
@@ -1344,8 +1612,8 @@
1344
  "score average_precision balanced_accuracy f1_score \n",
1345
  "model \n",
1346
  "baseline 0.630397 0.687034 0.532710 \n",
1347
- "both 0.608667 0.753689 0.596222 \n",
1348
- "feature_selection 0.627933 0.684463 0.528416 \n",
1349
  "oversampling 0.605969 0.754218 0.596470 \n",
1350
  "oversampling2 0.627238 0.753277 0.608240 \n",
1351
  "\n",
@@ -1353,13 +1621,13 @@
1353
  "score average_precision balanced_accuracy f1_score \n",
1354
  "model \n",
1355
  "baseline 0.005217 0.004777 0.008199 \n",
1356
- "both 0.004643 0.003505 0.004734 \n",
1357
- "feature_selection 0.005101 0.002420 0.004158 \n",
1358
  "oversampling 0.005173 0.003581 0.004911 \n",
1359
  "oversampling2 0.004290 0.004862 0.006231 "
1360
  ]
1361
  },
1362
- "execution_count": 13,
1363
  "metadata": {},
1364
  "output_type": "execute_result"
1365
  }
 
1068
  },
1069
  {
1070
  "cell_type": "code",
1071
+ "execution_count": 22,
1072
  "id": "d66743ec",
1073
  "metadata": {},
1074
+ "outputs": [
1075
+ {
1076
+ "data": {
1077
+ "text/html": [
1078
+ "<div>\n",
1079
+ "<style scoped>\n",
1080
+ " .dataframe tbody tr th:only-of-type {\n",
1081
+ " vertical-align: middle;\n",
1082
+ " }\n",
1083
+ "\n",
1084
+ " .dataframe tbody tr th {\n",
1085
+ " vertical-align: top;\n",
1086
+ " }\n",
1087
+ "\n",
1088
+ " .dataframe thead th {\n",
1089
+ " text-align: right;\n",
1090
+ " }\n",
1091
+ "</style>\n",
1092
+ "<table border=\"1\" class=\"dataframe\">\n",
1093
+ " <thead>\n",
1094
+ " <tr style=\"text-align: right;\">\n",
1095
+ " <th></th>\n",
1096
+ " <th>duration_ms</th>\n",
1097
+ " <th>explicit</th>\n",
1098
+ " <th>danceability</th>\n",
1099
+ " <th>energy</th>\n",
1100
+ " <th>key</th>\n",
1101
+ " <th>mode</th>\n",
1102
+ " <th>speechiness</th>\n",
1103
+ " <th>instrumentalness</th>\n",
1104
+ " <th>liveness</th>\n",
1105
+ " <th>valence</th>\n",
1106
+ " <th>tempo</th>\n",
1107
+ " <th>time_signature</th>\n",
1108
+ " <th>track_genre</th>\n",
1109
+ " </tr>\n",
1110
+ " </thead>\n",
1111
+ " <tbody>\n",
1112
+ " <tr>\n",
1113
+ " <th>22832</th>\n",
1114
+ " <td>197424</td>\n",
1115
+ " <td>0</td>\n",
1116
+ " <td>0.247</td>\n",
1117
+ " <td>0.9370</td>\n",
1118
+ " <td>9</td>\n",
1119
+ " <td>0</td>\n",
1120
+ " <td>0.0914</td>\n",
1121
+ " <td>0.789000</td>\n",
1122
+ " <td>0.7210</td>\n",
1123
+ " <td>0.1350</td>\n",
1124
+ " <td>175.189</td>\n",
1125
+ " <td>4</td>\n",
1126
+ " <td>drum-and-bass</td>\n",
1127
+ " </tr>\n",
1128
+ " <tr>\n",
1129
+ " <th>79002</th>\n",
1130
+ " <td>84948</td>\n",
1131
+ " <td>0</td>\n",
1132
+ " <td>0.282</td>\n",
1133
+ " <td>0.0663</td>\n",
1134
+ " <td>0</td>\n",
1135
+ " <td>0</td>\n",
1136
+ " <td>0.0448</td>\n",
1137
+ " <td>0.930000</td>\n",
1138
+ " <td>0.0985</td>\n",
1139
+ " <td>0.1550</td>\n",
1140
+ " <td>209.557</td>\n",
1141
+ " <td>4</td>\n",
1142
+ " <td>turkish</td>\n",
1143
+ " </tr>\n",
1144
+ " <tr>\n",
1145
+ " <th>27606</th>\n",
1146
+ " <td>284307</td>\n",
1147
+ " <td>0</td>\n",
1148
+ " <td>0.492</td>\n",
1149
+ " <td>0.4740</td>\n",
1150
+ " <td>0</td>\n",
1151
+ " <td>1</td>\n",
1152
+ " <td>0.0351</td>\n",
1153
+ " <td>0.000022</td>\n",
1154
+ " <td>0.0855</td>\n",
1155
+ " <td>0.2700</td>\n",
1156
+ " <td>130.143</td>\n",
1157
+ " <td>4</td>\n",
1158
+ " <td>folk</td>\n",
1159
+ " </tr>\n",
1160
+ " <tr>\n",
1161
+ " <th>80020</th>\n",
1162
+ " <td>292693</td>\n",
1163
+ " <td>0</td>\n",
1164
+ " <td>0.327</td>\n",
1165
+ " <td>0.2690</td>\n",
1166
+ " <td>4</td>\n",
1167
+ " <td>1</td>\n",
1168
+ " <td>0.0339</td>\n",
1169
+ " <td>0.000234</td>\n",
1170
+ " <td>0.1080</td>\n",
1171
+ " <td>0.0857</td>\n",
1172
+ " <td>141.514</td>\n",
1173
+ " <td>4</td>\n",
1174
+ " <td>world-music</td>\n",
1175
+ " </tr>\n",
1176
+ " <tr>\n",
1177
+ " <th>49203</th>\n",
1178
+ " <td>320960</td>\n",
1179
+ " <td>0</td>\n",
1180
+ " <td>0.822</td>\n",
1181
+ " <td>0.4460</td>\n",
1182
+ " <td>7</td>\n",
1183
+ " <td>0</td>\n",
1184
+ " <td>0.0321</td>\n",
1185
+ " <td>0.001020</td>\n",
1186
+ " <td>0.1470</td>\n",
1187
+ " <td>0.6540</td>\n",
1188
+ " <td>119.994</td>\n",
1189
+ " <td>4</td>\n",
1190
+ " <td>k-pop</td>\n",
1191
+ " </tr>\n",
1192
+ " <tr>\n",
1193
+ " <th>...</th>\n",
1194
+ " <td>...</td>\n",
1195
+ " <td>...</td>\n",
1196
+ " <td>...</td>\n",
1197
+ " <td>...</td>\n",
1198
+ " <td>...</td>\n",
1199
+ " <td>...</td>\n",
1200
+ " <td>...</td>\n",
1201
+ " <td>...</td>\n",
1202
+ " <td>...</td>\n",
1203
+ " <td>...</td>\n",
1204
+ " <td>...</td>\n",
1205
+ " <td>...</td>\n",
1206
+ " <td>...</td>\n",
1207
+ " </tr>\n",
1208
+ " <tr>\n",
1209
+ " <th>11265</th>\n",
1210
+ " <td>454988</td>\n",
1211
+ " <td>0</td>\n",
1212
+ " <td>0.744</td>\n",
1213
+ " <td>0.8100</td>\n",
1214
+ " <td>1</td>\n",
1215
+ " <td>1</td>\n",
1216
+ " <td>0.0473</td>\n",
1217
+ " <td>0.907000</td>\n",
1218
+ " <td>0.0606</td>\n",
1219
+ " <td>0.7150</td>\n",
1220
+ " <td>123.018</td>\n",
1221
+ " <td>4</td>\n",
1222
+ " <td>chicago-house</td>\n",
1223
+ " </tr>\n",
1224
+ " <tr>\n",
1225
+ " <th>51104</th>\n",
1226
+ " <td>210367</td>\n",
1227
+ " <td>0</td>\n",
1228
+ " <td>0.817</td>\n",
1229
+ " <td>0.5390</td>\n",
1230
+ " <td>6</td>\n",
1231
+ " <td>0</td>\n",
1232
+ " <td>0.0621</td>\n",
1233
+ " <td>0.000496</td>\n",
1234
+ " <td>0.0990</td>\n",
1235
+ " <td>0.1580</td>\n",
1236
+ " <td>97.062</td>\n",
1237
+ " <td>4</td>\n",
1238
+ " <td>latin</td>\n",
1239
+ " </tr>\n",
1240
+ " <tr>\n",
1241
+ " <th>65722</th>\n",
1242
+ " <td>152093</td>\n",
1243
+ " <td>0</td>\n",
1244
+ " <td>0.545</td>\n",
1245
+ " <td>0.3350</td>\n",
1246
+ " <td>10</td>\n",
1247
+ " <td>1</td>\n",
1248
+ " <td>0.0300</td>\n",
1249
+ " <td>0.000714</td>\n",
1250
+ " <td>0.1770</td>\n",
1251
+ " <td>0.3060</td>\n",
1252
+ " <td>119.617</td>\n",
1253
+ " <td>4</td>\n",
1254
+ " <td>rockabilly</td>\n",
1255
+ " </tr>\n",
1256
+ " <tr>\n",
1257
+ " <th>5218</th>\n",
1258
+ " <td>128600</td>\n",
1259
+ " <td>0</td>\n",
1260
+ " <td>0.504</td>\n",
1261
+ " <td>0.5770</td>\n",
1262
+ " <td>7</td>\n",
1263
+ " <td>0</td>\n",
1264
+ " <td>0.0315</td>\n",
1265
+ " <td>0.215000</td>\n",
1266
+ " <td>0.1070</td>\n",
1267
+ " <td>0.7210</td>\n",
1268
+ " <td>145.168</td>\n",
1269
+ " <td>3</td>\n",
1270
+ " <td>black-metal</td>\n",
1271
+ " </tr>\n",
1272
+ " <tr>\n",
1273
+ " <th>41260</th>\n",
1274
+ " <td>184370</td>\n",
1275
+ " <td>0</td>\n",
1276
+ " <td>0.461</td>\n",
1277
+ " <td>0.5000</td>\n",
1278
+ " <td>1</td>\n",
1279
+ " <td>1</td>\n",
1280
+ " <td>0.0614</td>\n",
1281
+ " <td>0.030000</td>\n",
1282
+ " <td>0.1160</td>\n",
1283
+ " <td>0.5720</td>\n",
1284
+ " <td>76.176</td>\n",
1285
+ " <td>4</td>\n",
1286
+ " <td>hip-hop</td>\n",
1287
+ " </tr>\n",
1288
+ " </tbody>\n",
1289
+ "</table>\n",
1290
+ "<p>64334 rows × 13 columns</p>\n",
1291
+ "</div>"
1292
+ ],
1293
+ "text/plain": [
1294
+ " duration_ms explicit danceability energy key mode speechiness \\\n",
1295
+ "22832 197424 0 0.247 0.9370 9 0 0.0914 \n",
1296
+ "79002 84948 0 0.282 0.0663 0 0 0.0448 \n",
1297
+ "27606 284307 0 0.492 0.4740 0 1 0.0351 \n",
1298
+ "80020 292693 0 0.327 0.2690 4 1 0.0339 \n",
1299
+ "49203 320960 0 0.822 0.4460 7 0 0.0321 \n",
1300
+ "... ... ... ... ... .. ... ... \n",
1301
+ "11265 454988 0 0.744 0.8100 1 1 0.0473 \n",
1302
+ "51104 210367 0 0.817 0.5390 6 0 0.0621 \n",
1303
+ "65722 152093 0 0.545 0.3350 10 1 0.0300 \n",
1304
+ "5218 128600 0 0.504 0.5770 7 0 0.0315 \n",
1305
+ "41260 184370 0 0.461 0.5000 1 1 0.0614 \n",
1306
+ "\n",
1307
+ " instrumentalness liveness valence tempo time_signature \\\n",
1308
+ "22832 0.789000 0.7210 0.1350 175.189 4 \n",
1309
+ "79002 0.930000 0.0985 0.1550 209.557 4 \n",
1310
+ "27606 0.000022 0.0855 0.2700 130.143 4 \n",
1311
+ "80020 0.000234 0.1080 0.0857 141.514 4 \n",
1312
+ "49203 0.001020 0.1470 0.6540 119.994 4 \n",
1313
+ "... ... ... ... ... ... \n",
1314
+ "11265 0.907000 0.0606 0.7150 123.018 4 \n",
1315
+ "51104 0.000496 0.0990 0.1580 97.062 4 \n",
1316
+ "65722 0.000714 0.1770 0.3060 119.617 4 \n",
1317
+ "5218 0.215000 0.1070 0.7210 145.168 3 \n",
1318
+ "41260 0.030000 0.1160 0.5720 76.176 4 \n",
1319
+ "\n",
1320
+ " track_genre \n",
1321
+ "22832 drum-and-bass \n",
1322
+ "79002 turkish \n",
1323
+ "27606 folk \n",
1324
+ "80020 world-music \n",
1325
+ "49203 k-pop \n",
1326
+ "... ... \n",
1327
+ "11265 chicago-house \n",
1328
+ "51104 latin \n",
1329
+ "65722 rockabilly \n",
1330
+ "5218 black-metal \n",
1331
+ "41260 hip-hop \n",
1332
+ "\n",
1333
+ "[64334 rows x 13 columns]"
1334
+ ]
1335
+ },
1336
+ "execution_count": 22,
1337
+ "metadata": {},
1338
+ "output_type": "execute_result"
1339
+ }
1340
+ ],
1341
  "source": [
1342
  "from feature_engine.selection import DropCorrelatedFeatures\n",
1343
  "\n",
1344
+ "fes = DropCorrelatedFeatures(threshold=0.6)\n",
1345
+ "te = fes.fit_transform(X_train,y_train)\n",
1346
+ "te"
1347
  ]
1348
  },
1349
  {
1350
  "cell_type": "code",
1351
+ "execution_count": 17,
1352
  "id": "4797c378",
1353
  "metadata": {},
1354
  "outputs": [],
 
1364
  },
1365
  {
1366
  "cell_type": "code",
1367
+ "execution_count": 18,
1368
  "id": "6c8a6fd9",
1369
  "metadata": {},
1370
  "outputs": [],
 
1405
  },
1406
  {
1407
  "cell_type": "code",
1408
+ "execution_count": 19,
1409
  "id": "077ca8bd",
1410
  "metadata": {},
1411
  "outputs": [
 
1419
  "Average precision: 0.606 (+/- 0.005)\n",
1420
  "F1 score: 0.596 (+/- 0.005)\n",
1421
  "Balanced accuracy: 0.754 (+/- 0.004)\n",
1422
+ "Average precision: 0.630 (+/- 0.005)\n",
1423
+ "F1 score: 0.532 (+/- 0.007)\n",
1424
+ "Balanced accuracy: 0.687 (+/- 0.004)\n",
1425
+ "Average precision: 0.606 (+/- 0.005)\n",
1426
+ "F1 score: 0.596 (+/- 0.006)\n",
1427
  "Balanced accuracy: 0.754 (+/- 0.004)\n",
1428
  "Average precision: 0.627 (+/- 0.004)\n",
1429
  "F1 score: 0.608 (+/- 0.006)\n",
 
1506
  },
1507
  {
1508
  "cell_type": "code",
1509
+ "execution_count": 20,
1510
  "id": "e4c87875",
1511
  "metadata": {},
1512
  "outputs": [
 
1569
  " </tr>\n",
1570
  " <tr>\n",
1571
  " <th>both</th>\n",
1572
+ " <td>0.605690</td>\n",
1573
+ " <td>0.754049</td>\n",
1574
+ " <td>0.596330</td>\n",
1575
+ " <td>0.005015</td>\n",
1576
+ " <td>0.004315</td>\n",
1577
+ " <td>0.005606</td>\n",
1578
  " </tr>\n",
1579
  " <tr>\n",
1580
  " <th>feature_selection</th>\n",
1581
+ " <td>0.630199</td>\n",
1582
+ " <td>0.686720</td>\n",
1583
+ " <td>0.532165</td>\n",
1584
+ " <td>0.005109</td>\n",
1585
+ " <td>0.004134</td>\n",
1586
+ " <td>0.007051</td>\n",
1587
  " </tr>\n",
1588
  " <tr>\n",
1589
  " <th>oversampling</th>\n",
 
1612
  "score average_precision balanced_accuracy f1_score \n",
1613
  "model \n",
1614
  "baseline 0.630397 0.687034 0.532710 \n",
1615
+ "both 0.605690 0.754049 0.596330 \n",
1616
+ "feature_selection 0.630199 0.686720 0.532165 \n",
1617
  "oversampling 0.605969 0.754218 0.596470 \n",
1618
  "oversampling2 0.627238 0.753277 0.608240 \n",
1619
  "\n",
 
1621
  "score average_precision balanced_accuracy f1_score \n",
1622
  "model \n",
1623
  "baseline 0.005217 0.004777 0.008199 \n",
1624
+ "both 0.005015 0.004315 0.005606 \n",
1625
+ "feature_selection 0.005109 0.004134 0.007051 \n",
1626
  "oversampling 0.005173 0.003581 0.004911 \n",
1627
  "oversampling2 0.004290 0.004862 0.006231 "
1628
  ]
1629
  },
1630
+ "execution_count": 20,
1631
  "metadata": {},
1632
  "output_type": "execute_result"
1633
  }