add check for data imbalance
Browse files- .gitignore +2 -1
- Exploratory Data Analysis.ipynb +41 -7
.gitignore
CHANGED
@@ -3,4 +3,5 @@
|
|
3 |
catboost_info/
|
4 |
my_study.db
|
5 |
flagged/
|
6 |
-
.cache
|
|
|
|
3 |
catboost_info/
|
4 |
my_study.db
|
5 |
flagged/
|
6 |
+
.cache
|
7 |
+
*.pdf
|
Exploratory Data Analysis.ipynb
CHANGED
@@ -10,7 +10,7 @@
|
|
10 |
},
|
11 |
{
|
12 |
"cell_type": "code",
|
13 |
-
"execution_count":
|
14 |
"id": "6943c704",
|
15 |
"metadata": {},
|
16 |
"outputs": [],
|
@@ -23,7 +23,7 @@
|
|
23 |
},
|
24 |
{
|
25 |
"cell_type": "code",
|
26 |
-
"execution_count":
|
27 |
"id": "3591957a",
|
28 |
"metadata": {},
|
29 |
"outputs": [
|
@@ -37,7 +37,7 @@
|
|
37 |
" dtype='object')"
|
38 |
]
|
39 |
},
|
40 |
-
"execution_count":
|
41 |
"metadata": {},
|
42 |
"output_type": "execute_result"
|
43 |
}
|
@@ -752,7 +752,7 @@
|
|
752 |
},
|
753 |
{
|
754 |
"cell_type": "code",
|
755 |
-
"execution_count":
|
756 |
"id": "8bba46ff",
|
757 |
"metadata": {},
|
758 |
"outputs": [],
|
@@ -763,7 +763,7 @@
|
|
763 |
},
|
764 |
{
|
765 |
"cell_type": "code",
|
766 |
-
"execution_count":
|
767 |
"id": "d7bf5907",
|
768 |
"metadata": {},
|
769 |
"outputs": [
|
@@ -1050,7 +1050,7 @@
|
|
1050 |
"9 78.899 4 acoustic 1 "
|
1051 |
]
|
1052 |
},
|
1053 |
-
"execution_count":
|
1054 |
"metadata": {},
|
1055 |
"output_type": "execute_result"
|
1056 |
}
|
@@ -1059,6 +1059,40 @@
|
|
1059 |
"df.head(10)"
|
1060 |
]
|
1061 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1062 |
{
|
1063 |
"cell_type": "markdown",
|
1064 |
"id": "8058815a",
|
@@ -1292,7 +1326,7 @@
|
|
1292 |
"name": "python",
|
1293 |
"nbconvert_exporter": "python",
|
1294 |
"pygments_lexer": "ipython3",
|
1295 |
-
"version": "3.
|
1296 |
}
|
1297 |
},
|
1298 |
"nbformat": 4,
|
|
|
10 |
},
|
11 |
{
|
12 |
"cell_type": "code",
|
13 |
+
"execution_count": 2,
|
14 |
"id": "6943c704",
|
15 |
"metadata": {},
|
16 |
"outputs": [],
|
|
|
23 |
},
|
24 |
{
|
25 |
"cell_type": "code",
|
26 |
+
"execution_count": 3,
|
27 |
"id": "3591957a",
|
28 |
"metadata": {},
|
29 |
"outputs": [
|
|
|
37 |
" dtype='object')"
|
38 |
]
|
39 |
},
|
40 |
+
"execution_count": 3,
|
41 |
"metadata": {},
|
42 |
"output_type": "execute_result"
|
43 |
}
|
|
|
752 |
},
|
753 |
{
|
754 |
"cell_type": "code",
|
755 |
+
"execution_count": 4,
|
756 |
"id": "8bba46ff",
|
757 |
"metadata": {},
|
758 |
"outputs": [],
|
|
|
763 |
},
|
764 |
{
|
765 |
"cell_type": "code",
|
766 |
+
"execution_count": 6,
|
767 |
"id": "d7bf5907",
|
768 |
"metadata": {},
|
769 |
"outputs": [
|
|
|
1050 |
"9 78.899 4 acoustic 1 "
|
1051 |
]
|
1052 |
},
|
1053 |
+
"execution_count": 6,
|
1054 |
"metadata": {},
|
1055 |
"output_type": "execute_result"
|
1056 |
}
|
|
|
1059 |
"df.head(10)"
|
1060 |
]
|
1061 |
},
|
1062 |
+
{
|
1063 |
+
"attachments": {},
|
1064 |
+
"cell_type": "markdown",
|
1065 |
+
"id": "b90195eb",
|
1066 |
+
"metadata": {},
|
1067 |
+
"source": [
|
1068 |
+
"### Check for data imbalance using ratio of popular songs to non-popular songs"
|
1069 |
+
]
|
1070 |
+
},
|
1071 |
+
{
|
1072 |
+
"cell_type": "code",
|
1073 |
+
"execution_count": 8,
|
1074 |
+
"id": "60019910",
|
1075 |
+
"metadata": {},
|
1076 |
+
"outputs": [
|
1077 |
+
{
|
1078 |
+
"name": "stdout",
|
1079 |
+
"output_type": "stream",
|
1080 |
+
"text": [
|
1081 |
+
"The ratio of popular to non-popular songs is 0.32.\n"
|
1082 |
+
]
|
1083 |
+
}
|
1084 |
+
],
|
1085 |
+
"source": [
|
1086 |
+
"num_pop = sum(df['popularity_flag'] == 1)\n",
|
1087 |
+
"\n",
|
1088 |
+
"# Calculate the number of non-popular songs\n",
|
1089 |
+
"num_non_pop = sum(df['popularity_flag'] == 0)\n",
|
1090 |
+
"\n",
|
1091 |
+
"\n",
|
1092 |
+
"ratio = num_pop / num_non_pop\n",
|
1093 |
+
"print(f'The ratio of popular to non-popular songs is {ratio:.2f}.')\n"
|
1094 |
+
]
|
1095 |
+
},
|
1096 |
{
|
1097 |
"cell_type": "markdown",
|
1098 |
"id": "8058815a",
|
|
|
1326 |
"name": "python",
|
1327 |
"nbconvert_exporter": "python",
|
1328 |
"pygments_lexer": "ipython3",
|
1329 |
+
"version": "3.10.6"
|
1330 |
}
|
1331 |
},
|
1332 |
"nbformat": 4,
|