diivien committed on
Commit
0d57a2f
·
1 Parent(s): c11a255

add check for data imbalance

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. Exploratory Data Analysis.ipynb +41 -7
.gitignore CHANGED
@@ -3,4 +3,5 @@
3
  catboost_info/
4
  my_study.db
5
  flagged/
6
- .cache
 
 
3
  catboost_info/
4
  my_study.db
5
  flagged/
6
+ .cache
7
+ *.pdf
Exploratory Data Analysis.ipynb CHANGED
@@ -10,7 +10,7 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": 1,
14
  "id": "6943c704",
15
  "metadata": {},
16
  "outputs": [],
@@ -23,7 +23,7 @@
23
  },
24
  {
25
  "cell_type": "code",
26
- "execution_count": 2,
27
  "id": "3591957a",
28
  "metadata": {},
29
  "outputs": [
@@ -37,7 +37,7 @@
37
  " dtype='object')"
38
  ]
39
  },
40
- "execution_count": 2,
41
  "metadata": {},
42
  "output_type": "execute_result"
43
  }
@@ -752,7 +752,7 @@
752
  },
753
  {
754
  "cell_type": "code",
755
- "execution_count": 13,
756
  "id": "8bba46ff",
757
  "metadata": {},
758
  "outputs": [],
@@ -763,7 +763,7 @@
763
  },
764
  {
765
  "cell_type": "code",
766
- "execution_count": 14,
767
  "id": "d7bf5907",
768
  "metadata": {},
769
  "outputs": [
@@ -1050,7 +1050,7 @@
1050
  "9 78.899 4 acoustic 1 "
1051
  ]
1052
  },
1053
- "execution_count": 14,
1054
  "metadata": {},
1055
  "output_type": "execute_result"
1056
  }
@@ -1059,6 +1059,40 @@
1059
  "df.head(10)"
1060
  ]
1061
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1062
  {
1063
  "cell_type": "markdown",
1064
  "id": "8058815a",
@@ -1292,7 +1326,7 @@
1292
  "name": "python",
1293
  "nbconvert_exporter": "python",
1294
  "pygments_lexer": "ipython3",
1295
- "version": "3.9.12"
1296
  }
1297
  },
1298
  "nbformat": 4,
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 2,
14
  "id": "6943c704",
15
  "metadata": {},
16
  "outputs": [],
 
23
  },
24
  {
25
  "cell_type": "code",
26
+ "execution_count": 3,
27
  "id": "3591957a",
28
  "metadata": {},
29
  "outputs": [
 
37
  " dtype='object')"
38
  ]
39
  },
40
+ "execution_count": 3,
41
  "metadata": {},
42
  "output_type": "execute_result"
43
  }
 
752
  },
753
  {
754
  "cell_type": "code",
755
+ "execution_count": 4,
756
  "id": "8bba46ff",
757
  "metadata": {},
758
  "outputs": [],
 
763
  },
764
  {
765
  "cell_type": "code",
766
+ "execution_count": 6,
767
  "id": "d7bf5907",
768
  "metadata": {},
769
  "outputs": [
 
1050
  "9 78.899 4 acoustic 1 "
1051
  ]
1052
  },
1053
+ "execution_count": 6,
1054
  "metadata": {},
1055
  "output_type": "execute_result"
1056
  }
 
1059
  "df.head(10)"
1060
  ]
1061
  },
1062
+ {
1063
+ "attachments": {},
1064
+ "cell_type": "markdown",
1065
+ "id": "b90195eb",
1066
+ "metadata": {},
1067
+ "source": [
1068
+ "### Check for data imbalance using ratio of popular songs to non-popular songs"
1069
+ ]
1070
+ },
1071
+ {
1072
+ "cell_type": "code",
1073
+ "execution_count": 8,
1074
+ "id": "60019910",
1075
+ "metadata": {},
1076
+ "outputs": [
1077
+ {
1078
+ "name": "stdout",
1079
+ "output_type": "stream",
1080
+ "text": [
1081
+ "The ratio of popular to non-popular songs is 0.32.\n"
1082
+ ]
1083
+ }
1084
+ ],
1085
+ "source": [
1086
+ "num_pop = sum(df['popularity_flag'] == 1)\n",
1087
+ "\n",
1088
+ "# Calculate the number of non-popular songs\n",
1089
+ "num_non_pop = sum(df['popularity_flag'] == 0)\n",
1090
+ "\n",
1091
+ "\n",
1092
+ "ratio = num_pop / num_non_pop\n",
1093
+ "print(f'The ratio of popular to non-popular songs is {ratio:.2f}.')\n"
1094
+ ]
1095
+ },
1096
  {
1097
  "cell_type": "markdown",
1098
  "id": "8058815a",
 
1326
  "name": "python",
1327
  "nbconvert_exporter": "python",
1328
  "pygments_lexer": "ipython3",
1329
+ "version": "3.10.6"
1330
  }
1331
  },
1332
  "nbformat": 4,