{ "cells": [ { "cell_type": "markdown", "id": "1de28f74", "metadata": {}, "source": [ "# Data Cleaning" ] }, { "cell_type": "code", "execution_count": 1, "id": "bc4c415f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "id": "6455bf8f", "metadata": {}, "outputs": [], "source": [ "# Load the raw track dataset from the working directory.\n", "df = pd.read_csv(filepath_or_buffer='dataset.csv')\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "1c2440e4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',\n", " 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',\n", " 'key', 'loudness', 'mode', 'speechiness', 'acousticness',\n", " 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',\n", " 'track_genre'],\n", " dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0track_idartistsalbum_nametrack_namepopularityduration_msexplicitdanceabilityenergy...loudnessmodespeechinessacousticnessinstrumentalnesslivenessvalencetempotime_signaturetrack_genre
005SuOikwiRyPMVoIQDJUgSVGen HoshinoComedyComedy73230666False0.6760.4610...-6.74600.14300.03220.0000010.35800.71587.9174acoustic
114qPNDBW1i3p13qLCt0Ki3ABen WoodwardGhost (Acoustic)Ghost - Acoustic55149610False0.4200.1660...-17.23510.07630.92400.0000060.10100.26777.4894acoustic
221iJBSr7s7jYXzM8EGcbK5bIngrid Michaelson;ZAYNTo Begin AgainTo Begin Again57210826False0.4380.3590...-9.73410.05570.21000.0000000.11700.12076.3324acoustic
336lfxq3CG4xtTiEg7opyCyxKina GrannisCrazy Rich Asians (Original Motion Picture Sou...Can't Help Falling In Love71201933False0.2660.0596...-18.51510.03630.90500.0000710.13200.143181.7403acoustic
445vjLSffimiIP26QG5WcN2KChord OverstreetHold OnHold On82198853False0.6180.4430...-9.68110.05260.46900.0000000.08290.167119.9494acoustic
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 track_id artists \\\n", "0 0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino \n", "1 1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward \n", "2 2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN \n", "3 3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis \n", "4 4 5vjLSffimiIP26QG5WcN2K Chord Overstreet \n", "\n", " album_name \\\n", "0 Comedy \n", "1 Ghost (Acoustic) \n", "2 To Begin Again \n", "3 Crazy Rich Asians (Original Motion Picture Sou... \n", "4 Hold On \n", "\n", " track_name popularity duration_ms explicit \\\n", "0 Comedy 73 230666 False \n", "1 Ghost - Acoustic 55 149610 False \n", "2 To Begin Again 57 210826 False \n", "3 Can't Help Falling In Love 71 201933 False \n", "4 Hold On 82 198853 False \n", "\n", " danceability energy ... loudness mode speechiness acousticness \\\n", "0 0.676 0.4610 ... -6.746 0 0.1430 0.0322 \n", "1 0.420 0.1660 ... -17.235 1 0.0763 0.9240 \n", "2 0.438 0.3590 ... -9.734 1 0.0557 0.2100 \n", "3 0.266 0.0596 ... -18.515 1 0.0363 0.9050 \n", "4 0.618 0.4430 ... -9.681 1 0.0526 0.4690 \n", "\n", " instrumentalness liveness valence tempo time_signature track_genre \n", "0 0.000001 0.3580 0.715 87.917 4 acoustic \n", "1 0.000006 0.1010 0.267 77.489 4 acoustic \n", "2 0.000000 0.1170 0.120 76.332 4 acoustic \n", "3 0.000071 0.1320 0.143 181.740 3 acoustic \n", "4 0.000000 0.0829 0.167 119.949 4 acoustic \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# List every column name, then preview the first five rows.\n", "print(df.columns)\n", "df.head(n=5)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f1a88b42", "metadata": {}, "source": [ "### Remove identifier and album columns" ] }, { "cell_type": "code", "execution_count": 4, "id": "ece13796", "metadata": {}, "outputs": [], "source": [ "# Drop 'Unnamed: 0' (looks like a saved row index), the track identifier, and the album name.\n", "df = df.drop(columns=['Unnamed: 0', 'track_id', 'album_name'])" ] }, { "cell_type": "code", "execution_count": 5, "id": "060fbd33", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
artiststrack_namepopularityduration_msexplicitdanceabilityenergykeyloudnessmodespeechinessacousticnessinstrumentalnesslivenessvalencetempotime_signaturetrack_genre
0Gen HoshinoComedy73230666False0.6760.46101-6.74600.14300.03220.0000010.35800.715087.9174acoustic
1Ben WoodwardGhost - Acoustic55149610False0.4200.16601-17.23510.07630.92400.0000060.10100.267077.4894acoustic
2Ingrid Michaelson;ZAYNTo Begin Again57210826False0.4380.35900-9.73410.05570.21000.0000000.11700.120076.3324acoustic
3Kina GrannisCan't Help Falling In Love71201933False0.2660.05960-18.51510.03630.90500.0000710.13200.1430181.7403acoustic
4Chord OverstreetHold On82198853False0.6180.44302-9.68110.05260.46900.0000000.08290.1670119.9494acoustic
5Tyrone WellsDays I Will Remember58214240False0.6880.48106-8.80710.10500.28900.0000000.18900.666098.0174acoustic
6A Great Big World;Christina AguileraSay Something74229400False0.4070.14702-8.82210.03550.85700.0000030.09130.0765141.2843acoustic
7Jason MrazI'm Yours80242946False0.7030.444011-9.33110.04170.55900.0000000.09730.7120150.9604acoustic
8Jason Mraz;Colbie CaillatLucky74189613False0.6250.41400-8.70010.03690.29400.0000000.15100.6690130.0884acoustic
9Ross CoppermanHunger56205594False0.4420.63201-6.77010.02950.42600.0041900.07350.196078.8994acoustic
\n", "
" ], "text/plain": [ " artists track_name \\\n", "0 Gen Hoshino Comedy \n", "1 Ben Woodward Ghost - Acoustic \n", "2 Ingrid Michaelson;ZAYN To Begin Again \n", "3 Kina Grannis Can't Help Falling In Love \n", "4 Chord Overstreet Hold On \n", "5 Tyrone Wells Days I Will Remember \n", "6 A Great Big World;Christina Aguilera Say Something \n", "7 Jason Mraz I'm Yours \n", "8 Jason Mraz;Colbie Caillat Lucky \n", "9 Ross Copperman Hunger \n", "\n", " popularity duration_ms explicit danceability energy key loudness \\\n", "0 73 230666 False 0.676 0.4610 1 -6.746 \n", "1 55 149610 False 0.420 0.1660 1 -17.235 \n", "2 57 210826 False 0.438 0.3590 0 -9.734 \n", "3 71 201933 False 0.266 0.0596 0 -18.515 \n", "4 82 198853 False 0.618 0.4430 2 -9.681 \n", "5 58 214240 False 0.688 0.4810 6 -8.807 \n", "6 74 229400 False 0.407 0.1470 2 -8.822 \n", "7 80 242946 False 0.703 0.4440 11 -9.331 \n", "8 74 189613 False 0.625 0.4140 0 -8.700 \n", "9 56 205594 False 0.442 0.6320 1 -6.770 \n", "\n", " mode speechiness acousticness instrumentalness liveness valence \\\n", "0 0 0.1430 0.0322 0.000001 0.3580 0.7150 \n", "1 1 0.0763 0.9240 0.000006 0.1010 0.2670 \n", "2 1 0.0557 0.2100 0.000000 0.1170 0.1200 \n", "3 1 0.0363 0.9050 0.000071 0.1320 0.1430 \n", "4 1 0.0526 0.4690 0.000000 0.0829 0.1670 \n", "5 1 0.1050 0.2890 0.000000 0.1890 0.6660 \n", "6 1 0.0355 0.8570 0.000003 0.0913 0.0765 \n", "7 1 0.0417 0.5590 0.000000 0.0973 0.7120 \n", "8 1 0.0369 0.2940 0.000000 0.1510 0.6690 \n", "9 1 0.0295 0.4260 0.004190 0.0735 0.1960 \n", "\n", " tempo time_signature track_genre \n", "0 87.917 4 acoustic \n", "1 77.489 4 acoustic \n", "2 76.332 4 acoustic \n", "3 181.740 3 acoustic \n", "4 119.949 4 acoustic \n", "5 98.017 4 acoustic \n", "6 141.284 3 acoustic \n", "7 150.960 4 acoustic \n", "8 130.088 4 acoustic \n", "9 78.899 4 acoustic " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first ten rows of the remaining columns.\n", "df.head(n=10)" ] }, { "cell_type": "code", "execution_count": 6, 
"id": "d801195c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "artists object\n", "track_name object\n", "popularity int64\n", "duration_ms int64\n", "explicit bool\n", "danceability float64\n", "energy float64\n", "key int64\n", "loudness float64\n", "mode int64\n", "speechiness float64\n", "acousticness float64\n", "instrumentalness float64\n", "liveness float64\n", "valence float64\n", "tempo float64\n", "time_signature int64\n", "track_genre object\n", "dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show the dtype pandas inferred for each remaining column.\n", "df.dtypes" ] }, { "attachments": {}, "cell_type": "markdown", "id": "aeb25f1a", "metadata": {}, "source": [ "### Drop Null Values" ] }, { "cell_type": "code", "execution_count": 7, "id": "ce3c3319", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "artists 1\n", "track_name 1\n", "popularity 0\n", "duration_ms 0\n", "explicit 0\n", "danceability 0\n", "energy 0\n", "key 0\n", "loudness 0\n", "mode 0\n", "speechiness 0\n", "acousticness 0\n", "instrumentalness 0\n", "liveness 0\n", "valence 0\n", "tempo 0\n", "time_signature 0\n", "track_genre 0\n", "dtype: int64\n" ] } ], "source": [ "# Report how many values are missing in each column, then keep only complete rows.\n", "missing_per_column = df.isna().sum()\n", "print(missing_per_column)\n", "df = df.dropna(axis=0, how='any')" ] }, { "attachments": {}, "cell_type": "markdown", "id": "de7960de", "metadata": {}, "source": [ "### Drop Duplicated Rows (Same artists and track_name)" ] }, { "cell_type": "code", "execution_count": 8, "id": "eb46cc03", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
artiststrack_namepopularityduration_msexplicitdanceabilityenergykeyloudnessmodespeechinessacousticnessinstrumentalnesslivenessvalencetempotime_signaturetrack_genre
18Jason Mraz;Colbie CaillatLucky68189613False0.6250.4140-8.70010.03690.294000.0000000.15100.6690130.0884acoustic
20Jason MrazI'm Yours75242946False0.7030.44411-9.33110.04170.559000.0000000.09730.7120150.9604acoustic
22A Great Big World;Christina AguileraSay Something70229400False0.4070.1472-8.82210.03550.857000.0000030.09130.0765141.2843acoustic
28Jason MrazWinter Wonderland0131760False0.6200.3095-9.20910.04950.788000.0000000.14600.6640145.3634acoustic
29Jason MrazWinter Wonderland0131760False0.6200.3095-9.20910.04950.788000.0000000.14600.6640145.3634acoustic
.........................................................
113845Hillsong Worship;Brooke LigertwoodKing Of Kings - Live at Hillsong Conference40291565False0.4540.4272-8.04910.02900.020500.0000000.69000.1840135.8874world-music
113882Bryan & Katie TorwaltGood News - Live23266632False0.4730.4746-9.17510.05580.395000.0000000.16300.2510140.7464world-music
113917Hillsong Worship;Mi-kaisha RoseNever Walk Alone - Live41348619False0.4200.5535-8.04910.03320.141000.0000000.10300.2140143.8044world-music
113951Passion;Kristian StanfillMore Like Jesus - Live44338694False0.4040.67610-5.46810.03540.027400.0000000.35200.1630144.0563world-music
113991Chris TomlinAt The Cross (Love Ran Red)32250629False0.3870.5318-4.78810.02900.003050.0000000.20100.1530146.0034world-music
\n", "

32656 rows × 18 columns

\n", "
" ], "text/plain": [ " artists \\\n", "18 Jason Mraz;Colbie Caillat \n", "20 Jason Mraz \n", "22 A Great Big World;Christina Aguilera \n", "28 Jason Mraz \n", "29 Jason Mraz \n", "... ... \n", "113845 Hillsong Worship;Brooke Ligertwood \n", "113882 Bryan & Katie Torwalt \n", "113917 Hillsong Worship;Mi-kaisha Rose \n", "113951 Passion;Kristian Stanfill \n", "113991 Chris Tomlin \n", "\n", " track_name popularity duration_ms \\\n", "18 Lucky 68 189613 \n", "20 I'm Yours 75 242946 \n", "22 Say Something 70 229400 \n", "28 Winter Wonderland 0 131760 \n", "29 Winter Wonderland 0 131760 \n", "... ... ... ... \n", "113845 King Of Kings - Live at Hillsong Conference 40 291565 \n", "113882 Good News - Live 23 266632 \n", "113917 Never Walk Alone - Live 41 348619 \n", "113951 More Like Jesus - Live 44 338694 \n", "113991 At The Cross (Love Ran Red) 32 250629 \n", "\n", " explicit danceability energy key loudness mode speechiness \\\n", "18 False 0.625 0.414 0 -8.700 1 0.0369 \n", "20 False 0.703 0.444 11 -9.331 1 0.0417 \n", "22 False 0.407 0.147 2 -8.822 1 0.0355 \n", "28 False 0.620 0.309 5 -9.209 1 0.0495 \n", "29 False 0.620 0.309 5 -9.209 1 0.0495 \n", "... ... ... ... ... ... ... ... \n", "113845 False 0.454 0.427 2 -8.049 1 0.0290 \n", "113882 False 0.473 0.474 6 -9.175 1 0.0558 \n", "113917 False 0.420 0.553 5 -8.049 1 0.0332 \n", "113951 False 0.404 0.676 10 -5.468 1 0.0354 \n", "113991 False 0.387 0.531 8 -4.788 1 0.0290 \n", "\n", " acousticness instrumentalness liveness valence tempo \\\n", "18 0.29400 0.000000 0.1510 0.6690 130.088 \n", "20 0.55900 0.000000 0.0973 0.7120 150.960 \n", "22 0.85700 0.000003 0.0913 0.0765 141.284 \n", "28 0.78800 0.000000 0.1460 0.6640 145.363 \n", "29 0.78800 0.000000 0.1460 0.6640 145.363 \n", "... ... ... ... ... ... 
\n", "113845 0.02050 0.000000 0.6900 0.1840 135.887 \n", "113882 0.39500 0.000000 0.1630 0.2510 140.746 \n", "113917 0.14100 0.000000 0.1030 0.2140 143.804 \n", "113951 0.02740 0.000000 0.3520 0.1630 144.056 \n", "113991 0.00305 0.000000 0.2010 0.1530 146.003 \n", "\n", " time_signature track_genre \n", "18 4 acoustic \n", "20 4 acoustic \n", "22 3 acoustic \n", "28 4 acoustic \n", "29 4 acoustic \n", "... ... ... \n", "113845 4 world-music \n", "113882 4 world-music \n", "113917 4 world-music \n", "113951 3 world-music \n", "113991 4 world-music \n", "\n", "[32656 rows x 18 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows that repeat an (artists, track_name) pair already seen earlier in the frame.\n", "duplicated_rows = df[df.duplicated(subset=['artists', 'track_name'], keep='first')]\n", "\n", "# Display the duplicated rows\n", "duplicated_rows" ] }, { "cell_type": "code", "execution_count": 9, "id": "251df65d", "metadata": {}, "outputs": [], "source": [ "# Keep only the first occurrence of each (artists, track_name) pair.\n", "df = df.drop_duplicates(subset=['artists', 'track_name'], keep='first')" ] }, { "cell_type": "code", "execution_count": 10, "id": "d6eea5b5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "study 996\n", "black-metal 991\n", "comedy 987\n", "heavy-metal 985\n", "bluegrass 978\n", " ... \n", "rock 167\n", "reggae 166\n", "house 134\n", "indie 107\n", "reggaeton 63\n", "Name: track_genre, Length: 113, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A bare `df.shape` is not displayed unless it is the last expression of the cell, so print it.\n", "print(df.shape)\n", "# Number of remaining tracks per genre after de-duplication.\n", "df['track_genre'].value_counts()" ] }, { "attachments": {}, "cell_type": "markdown", "id": "363cf332", "metadata": {}, "source": [ "### Drop artists and track name columns" ] }, { "cell_type": "code", "execution_count": 11, "id": "2f11bf72", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popularityduration_msexplicitdanceabilityenergykeyloudnessmodespeechinessacousticnessinstrumentalnesslivenessvalencetempotime_signaturetrack_genre
073230666False0.6760.46101-6.74600.14300.03220.0000010.35800.71587.9174acoustic
155149610False0.4200.16601-17.23510.07630.92400.0000060.10100.26777.4894acoustic
257210826False0.4380.35900-9.73410.05570.21000.0000000.11700.12076.3324acoustic
371201933False0.2660.05960-18.51510.03630.90500.0000710.13200.143181.7403acoustic
482198853False0.6180.44302-9.68110.05260.46900.0000000.08290.167119.9494acoustic
\n", "
" ], "text/plain": [ " popularity duration_ms explicit danceability energy key loudness \\\n", "0 73 230666 False 0.676 0.4610 1 -6.746 \n", "1 55 149610 False 0.420 0.1660 1 -17.235 \n", "2 57 210826 False 0.438 0.3590 0 -9.734 \n", "3 71 201933 False 0.266 0.0596 0 -18.515 \n", "4 82 198853 False 0.618 0.4430 2 -9.681 \n", "\n", " mode speechiness acousticness instrumentalness liveness valence \\\n", "0 0 0.1430 0.0322 0.000001 0.3580 0.715 \n", "1 1 0.0763 0.9240 0.000006 0.1010 0.267 \n", "2 1 0.0557 0.2100 0.000000 0.1170 0.120 \n", "3 1 0.0363 0.9050 0.000071 0.1320 0.143 \n", "4 1 0.0526 0.4690 0.000000 0.0829 0.167 \n", "\n", " tempo time_signature track_genre \n", "0 87.917 4 acoustic \n", "1 77.489 4 acoustic \n", "2 76.332 4 acoustic \n", "3 181.740 3 acoustic \n", "4 119.949 4 acoustic " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop the two name columns and preview what is left.\n", "df = df.drop(columns=['artists', 'track_name'])\n", "df.head()" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e7d572f5", "metadata": {}, "source": [ "### Drop rows with an invalid tempo or time signature according to the Spotify API" ] }, { "cell_type": "code", "execution_count": 12, "id": "69b1cceb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4 71986\n", "3 6944\n", "5 1488\n", "1 775\n", "0 150\n", "Name: time_signature, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count how often each time_signature value occurs before filtering.\n", "df['time_signature'].value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "id": "39a08b22", "metadata": {}, "outputs": [], "source": [ "# Spotify documents time_signature as ranging from 3 to 7 (3/4 to 7/4);\n", "# values outside that range (0 and 1 in the counts above) are dropped.\n", "valid_time_signature = df['time_signature'].between(3, 7)\n", "# Rows with a non-positive tempo are dropped as invalid as well.\n", "valid_tempo = df['tempo'] > 0\n", "df = df[valid_time_signature & valid_tempo]" ] }, { "attachments": {}, "cell_type": "markdown", "id": "0b7c8cea", "metadata": {}, "source": [ "### Save the cleaned dataset to CSV" ] }, { "cell_type": "code", "execution_count": 14, "id": "8c064cb0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popularityduration_msexplicitdanceabilityenergykeyloudnessmodespeechinessacousticnessinstrumentalnesslivenessvalencetempotime_signaturetrack_genre
073230666False0.6760.46101-6.74600.14300.03220.0000010.35800.71587.9174acoustic
155149610False0.4200.16601-17.23510.07630.92400.0000060.10100.26777.4894acoustic
257210826False0.4380.35900-9.73410.05570.21000.0000000.11700.12076.3324acoustic
371201933False0.2660.05960-18.51510.03630.90500.0000710.13200.143181.7403acoustic
482198853False0.6180.44302-9.68110.05260.46900.0000000.08290.167119.9494acoustic
\n", "
" ], "text/plain": [ " popularity duration_ms explicit danceability energy key loudness \\\n", "0 73 230666 False 0.676 0.4610 1 -6.746 \n", "1 55 149610 False 0.420 0.1660 1 -17.235 \n", "2 57 210826 False 0.438 0.3590 0 -9.734 \n", "3 71 201933 False 0.266 0.0596 0 -18.515 \n", "4 82 198853 False 0.618 0.4430 2 -9.681 \n", "\n", " mode speechiness acousticness instrumentalness liveness valence \\\n", "0 0 0.1430 0.0322 0.000001 0.3580 0.715 \n", "1 1 0.0763 0.9240 0.000006 0.1010 0.267 \n", "2 1 0.0557 0.2100 0.000000 0.1170 0.120 \n", "3 1 0.0363 0.9050 0.000071 0.1320 0.143 \n", "4 1 0.0526 0.4690 0.000000 0.0829 0.167 \n", "\n", " tempo time_signature track_genre \n", "0 87.917 4 acoustic \n", "1 77.489 4 acoustic \n", "2 76.332 4 acoustic \n", "3 181.740 3 acoustic \n", "4 119.949 4 acoustic " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Final look at the cleaned frame before it is written out.\n", "df.head(n=5)" ] }, { "cell_type": "code", "execution_count": 15, "id": "12bea66e", "metadata": {}, "outputs": [], "source": [ "# Write the cleaned frame without the pandas row index.\n", "df.to_csv(path_or_buf=\"cleaned_dataset.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }