Bryan-Az committed on
Commit
9dd7d9c
·
1 Parent(s): 44725b7

linked mlops from git dir & added evaluation notebook

Browse files
mlops/notebooks/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Jupyter Notebooks, Exploratory Analysis and Prototyping
2
+
3
+ The Jupyter/Colab notebooks and Python scripts in this directory are used to create a processing pipeline that helps us generate data and load it into our project database. The chosen cloud database holds data related to:
4
+
5
+ 1. User-to-Song rating/playback information.
6
+ 2. Song-to-Song similarity information based on musicality.
7
+
8
+ These two sets of data will be used to train machine learning models within the MLOps phase of our project.
mlops/notebooks/label_encoder.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cd6ec677ead34e520071f2936debeaf08153f6437798670b574a8425884aa74
3
+ size 241117
mlops/notebooks/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66c599a48aa1280d790daa9f7cff45271edb5952ec9d8401d53e470af322801c
3
+ size 2772520
mlops/notebooks/model_training.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"5ByvVHnFr-s1"},"source":["Get million song subset data song list\n","Get metadata and join the data\n","\n","use artist similarity and artists to train the model on similarity\n","\n","use last.fm to get additional data on each song to augment this"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1048,"status":"ok","timestamp":1715387243880,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"dO4mavAdsELi","outputId":"d0064229-e1a9-4875-e8f1-ee2b19e36855"},"outputs":[{"ename":"ModuleNotFoundError","evalue":"No module named 'google.colab'","output_type":"error","traceback":["\u001b[1;31m---------------------------------------------------------------------------\u001b[0m","\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)","Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgoogle\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcolab\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m drive\n\u001b[0;32m 2\u001b[0m drive\u001b[38;5;241m.\u001b[39mmount(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/content/drive\u001b[39m\u001b[38;5;124m'\u001b[39m)\n","\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'google.colab'"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":7109,"status":"ok","timestamp":1715387250988,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"RktUo1FTsTm4","outputId":"ef21ad0f-4c34-4693-f5de-e6f5617465d7"},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement 
already satisfied: python-dotenv in /usr/local/lib/python3.10/dist-packages (1.0.1)\n"]}],"source":["!pip install python-dotenv"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":1591,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"AkHtP67Sr-s2"},"outputs":[],"source":["# imports\n","import pandas as pd\n","import h5py\n","import os\n","from sqlalchemy import create_engine\n","import requests\n","import time\n","from dotenv import load_dotenv"]},{"cell_type":"code","execution_count":4,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"vIJPMBNFr-s3"},"outputs":[],"source":["pd.set_option('display.max_rows', 100)"]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":35},"executionInfo":{"elapsed":2,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"kShsPDUoW0Tm","outputId":"5c3cb25e-e38f-4d9d-9102-1e18fd73618a"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'/content'"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["os.getcwd()"]},{"cell_type":"code","execution_count":6,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"kmdF3bjKW1kz"},"outputs":[],"source":["os.chdir('/content/drive/MyDrive/CMPE-258: Team Neurobytes/Neurobytes/mlops/notebooks')"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":318,"status":"ok","timestamp":1715387252893,"user":{"displayName":"Bryan Alexis 
Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"JKeiMiYMWwpM","outputId":"1eee2de4-7144-45fb-9269-a7eed567bc2d"},"outputs":[{"name":"stdout","output_type":"stream","text":["label_encoder.joblib model_training.ipynb scaler.joblib\t tracks_eda.ipynb\n","model.pth\t README.md\t\t test_spotify_api.ipynb users_eda.ipynb\n"]}],"source":["! ls"]},{"cell_type":"markdown","metadata":{"id":"sk1jv62kr-s3"},"source":["# Loading Data"]},{"cell_type":"markdown","metadata":{"id":"rTCKoervr-s3"},"source":["## Loading million song subset data"]},{"cell_type":"code","execution_count":9,"metadata":{"executionInfo":{"elapsed":162,"status":"ok","timestamp":1715364589548,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"zdtzGTb4r-s3"},"outputs":[],"source":["# load the data (only loading song_id, metadata contains the rest)\n","def read_song_features(file_path):\n"," with h5py.File(file_path, 'r') as f:\n"," song_id = f['metadata']['songs']['song_id'][0].decode('utf-8')\n"," return {'song_id': song_id}\n","\n","\n","# process all files in a directory into a df\n","def process_all_files_to_dataframe(root_dir):\n"," data = []\n"," print(f\"Checking directory: {root_dir}\")\n","\n"," for subdir, dirs, files in os.walk(root_dir):\n"," print(f\"Currently scanning {subdir} with {len(files)} files\")\n"," for file in files:\n"," if file.endswith('.h5'):\n"," file_path = os.path.join(subdir, file)\n"," print(f\"Processing file: {file_path}\")\n"," song_data = read_song_features(file_path)\n"," data.append(song_data)\n","\n"," if not data:\n"," print(\"No data to process.\")\n","\n"," df = pd.DataFrame(data)\n"," return df"]},{"cell_type":"code","execution_count":11,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":163,"status":"ok","timestamp":1715364643278,"user":{"displayName":"Bryan Alexis 
Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"4WPOjZXOr-s3","outputId":"8ebedd7f-7741-4d5f-d46b-8f8399ff1c54"},"outputs":[{"name":"stdout","output_type":"stream","text":["Checking directory: ../../../data/\n","No data to process.\n"]}],"source":["root_dir = 'data/MillionSongSubset'\n","df = process_all_files_to_dataframe(root_dir)"]},{"cell_type":"markdown","metadata":{"id":"cM5Cf9MEr-s3"},"source":["### Loading million song subset metadata from sqlite db"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"JBos93r4r-s3"},"outputs":[],"source":["# load metadata from sqlite\n","def load_data_from_sqlite(db_path, table_name):\n"," engine = create_engine(f'sqlite:///{db_path}')\n"," query = f\"SELECT * FROM {table_name}\"\n"," df = pd.read_sql_query(query, engine)\n"," return df\n","\n","# load metadata and merge with song data\n","db_path3 = 'data/MillionSongSubsetMetadata/track_metadata.db'\n","df3 = load_data_from_sqlite(db_path3, 'songs')\n","df = df.merge(df3, on='song_id', how='left')\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"B2zIZ4T6r-s4"},"outputs":[],"source":["columns_to_drop = ['track_id', 'artist_id', 'song_id', 'artist_mbid', 'track_7digitalid', 'shs_perf', 'shs_work']\n","\n","for column in columns_to_drop:\n"," if column in df.columns:\n"," df.drop(columns=[column], inplace=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"NWP5gZpzr-s4"},"outputs":[],"source":["df.columns"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"TcgnhEffr-s4"},"outputs":[],"source":["df.head()"]},{"cell_type":"markdown","metadata":{"id":"9XEtFgNrr-s4"},"source":["## Loading last.fm data"]},{"cell_type":"code","execution_count":8,"metadata":{"executionInfo":{"elapsed":214,"status":"ok","timestamp":1715387284033,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"ce8MTL8dr-s4"},"outputs":[],"source":["def fetch_data(api_key, method, params):\n"," 
base_url = \"http://ws.audioscrobbler.com/2.0/\"\n"," params['api_key'] = api_key\n"," params['method'] = method\n"," params['format'] = 'json'\n"," response = requests.get(base_url, params=params)\n"," return response.json()\n","\n","\n","def get_artist_info(api_key, artist_name):\n"," params = {'artist': artist_name}\n"," return fetch_data(api_key, 'artist.getInfo', params)\n","\n","\n","def get_track_info(api_key, artist_name, track_name):\n"," params = {'artist': artist_name, 'track': track_name}\n"," return fetch_data(api_key, 'track.getInfo', params)\n","\n","\n","def batch_fetch_data(api_key, items, fetch_function, sleep_time=1):\n"," results = []\n"," for item in items:\n"," result = fetch_function(api_key, *item)\n"," results.append(result)\n"," # time.sleep(sleep_time)\n"," return results"]},{"cell_type":"code","execution_count":9,"metadata":{"executionInfo":{"elapsed":1,"status":"ok","timestamp":1715387284241,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"4NqoJJOYr-s4"},"outputs":[],"source":["# load LASTFM_API_KEY from .env\n","import requests\n","load_dotenv()\n","api_key = os.getenv('LASTFM_API_KEY')\n","\n","\n","def fetch_lastfm_data(api_key, artist_name, track_name):\n"," base_url = \"http://ws.audioscrobbler.com/2.0/\"\n"," params = {\n"," 'method': 'track.getInfo',\n"," 'api_key': api_key,\n"," 'artist': artist_name,\n"," 'track': track_name,\n"," 'format': 'json'\n"," }\n"," response = requests.get(base_url, params=params)\n"," if response.status_code == 200 and response.text.strip():\n"," return response.json()\n"," else:\n"," return None\n","\n","\n","def parse_lastfm_data(data):\n"," if data and 'track' in data:\n"," track = data['track']\n"," return {\n"," 'listeners': track.get('listeners', '0'),\n"," 'playcount': track.get('playcount', '0'),\n"," 'tags': ', '.join(tag['name'] for tag in track.get('toptags', {}).get('tag', [])),\n"," }\n"," return 
None"]},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":216},"executionInfo":{"elapsed":357,"status":"error","timestamp":1715387298868,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"prQYTEpGr-s4","outputId":"7f04e536-cd64-41d8-861f-1ef035e99cab"},"outputs":[{"ename":"NameError","evalue":"name 'df' is not defined","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)","\u001b[0;32m<ipython-input-10-da14a3af7ef7>\u001b[0m in \u001b[0;36m<cell line: 6>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mload_dotenv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mapi_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetenv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'LASTFM_API_KEY'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0msubset_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mtracks_skipped\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"]}],"source":["from tqdm import tqdm\n","tqdm.pandas()\n","\n","load_dotenv()\n","api_key = os.getenv('LASTFM_API_KEY')\n","subset_df = df.head(1000)\n","\n","tracks_skipped = 0\n","\n","\n","def fetch_and_parse(row):\n"," global tracks_skipped\n"," data = fetch_lastfm_data(api_key, 
row['artist_name'], row['title'])\n"," if data is None:\n"," tracks_skipped += 1\n"," return None\n"," parsed_data = parse_lastfm_data(data)\n"," if parsed_data is None:\n"," tracks_skipped += 1\n"," return parsed_data\n","\n","\n","# Use progress_apply instead of apply\n","subset_df['lastfm_data'] = subset_df.progress_apply(fetch_and_parse, axis=1)\n","\n","# Remove rows where lastfm_data is None\n","subset_df = subset_df[subset_df['lastfm_data'].notna()]\n","\n","subset_df.reset_index(drop=True, inplace=True)\n","track_details_df = pd.json_normalize(subset_df['lastfm_data'])\n","mixed = pd.concat(\n"," [subset_df.drop(columns=['lastfm_data']), track_details_df], axis=1)\n","\n","print(f\"Tracks skipped: {tracks_skipped}\")\n","\n","mixed.to_csv('data/music_data_small.csv', index=False)"]},{"cell_type":"markdown","metadata":{"id":"0wxV_-P6r-s5"},"source":["## Data processing"]},{"cell_type":"code","execution_count":191,"metadata":{"executionInfo":{"elapsed":191,"status":"ok","timestamp":1715390440593,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"vMUDiJbjr-s5"},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"..\\..\\db\\data\\music_data.csv\")\n","df.dropna(inplace=True)"]},{"cell_type":"code","execution_count":192,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":258},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1715390440851,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"gHQ3NQr7rBN4","outputId":"6897914c-b362-4ea0-94e0-a5030f95fe88"},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," 
<th></th>\n"," <th>title</th>\n"," <th>release</th>\n"," <th>artist_name</th>\n"," <th>duration</th>\n"," <th>artist_familiarity</th>\n"," <th>artist_hotttnesss</th>\n"," <th>year</th>\n"," <th>listeners</th>\n"," <th>playcount</th>\n"," <th>tags</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>100 Club 1996 ''We Love You Beatles'' - Live</td>\n"," <td>Sex Pistols - The Interviews</td>\n"," <td>Sex Pistols</td>\n"," <td>88.73751</td>\n"," <td>0.731184</td>\n"," <td>0.549204</td>\n"," <td>0</td>\n"," <td>172</td>\n"," <td>210</td>\n"," <td>The Beatles, title is a full sentence</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Yo Quiero Contigo</td>\n"," <td>Sentenciados - Platinum Edition</td>\n"," <td>Baby Rasta &amp; Gringo</td>\n"," <td>167.36608</td>\n"," <td>0.610186</td>\n"," <td>0.355320</td>\n"," <td>0</td>\n"," <td>9753</td>\n"," <td>16911</td>\n"," <td>Reggaeton, alexis y fido, Eliana, mis videos, ...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Emerald</td>\n"," <td>Emerald</td>\n"," <td>Bedrock</td>\n"," <td>501.86404</td>\n"," <td>0.654039</td>\n"," <td>0.390625</td>\n"," <td>2004</td>\n"," <td>973</td>\n"," <td>2247</td>\n"," <td>dance</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>Karma</td>\n"," <td>The Diary Of Alicia Keys</td>\n"," <td>Alicia Keys</td>\n"," <td>255.99955</td>\n"," <td>0.933916</td>\n"," <td>0.778674</td>\n"," <td>2003</td>\n"," <td>250304</td>\n"," <td>1028356</td>\n"," <td>rnb, soul, Alicia Keys, female vocalists, Karma</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>Money Blues</td>\n"," <td>Slidetime</td>\n"," <td>Joanna Connor</td>\n"," <td>243.66975</td>\n"," <td>0.479218</td>\n"," <td>0.332857</td>\n"," <td>0</td>\n"," <td>429</td>\n"," <td>1008</td>\n"," <td>guitar girl, blues</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" title \\\n","0 100 Club 1996 ''We Love You Beatles'' - Live \n","1 Yo Quiero Contigo \n","4 Emerald \n","6 Karma \n","7 Money Blues 
\n","\n"," release artist_name duration \\\n","0 Sex Pistols - The Interviews Sex Pistols 88.73751 \n","1 Sentenciados - Platinum Edition Baby Rasta & Gringo 167.36608 \n","4 Emerald Bedrock 501.86404 \n","6 The Diary Of Alicia Keys Alicia Keys 255.99955 \n","7 Slidetime Joanna Connor 243.66975 \n","\n"," artist_familiarity artist_hotttnesss year listeners playcount \\\n","0 0.731184 0.549204 0 172 210 \n","1 0.610186 0.355320 0 9753 16911 \n","4 0.654039 0.390625 2004 973 2247 \n","6 0.933916 0.778674 2003 250304 1028356 \n","7 0.479218 0.332857 0 429 1008 \n","\n"," tags \n","0 The Beatles, title is a full sentence \n","1 Reggaeton, alexis y fido, Eliana, mis videos, ... \n","4 dance \n","6 rnb, soul, Alicia Keys, female vocalists, Karma \n","7 guitar girl, blues "]},"execution_count":192,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":193,"metadata":{"executionInfo":{"elapsed":142,"status":"ok","timestamp":1715390441226,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"ltW1gvc7r-s5"},"outputs":[],"source":["import pandas as pd\n","import torch\n","from torch.utils.data import DataLoader\n","import torch.nn as nn\n","import torch.nn.functional as F\n","from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n","from sklearn.model_selection import train_test_split\n","import torch.optim as optim\n","\n","def label_encode_data(df):\n"," df = df.copy(deep=True)\n"," # Encode categorical data\n"," label_encoders = {}\n"," unknown_label = 'unknown' # Define an unknown label\n","\n"," for column in ['tags', 'title']:\n"," le = LabelEncoder()\n","\n"," # Get unique categories plus an 'unknown' category\n"," unique_categories = df[column].unique().tolist()\n"," # Add 'unknown' to the list of categories\n"," unique_categories.append(unknown_label)\n","\n"," # Fit the LabelEncoder to these categories\n"," le.fit(unique_categories)\n"," df[column] = 
le.transform(df[column].astype(str))\n","\n"," # Store the encoder\n"," label_encoders[column] = le\n","\n"," return df, label_encoders\n","\n","\n","# Normalize numerical features\n","scaler = MinMaxScaler()\n","df[['listeners', 'playcount']] = scaler.fit_transform(\n"," df[['listeners', 'playcount']])\n","\n","# Label encode categorical features\n","df_scaled, label_encoder_training = label_encode_data(df)\n","\n","# Split data into features and target\n","X = df_scaled[['tags']]\n","y = df_scaled['title']\n","\n","# Split the dataset into training and testing sets\n","X_train, X_test, y_train, y_test = train_test_split(\n"," X, y, test_size=0.2, random_state=42)"]},{"cell_type":"code","execution_count":194,"metadata":{"executionInfo":{"elapsed":166,"status":"ok","timestamp":1715390465207,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"G3RFJN6Ur-s5"},"outputs":[],"source":["class SongRecommender(nn.Module):\n"," def __init__(self):\n"," super(SongRecommender, self).__init__()\n"," self.fc1 = nn.Linear(1, 128) # Adjust input features if needed\n"," self.fc2 = nn.Linear(128, 256)\n"," self.fc3 = nn.Linear(256, 128)\n"," # Output size = number of unique titles including 'unknown'\n"," # Add 1 for the 'unknown' label\n"," self.output = nn.Linear(128, len(y.unique()) + 1)\n","\n"," def forward(self, x):\n"," x = F.relu(self.fc1(x))\n"," x = F.relu(self.fc2(x))\n"," x = F.relu(self.fc3(x))\n"," x = self.output(x)\n"," return x\n","\n","\n","model = SongRecommender()\n","optimizer = optim.Adam(model.parameters(), lr=0.001)\n","criterion = nn.CrossEntropyLoss()"]},{"cell_type":"code","execution_count":195,"metadata":{"executionInfo":{"elapsed":160,"status":"ok","timestamp":1715390466326,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"-HenNJLnr-s5"},"outputs":[],"source":["def train_model(model, X_train, y_train, X_test, y_test):\n"," train_loader = DataLoader(\n"," 
list(zip(X_train.values.astype(float), y_train)), batch_size=10, shuffle=True)\n"," test_loader = DataLoader(\n"," list(zip(X_test.values.astype(float), y_test)), batch_size=10, shuffle=False)\n","\n"," model.train()\n"," for epoch in range(10): # Number of epochs\n"," train_loss = 0\n"," for features, labels in train_loader:\n"," optimizer.zero_grad()\n"," outputs = model(torch.tensor(features).float())\n"," # Ensure labels are long type\n"," loss = criterion(outputs, torch.tensor(labels).long())\n"," loss.backward()\n"," optimizer.step()\n"," train_loss += loss.item()\n","\n"," # Validation phase\n"," model.eval()\n"," validation_loss = 0\n"," for features, labels in test_loader:\n"," outputs = model(torch.tensor(features).float())\n"," loss = criterion(outputs, torch.tensor(labels).long())\n"," validation_loss += loss.item()\n","\n"," print(f'Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}')"]},{"cell_type":"code","execution_count":196,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":138831,"status":"ok","timestamp":1715390606602,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"gNpxg0ANr-s5","outputId":"c7e9ce0c-3653-4e9a-b3ee-2b9d88da4364"},"outputs":[{"name":"stderr","output_type":"stream","text":["C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:12: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," outputs = model(torch.tensor(features).float())\n","C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:14: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," loss = 
criterion(outputs, torch.tensor(labels).long())\n","C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," outputs = model(torch.tensor(features).float())\n","C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:24: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," loss = criterion(outputs, torch.tensor(labels).long())\n"]},{"name":"stdout","output_type":"stream","text":["Epoch 1, Training Loss: 14.161600421387472, Validation Loss: 8.646175272324506\n","Epoch 2, Training Loss: 8.468926938374837, Validation Loss: 8.906991397633272\n","Epoch 3, Training Loss: 8.42033219749545, Validation Loss: 9.14518429251278\n","Epoch 4, Training Loss: 8.428513119544512, Validation Loss: 9.366180943507775\n","Epoch 5, Training Loss: 8.350075204872791, Validation Loss: 9.573424189698462\n","Epoch 6, Training Loss: 8.334989405267033, Validation Loss: 9.770331466899199\n","Epoch 7, Training Loss: 8.404972361340935, Validation Loss: 9.958629150016636\n","Epoch 8, Training Loss: 8.490517691624017, Validation Loss: 10.354363404068293\n","Epoch 9, Training Loss: 8.405202573611412, Validation Loss: 10.315738350737329\n","Epoch 10, Training Loss: 8.300552919175889, Validation Loss: 10.487916422825233\n"]}],"source":["train_model(model, X_train, y_train, X_test, y_test)"]},{"cell_type":"code","execution_count":197,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SongRecommender(\n"," (fc1): Linear(in_features=1, out_features=128, bias=True)\n"," (fc2): Linear(in_features=128, out_features=256, bias=True)\n"," (fc3): Linear(in_features=256, out_features=128, bias=True)\n"," (output): 
Linear(in_features=128, out_features=4855, bias=True)\n",")\n"]}],"source":["print(model)"]},{"cell_type":"code","execution_count":198,"metadata":{"executionInfo":{"elapsed":138,"status":"ok","timestamp":1715390703802,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"1i30qNdCr-s5"},"outputs":[],"source":["# save the model\n","torch.save(model.state_dict(), './model.pth')"]},{"cell_type":"code","execution_count":199,"metadata":{"executionInfo":{"elapsed":1,"status":"ok","timestamp":1715390703994,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"iCAMAEj5r-s5"},"outputs":[],"source":["# load the model\n","model = SongRecommender()"]},{"cell_type":"code","execution_count":200,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":292,"status":"ok","timestamp":1715390704465,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"jjkMlHqDHS-Z","outputId":"469d9395-1aa0-4695-98e0-ee867cd31e6b"},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>artist_name</th>\n"," <th>title</th>\n"," <th>tags</th>\n"," <th>listeners</th>\n"," <th>playcount</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Sex Pistols</td>\n"," <td>100 Club 1996 ''We Love You Beatles'' - Live</td>\n"," <td>The Beatles, title is a full sentence</td>\n"," <td>0.000070</td>\n"," <td>0.000009</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Baby Rasta &amp; Gringo</td>\n"," <td>Yo Quiero Contigo</td>\n"," <td>Reggaeton, alexis y fido, Eliana, mis videos, 
...</td>\n"," <td>0.003978</td>\n"," <td>0.000729</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Bedrock</td>\n"," <td>Emerald</td>\n"," <td>dance</td>\n"," <td>0.000397</td>\n"," <td>0.000097</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>Alicia Keys</td>\n"," <td>Karma</td>\n"," <td>rnb, soul, Alicia Keys, female vocalists, Karma</td>\n"," <td>0.102103</td>\n"," <td>0.044359</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>Joanna Connor</td>\n"," <td>Money Blues</td>\n"," <td>guitar girl, blues</td>\n"," <td>0.000175</td>\n"," <td>0.000043</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" artist_name title \\\n","0 Sex Pistols 100 Club 1996 ''We Love You Beatles'' - Live \n","1 Baby Rasta & Gringo Yo Quiero Contigo \n","4 Bedrock Emerald \n","6 Alicia Keys Karma \n","7 Joanna Connor Money Blues \n","\n"," tags listeners playcount \n","0 The Beatles, title is a full sentence 0.000070 0.000009 \n","1 Reggaeton, alexis y fido, Eliana, mis videos, ... 0.003978 0.000729 \n","4 dance 0.000397 0.000097 \n","6 rnb, soul, Alicia Keys, female vocalists, Karma 0.102103 0.044359 \n","7 guitar girl, blues 0.000175 0.000043 "]},"execution_count":200,"metadata":{},"output_type":"execute_result"}],"source":["df.loc[:, ['artist_name', 'title', 'tags', 'listeners', 'playcount']].head()"]},{"cell_type":"code","execution_count":201,"metadata":{"executionInfo":{"elapsed":186,"status":"ok","timestamp":1715390829249,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"-W3SwScgr-s5"},"outputs":[],"source":["def label_encode_data(df):\n"," df = df.copy(deep=True)\n"," # Encode categorical data\n"," label_encoders = {}\n"," unknown_label = 'unknown' # Define an unknown label\n","\n"," for column in ['tags']:\n"," le = LabelEncoder()\n","\n"," # Get unique categories plus an 'unknown' category\n"," unique_categories = df[column].unique().tolist()\n"," # Add 'unknown' to the list of categories\n"," 
unique_categories.append(unknown_label)\n","\n"," # Fit the LabelEncoder to these categories\n"," le.fit(unique_categories)\n"," df[column] = le.transform(df[column].astype(str))\n","\n"," # Store the encoder\n"," label_encoders[column] = le\n","\n"," return df, label_encoders\n","\n","\n","def recommend_songs(model, user_data, full_data=df, train_encoder=label_encoder_training):\n"," model.eval()\n"," full_data = full_data.copy(deep=True)\n"," with torch.no_grad():\n","\n"," # Create a DataFrame with feature names\n"," text_features = user_data.loc[:, ['tags']]\n","\n"," # encoding using concatenated full dataset and evaluation set for inference\n"," df = full_data.loc[:, ['tags']]\n"," text_features_full = df.loc[:, ['tags']]\n","\n"," all_labels = pd.concat([text_features, text_features_full], axis=0)\n"," all_labels.reset_index(drop=True, inplace=True)\n","\n"," # Get the encoder based on all categorical features\n"," _, label_encoders = label_encode_data(all_labels)\n","\n"," # encode the user data\n"," label_encoded_data = text_features.copy(deep=True)\n"," for column in ['tags']:\n"," label_encoded_data[column] = label_encoders[column].transform(\n"," label_encoded_data[column].astype(str))\n","\n"," # converting label_encoded_data into a torch tensor as float dtype\n"," all_features = torch.tensor(\n"," label_encoded_data.to_numpy()).float().unsqueeze(0)\n","\n"," # Make predictions\n"," predictions = model(all_features)\n","\n"," predictions = predictions[0, :5, :] # selecting top 5\n"," for row in predictions:\n"," top_5_values, top_5_indices = row.topk(5)\n"," recommended_song_ids = top_5_indices.squeeze().tolist()\n","\n"," try:\n"," recommended_titles = label_encoders['title'].inverse_transform(\n"," recommended_song_ids)\n"," recommended_tags = label_encoders['tags'].inverse_transform(\n"," recommended_song_ids)\n"," except:\n"," recommended_titles = train_encoder['title'].inverse_transform(\n"," recommended_song_ids)[:5]\n"," recommended_tags = 
train_encoder['tags'].inverse_transform(recommended_song_ids)[\n"," :5]\n","\n"," return list(zip(recommended_titles, recommended_tags))"]},{"cell_type":"code","execution_count":202,"metadata":{},"outputs":[],"source":["user_preferences = pd.read_csv(\"..\\\\..\\\\db\\\\data\\\\user_preferences.csv\")\n","user_preferences.drop('level_0', axis=1, inplace=True)"]},{"cell_type":"code","execution_count":203,"metadata":{"executionInfo":{"elapsed":180,"status":"ok","timestamp":1715388203675,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"o89SnI9-r-s5"},"outputs":[],"source":["import requests\n","\n","\n","def fetch_song_data(api_key, artist_name, track_name):\n"," url = \"http://ws.audioscrobbler.com/2.0/\"\n"," params = {\n"," 'method': 'track.getInfo',\n"," 'api_key': api_key,\n"," 'artist': artist_name,\n"," 'track': track_name,\n"," 'format': 'json'\n"," }\n"," response = requests.get(url, params=params)\n"," print(response.content)\n"," return response.json() if response.status_code == 200 else {}\n","\n","\n","def parse_song_data(song_data):\n"," if song_data and 'track' in song_data:\n"," track = song_data['track']\n"," return {\n"," 'artist_name': track['artist']['name'],\n"," 'tags': ', '.join([tag['name'] for tag in track.get('toptags', {}).get('tag', [])]),\n"," 'duration': float(track.get('duration', 0)),\n"," 'listeners': int(track.get('listeners', 0)),\n"," 'playcount': int(track.get('playcount', 0)),\n"," 'album': track.get('album', {}).get('title', 'Unknown')\n"," }\n"," return {}"]},{"cell_type":"markdown","metadata":{"id":"xm89R7m8Xh-G"},"source":["# Importing the User Data and Making Recommendations\n","Let's make recommendations using the sample user's preferences."]},{"cell_type":"code","execution_count":204,"metadata":{"executionInfo":{"elapsed":171,"status":"ok","timestamp":1715388938035,"user":{"displayName":"Bryan Alexis 
Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"ufzxTjO3YeXT"},"outputs":[],"source":["import numpy as np"]},{"cell_type":"code","execution_count":205,"metadata":{"executionInfo":{"elapsed":379,"status":"ok","timestamp":1715388938616,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"y3hXSZHnXhA7"},"outputs":[],"source":["user_preferences = pd.read_csv(\"..\\\\..\\\\db\\\\data\\\\user_preferences.csv\")\n","user_preferences.drop('level_0', axis=1, inplace=True)"]},{"cell_type":"code","execution_count":206,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":293},"executionInfo":{"elapsed":232,"status":"ok","timestamp":1715388938847,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"OU7aafogGu2t","outputId":"27703100-6719-44ca-8429-cec20e133bbe"},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>songID</th>\n"," <th>artist</th>\n"," <th>song</th>\n"," <th>link</th>\n"," <th>text</th>\n"," <th>userID</th>\n"," <th>listeners</th>\n"," <th>playcount</th>\n"," <th>tags</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>19632</td>\n"," <td>Toto</td>\n"," <td>You Are The Flower</td>\n"," <td>/t/toto/you+are+the+flower_20139737.html</td>\n"," <td>You never lose a minute, if in it there is lov...</td>\n"," <td>0</td>\n"," <td>25307</td>\n"," <td>87344</td>\n"," <td>AOR, rock, soft rock, 70s, pop rock</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>19632</td>\n"," <td>Toto</td>\n"," <td>You Are The Flower</td>\n"," <td>/t/toto/you+are+the+flower_20139737.html</td>\n"," <td>You 
never lose a minute, if in it there is lov...</td>\n"," <td>0</td>\n"," <td>25307</td>\n"," <td>87344</td>\n"," <td>AOR, rock, soft rock, 70s, pop rock</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>25284</td>\n"," <td>Billie Holiday</td>\n"," <td>I Only Have Eyes For You</td>\n"," <td>/b/billie+holiday/i+only+have+eyes+for+you_200...</td>\n"," <td>My love must be a kind of blind love, \\r\\nI c...</td>\n"," <td>0</td>\n"," <td>60356</td>\n"," <td>178625</td>\n"," <td>jazz, female vocal, vocal jazz, blues, female ...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>43594</td>\n"," <td>Michael Bolton</td>\n"," <td>Only A Woman Like You</td>\n"," <td>/m/michael+bolton/only+a+woman+like+you_101792...</td>\n"," <td>It's beautiful, your honesty \\r\\nYou cry when...</td>\n"," <td>0</td>\n"," <td>4595</td>\n"," <td>13266</td>\n"," <td>Ballad, romantic, soul, pop, cool</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>50200</td>\n"," <td>Rascal Flatts</td>\n"," <td>The Day Before You</td>\n"," <td>/r/rascal+flatts/the+day+before+you_10238985.html</td>\n"," <td>I had all but given up \\r\\nOn finding the one...</td>\n"," <td>0</td>\n"," <td>22077</td>\n"," <td>86012</td>\n"," <td>country, rock, contemporary country, seen live...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" songID artist song \\\n","0 19632 Toto You Are The Flower \n","1 19632 Toto You Are The Flower \n","2 25284 Billie Holiday I Only Have Eyes For You \n","3 43594 Michael Bolton Only A Woman Like You \n","4 50200 Rascal Flatts The Day Before You \n","\n"," link \\\n","0 /t/toto/you+are+the+flower_20139737.html \n","1 /t/toto/you+are+the+flower_20139737.html \n","2 /b/billie+holiday/i+only+have+eyes+for+you_200... \n","3 /m/michael+bolton/only+a+woman+like+you_101792... \n","4 /r/rascal+flatts/the+day+before+you_10238985.html \n","\n"," text userID listeners \\\n","0 You never lose a minute, if in it there is lov... 
0 25307 \n","1 You never lose a minute, if in it there is lov... 0 25307 \n","2 My love must be a kind of blind love, \\r\\nI c... 0 60356 \n","3 It's beautiful, your honesty \\r\\nYou cry when... 0 4595 \n","4 I had all but given up \\r\\nOn finding the one... 0 22077 \n","\n"," playcount tags \n","0 87344 AOR, rock, soft rock, 70s, pop rock \n","1 87344 AOR, rock, soft rock, 70s, pop rock \n","2 178625 jazz, female vocal, vocal jazz, blues, female ... \n","3 13266 Ballad, romantic, soul, pop, cool \n","4 86012 country, rock, contemporary country, seen live... "]},"execution_count":206,"metadata":{},"output_type":"execute_result"}],"source":["user_preferences.head()"]},{"cell_type":"code","execution_count":216,"metadata":{"executionInfo":{"elapsed":166,"status":"ok","timestamp":1715388941345,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"jVeX1VX9YXj9"},"outputs":[],"source":["sample_user = user_preferences.where(user_preferences['userID'] == np.random.randint(*(0, 9))).dropna()"]},{"cell_type":"markdown","metadata":{"id":"Yu9C90x4Y9lY"},"source":["Hopefully, the neural network makes recommendations of artists that fall into the top 5 for the user."]},{"cell_type":"code","execution_count":217,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":148,"status":"ok","timestamp":1715391053989,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"wnANm0R3YrVa","outputId":"9f890890-055b-41b9-c630-8986442dcaf9"},"outputs":[{"data":{"text/plain":["tags\n","romantic, Love, pop, easy listening, michael bolton 5.0\n","pop 4.0\n","loneliness after dusk, Madonna, demo, never let you go, rebel heart 3.0\n","alternative rock, pop, alternative, pop rock, OneRepublic 3.0\n","pop, boybands, dance, backstreet boys, love at first listen 3.0\n","dtype: float64"]},"execution_count":217,"metadata":{},"output_type":"execute_result"}],"source":["top_5 = 
sample_user.groupby('tags').count().mean(axis=1).sort_values(ascending=False)[:5]\n","top_5"]},{"cell_type":"code","execution_count":218,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":397,"status":"ok","timestamp":1715390877851,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"m9k_0jVCr-s5","outputId":"9610f359-503f-4b06-d36a-f07d4a79a687"},"outputs":[{"name":"stdout","output_type":"stream","text":["#### RECOMMENDATIONS ###\n"]},{"data":{"text/plain":["[('Blueberry Hill', 'Hip-Hop, hip hop, rap, underground hip-hop, political'),\n"," ('Prognosis', 'mpb, pop, 80s, latin, California'),\n"," ('Money Blues', 'hip hop, rap, Hip-Hop, LL Cool J, Timbaland'),\n"," ('Facedown', 'blues, Old Blues, guitar, slide guitar, gospel'),\n"," ('CB4', 'Kanye West, rnb, 00s, janet jackson, pop')]"]},"execution_count":218,"metadata":{},"output_type":"execute_result"}],"source":["print(\"#### RECOMMENDATIONS ###\")\n","song_recs = recommend_songs(model, sample_user, df) # requires giving main song df for finding embeddings\n","song_recs"]},{"cell_type":"code","execution_count":210,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":276},"executionInfo":{"elapsed":219,"status":"ok","timestamp":1715390907498,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"Klx_gv2v4i6x","outputId":"c2b4dd3e-a48e-411b-9a2c-7963ef922075"},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>title</th>\n"," <th>release</th>\n"," <th>artist_name</th>\n"," <th>duration</th>\n"," <th>artist_familiarity</th>\n"," 
<th>artist_hotttnesss</th>\n"," <th>year</th>\n"," <th>listeners</th>\n"," <th>playcount</th>\n"," <th>tags</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":["Empty DataFrame\n","Columns: [title, release, artist_name, duration, artist_familiarity, artist_hotttnesss, year, listeners, playcount, tags]\n","Index: []"]},"execution_count":210,"metadata":{},"output_type":"execute_result"}],"source":["# finding the song artist in the main dataset\n","df.loc[df['title'].isin(song_recs)]"]},{"cell_type":"code","execution_count":211,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"elapsed":428,"status":"ok","timestamp":1715391084128,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"AmOx_KyQRALU","outputId":"2010ab1c-ea64-4a00-8dc6-186d584d4868"},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>songID</th>\n"," <th>artist</th>\n"," <th>song</th>\n"," <th>link</th>\n"," <th>text</th>\n"," <th>userID</th>\n"," <th>listeners</th>\n"," <th>playcount</th>\n"," <th>tags</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":["Empty DataFrame\n","Columns: [songID, artist, song, link, text, userID, listeners, playcount, tags]\n","Index: []"]},"execution_count":211,"metadata":{},"output_type":"execute_result"}],"source":["# lets see how it compares to sample 
user\n","sample_user.where(sample_user['artist'].isin(top_5.index)).dropna()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"rqkW9oKPRe05"},"outputs":[],"source":[]}],"metadata":{"colab":{"collapsed_sections":["rTCKoervr-s3","9XEtFgNrr-s4"],"provenance":[]},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
mlops/notebooks/scaler.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fec4063e0e914dee18fee94336dd03b1ff78c991d7b22fd4e7aa0c9761bc91f6
3
+ size 1159
mlops/notebooks/test_spotify_api.ipynb ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "view-in-github",
7
+ "colab_type": "text"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/Bryan-Az/Neurobytes/blob/main/notebooks/test_spotify_api.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 9,
16
+ "metadata": {
17
+ "colab": {
18
+ "base_uri": "https://localhost:8080/"
19
+ },
20
+ "id": "zQVPVULlQjLy",
21
+ "outputId": "d949a567-1ee8-4b51-e544-5e6c1888f4e8"
22
+ },
23
+ "outputs": [
24
+ {
25
+ "output_type": "stream",
26
+ "name": "stdout",
27
+ "text": [
28
+ "Requirement already satisfied: python-dotenv in /usr/local/lib/python3.10/dist-packages (1.0.1)\n"
29
+ ]
30
+ },
31
+ {
32
+ "output_type": "execute_result",
33
+ "data": {
34
+ "text/plain": [
35
+ "True"
36
+ ]
37
+ },
38
+ "metadata": {},
39
+ "execution_count": 9
40
+ }
41
+ ],
42
+ "source": [
43
+ "import requests\n",
44
+ "# access colab secrets .env\n",
45
+ "!pip install python-dotenv\n",
46
+ "from dotenv import load_dotenv\n",
47
+ "load_dotenv()"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "source": [
53
+ "# Data Related to the Spotify API"
54
+ ],
55
+ "metadata": {
56
+ "id": "rx--dKrrWFAX"
57
+ }
58
+ },
59
+ {
60
+ "cell_type": "markdown",
61
+ "source": [
62
+ "Main Discovery Objective:\n",
63
+ "To identify data that can provide Song-to-Song similarity information based on musicality.\n",
64
+ "\n",
65
+ "\n",
66
+ "---\n",
67
+ "\n",
68
+ "The Spotify API can retrieve data for selected artists, albums, and shows, as well as other collections of Spotify content such as podcasts and playlists. These may help us build alternative sets of information, outside of the API itself, to use as a starting point."
69
+ ],
70
+ "metadata": {
71
+ "id": "0fZ70UJFWtQr"
72
+ }
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "source": [
77
+ "import os\n",
78
+ "SPOTIFY_CLIENT_ID = os.getenv('client_id')\n",
79
+ "SPOTIFY_CLIENT_SECRET = os.getenv('client_secret')"
80
+ ],
81
+ "metadata": {
82
+ "id": "rj6PkYhtRbex"
83
+ },
84
+ "execution_count": 10,
85
+ "outputs": []
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 88,
90
+ "metadata": {
91
+ "id": "n8zRio2XQjLz"
92
+ },
93
+ "outputs": [],
94
+ "source": [
95
def get_access_token(client_id, client_secret):
    """Request a Spotify access token using the client-credentials grant.

    Args:
        client_id: Spotify application client id (sent as the HTTP Basic user).
        client_secret: Spotify application client secret (HTTP Basic password).

    Returns:
        The ``access_token`` string from the token endpoint's JSON response.

    Raises:
        Exception: if the token endpoint answers with anything but HTTP 200.
    """
    url = 'https://accounts.spotify.com/api/token'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    payload = {
        'grant_type': 'client_credentials'
    }
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.post(
        url,
        headers=headers,
        data=payload,
        auth=(client_id, client_secret),
        timeout=10,
    )
    if response.status_code != 200:
        # Include the status so a bad id/secret is distinguishable from an outage.
        raise Exception(
            f"Failed to retrieve access token (HTTP {response.status_code})"
        )
    return response.json()['access_token']
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 89,
113
+ "metadata": {
114
+ "id": "LGblk66xQjL0"
115
+ },
116
+ "outputs": [],
117
+ "source": [
118
+ "access_token = get_access_token(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "source": [
124
+ "## Artist-Specific Data"
125
+ ],
126
+ "metadata": {
127
+ "id": "20wfCc7pV-Tk"
128
+ }
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 95,
133
+ "metadata": {
134
+ "id": "KU5bUwqjQjL1"
135
+ },
136
+ "outputs": [],
137
+ "source": [
138
def get_artist_data(access_token, artist_id, _retry=True):
    '''
    This function retrieves artist data from the Spotify API using the provided access token and artist ID.

    access_token: bearer token sent in the Authorization header.
    artist_id: Spotify artist id interpolated into the request URL.
    _retry: internal flag; when True a 401 response triggers exactly one
        retry with a freshly requested token.

    returns: the decoded JSON body of the artist endpoint.
    raises: Exception on 404 ("Artist not found") or any other failure.
    '''
    url = f'https://api.spotify.com/v1/artists/{artist_id}'
    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.json()
    if response.status_code == 404:
        raise Exception("Artist not found")
    if response.status_code == 401 and _retry:
        # The token was rejected: request a new one and actually use it.
        # Retrying with the old token (as before) would recurse forever.
        fresh_token = get_access_token(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)
        return get_artist_data(fresh_token, artist_id, _retry=False)
    raise Exception("Failed to retrieve artist data")
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 91,
162
+ "metadata": {
163
+ "colab": {
164
+ "base_uri": "https://localhost:8080/"
165
+ },
166
+ "id": "DP3CBI7EQjL2",
167
+ "outputId": "ad4edf96-d819-4bd3-e6e4-09e65895c753"
168
+ },
169
+ "outputs": [
170
+ {
171
+ "output_type": "stream",
172
+ "name": "stdout",
173
+ "text": [
174
+ "{'external_urls': {'spotify': 'https://open.spotify.com/artist/1vCWHaC5f2uS3yhpwWbIA6'}, 'followers': {'href': None, 'total': 22658238}, 'genres': ['dance pop', 'edm', 'pop', 'pop dance'], 'href': 'https://api.spotify.com/v1/artists/1vCWHaC5f2uS3yhpwWbIA6', 'id': '1vCWHaC5f2uS3yhpwWbIA6', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab6761610000e5ebae07171f989fb39736674113', 'width': 640}, {'height': 320, 'url': 'https://i.scdn.co/image/ab67616100005174ae07171f989fb39736674113', 'width': 320}, {'height': 160, 'url': 'https://i.scdn.co/image/ab6761610000f178ae07171f989fb39736674113', 'width': 160}], 'name': 'Avicii', 'popularity': 78, 'type': 'artist', 'uri': 'spotify:artist:1vCWHaC5f2uS3yhpwWbIA6'}\n"
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "artist_id = '1vCWHaC5f2uS3yhpwWbIA6' # This is the artist ID for Avicii\n",
180
+ "artist_data = get_artist_data(access_token, artist_id)\n",
181
+ "print(artist_data)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "source": [
187
+ "artist_data.keys()"
188
+ ],
189
+ "metadata": {
190
+ "colab": {
191
+ "base_uri": "https://localhost:8080/"
192
+ },
193
+ "id": "lM4V2WtzYrx_",
194
+ "outputId": "818de95e-ee31-48a0-cbbb-5951f84df08b"
195
+ },
196
+ "execution_count": 96,
197
+ "outputs": [
198
+ {
199
+ "output_type": "execute_result",
200
+ "data": {
201
+ "text/plain": [
202
+ "dict_keys(['external_urls', 'followers', 'genres', 'href', 'id', 'images', 'name', 'popularity', 'type', 'uri'])"
203
+ ]
204
+ },
205
+ "metadata": {},
206
+ "execution_count": 96
207
+ }
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "source": [
213
+ "artist_data['genres']"
214
+ ],
215
+ "metadata": {
216
+ "colab": {
217
+ "base_uri": "https://localhost:8080/"
218
+ },
219
+ "id": "ZoG-4QUemGg3",
220
+ "outputId": "bac86f37-b181-4867-9e02-83d8258aed08"
221
+ },
222
+ "execution_count": 127,
223
+ "outputs": [
224
+ {
225
+ "output_type": "execute_result",
226
+ "data": {
227
+ "text/plain": [
228
+ "['dance pop', 'edm', 'pop', 'pop dance']"
229
+ ]
230
+ },
231
+ "metadata": {},
232
+ "execution_count": 127
233
+ }
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "source": [
239
def get_artist_albums(access_token, artist_id, _retry=True):
    '''
    This function retrieves the albums of a given artist using the Spotify API.

    access_token: bearer token sent in the Authorization header.
    artist_id: Spotify artist id interpolated into the request URL.
    _retry: internal flag; when True a 401 response triggers exactly one
        retry with a freshly requested token.

    returns: the decoded JSON body of the albums endpoint.
        NOTE(review): per the Spotify docs this is a paging object whose
        album records are under the 'items' key - confirm against a live call.
    raises: Exception on 404 ("Artist not found") or any other failure.
    '''
    # part 1 extracts the album information without the tracks
    include_groups = 'album'
    url = f'https://api.spotify.com/v1/artists/{artist_id}/albums'
    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    # Let requests build and encode the query string.
    response = requests.get(
        url,
        headers=headers,
        params={'include_groups': include_groups},
        timeout=10,
    )
    if response.status_code == 200:
        return response.json()
    if response.status_code == 404:
        raise Exception("Artist not found")
    if response.status_code == 401 and _retry:
        # Previously the new token was thrown away and a string was returned,
        # which broke every caller expecting album data.
        fresh_token = get_access_token(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)
        return get_artist_albums(fresh_token, artist_id, _retry=False)
    raise Exception("Failed to retrieve artist data")
261
+ ],
262
+ "metadata": {
263
+ "id": "lRMDVG0CY_xr"
264
+ },
265
+ "execution_count": 122,
266
+ "outputs": []
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "source": [
271
+ "# setting include_group to album only to return only this artists own albums\n",
272
+ "artist_albums = get_artist_albums(access_token, artist_id)"
273
+ ],
274
+ "metadata": {
275
+ "id": "IXZFbWG-ZGvN"
276
+ },
277
+ "execution_count": 123,
278
+ "outputs": []
279
+ },
280
+ {
281
+ "cell_type": "markdown",
282
+ "source": [
283
+ "The metadata available for each of the artist's returned albums."
284
+ ],
285
+ "metadata": {
286
+ "id": "KzKBzVnBa2Gc"
287
+ }
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "source": [
292
# might be redundant information if the include_group is set
# artist_albums is the raw JSON response (a dict); iterating it directly
# yields its string keys, so walk the album records stored under 'items'.
for album in artist_albums['items']:
    print(album['name'] + ' ' + album['album_group'] + ' ' + album['album_type'])
295
+ ],
296
+ "metadata": {
297
+ "colab": {
298
+ "base_uri": "https://localhost:8080/"
299
+ },
300
+ "id": "lfEf-p2ybHWW",
301
+ "outputId": "fe39c159-cefa-4633-83a6-798e0ce1a651"
302
+ },
303
+ "execution_count": 78,
304
+ "outputs": [
305
+ {
306
+ "output_type": "stream",
307
+ "name": "stdout",
308
+ "text": [
309
+ "TIM album album\n",
310
+ "Stories album album\n",
311
+ "True: Avicii By Avicii album album\n",
312
+ "The Days / Nights album album\n",
313
+ "True (Bonus Edition) album album\n",
314
+ "True album album\n",
315
+ "Malo (The Cube Guys Remix) single single\n",
316
+ "Street Dancer (Sgt Slick's Discotizer 2022 Remix) single single\n",
317
+ "My Feelings For You (Mark Knight Remix) single single\n",
318
+ "My Feelings For You (Don Diablo Remix) single single\n",
319
+ "Forever Yours (Avicii Tribute) single single\n",
320
+ "Fades Away (feat. MishCatt) [Tribute Concert Version] single single\n",
321
+ "Heaven (David Guetta & MORTEN Remix) single single\n",
322
+ "Tough Love (Tiësto Remix) single single\n",
323
+ "SOS (Laidback Luke Tribute Remix) single single\n",
324
+ "Tough Love (feat. Vargas & Lagola) single single\n",
325
+ "SOS (feat. Aloe Blacc) single single\n",
326
+ "Lonely Together (Remixes) single single\n",
327
+ "Lonely Together (Acoustic) single single\n",
328
+ "Without You (Remixes) single single\n"
329
+ ]
330
+ }
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "source": [],
336
+ "metadata": {
337
+ "id": "G4IPmxegYmHN"
338
+ },
339
+ "execution_count": null,
340
+ "outputs": []
341
+ }
342
+ ],
343
+ "metadata": {
344
+ "kernelspec": {
345
+ "display_name": "Python 3",
346
+ "language": "python",
347
+ "name": "python3"
348
+ },
349
+ "language_info": {
350
+ "codemirror_mode": {
351
+ "name": "ipython",
352
+ "version": 3
353
+ },
354
+ "file_extension": ".py",
355
+ "mimetype": "text/x-python",
356
+ "name": "python",
357
+ "nbconvert_exporter": "python",
358
+ "pygments_lexer": "ipython3",
359
+ "version": "3.8.19"
360
+ },
361
+ "colab": {
362
+ "provenance": [],
363
+ "include_colab_link": true
364
+ }
365
+ },
366
+ "nbformat": 4,
367
+ "nbformat_minor": 0
368
+ }
mlops/notebooks/tracks_eda.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mlops/notebooks/users_eda.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mlops/scripts/create_user_preferences/requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==1.3.3
2
+ nltk
3
+ scikit-learn
4
+ pydrive
5
+ oauth2client
6
+ google-cloud-storage
7
+ google-auth
8
+ google-auth-oauthlib
9
+ tqdm
10
+ python-dotenv
11
+ google-api-python-client
mlops/scripts/create_user_preferences/run.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Creates a virtualenv, installs the requirements and runs the synthetic
# user-preference generation script.

# Abort on the first failing step instead of running Python in a broken env.
set -eo pipefail

# requirements.txt and src/ are referenced relative to this script, so run
# from the script's own directory regardless of where it was invoked.
cd "$(dirname "${BASH_SOURCE[0]}")"

# Step a: Install dependencies
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt

# Step b: Run necessary parts of the codebase
#python src/document_term_matrix.py &
#python src/cosine_similarity.py &
python src/synthetic_user_data.py &

# A bare `wait` always returns 0; waiting on each pid propagates job failures.
for pid in $(jobs -p); do
    wait "$pid"
done
mlops/scripts/create_user_preferences/run_search.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Creates a virtualenv, installs the requirements and runs the Google Drive
# search smoke test in src/data_loader.py.

# Abort on the first failing step instead of running Python in a broken env.
set -eo pipefail

# requirements.txt and src/ are referenced relative to this script, so run
# from the script's own directory regardless of where it was invoked.
cd "$(dirname "${BASH_SOURCE[0]}")"

# Step a: Install dependencies
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt

python3 src/data_loader.py &

# A bare `wait` always returns 0; waiting on the pid propagates a failure.
wait $!
mlops/scripts/create_user_preferences/src/config.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import os

# Configuration for data paths.
#
# Every path is anchored on this file's location rather than on the process
# working directory, so the values are the same no matter where the scripts
# are launched from. Previously DATA_DIR / PROJECT_DIR were only correct when
# the current directory was mlops/scripts/create_user_preferences.

# Directory containing this file: <repo>/mlops/scripts/create_user_preferences/src
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Repository root: four levels above src/.
PROJECT_DIR = os.path.abspath(os.path.join(BASE_DIR, '..', '..', '..', '..'))

# Shared data directory: <repo>/db/data
DATA_DIR = os.path.join(PROJECT_DIR, 'db', 'data')
mlops/scripts/create_user_preferences/src/cosine_similarity.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.metrics.pairwise import cosine_similarity
3
+ from document_term_matrix import create_document_term_matrix
4
+ from data_loader import load_data
5
+ from config import DATA_DIR
6
def calculate_cosine_similarity(dtm):
    """Compute the cosine-similarity matrix for a document-term matrix.

    Thin wrapper that forwards ``dtm`` unchanged to scikit-learn's
    ``cosine_similarity`` and returns its result.
    """
    similarity = cosine_similarity(dtm)
    return similarity
9
+
10
if __name__ == "__main__":
    # load_data takes (file_id, file_name, n). The Drive file id is only used
    # when the file is not already present in DATA_DIR, so None is sufficient
    # for a local copy; supply a real id to enable the Drive download.
    data = load_data(file_id=None, file_name='millionsong_dataset.zip')
    if data is None:
        raise SystemExit(
            "millionsong_dataset.zip could not be loaded from DATA_DIR or Google Drive"
        )
    # The corpus builder reads the 'artist', 'song' and 'text' columns, so it
    # needs the whole frame rather than just the lyrics column.
    dtm, features = create_document_term_matrix(data)
    similarity_matrix = calculate_cosine_similarity(dtm)
mlops/scripts/create_user_preferences/src/data_loader.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from config import DATA_DIR, PROJECT_DIR
3
+ import os
4
+ import io
5
+ import google
6
+ from google.oauth2 import credentials
7
+ from google.oauth2.credentials import Credentials
8
+ from google_auth_oauthlib.flow import InstalledAppFlow
9
+ from googleapiclient.discovery import build
10
+ from googleapiclient.http import MediaIoBaseDownload
11
+ from googleapiclient.http import MediaFileUpload
12
+ import socket
13
+
14
def is_port_in_use(port):
    """Return True when a TCP connection to ``localhost:port`` succeeds."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex reports the outcome as an error code (0 == connected)
        # instead of raising on a refused connection.
        status = probe.connect_ex(('localhost', port))
    finally:
        probe.close()
    return status == 0
17
+
18
def load_credentials():
    """Load or create new credentials.

    Resolution order:
      1. Application Default Credentials, requested with the Drive scope.
      2. If port 8080 is free, the interactive OAuth flow using
         ``client_secrets.json`` in PROJECT_DIR; the result is saved to
         ``token.json`` in the current working directory.
      3. Otherwise, a previously saved ``token.json``.

    Returns:
        The credentials object obtained from one of the steps above.

    Raises:
        FileNotFoundError: if ``token.json`` is needed but does not exist.
    """
    SCOPES = ['https://www.googleapis.com/auth/drive']
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the
    # first time.
    token_path = 'token.json'

    try:
        # Ask for the Drive scope explicitly; unscoped default credentials may
        # be rejected by the Drive API.
        creds, _ = google.auth.default(scopes=SCOPES)
        return creds
    except google.auth.exceptions.DefaultCredentialsError:
        pass

    if not is_port_in_use(8080):
        # Run the flow using the client secrets file
        path_to_json = PROJECT_DIR + "/client_secrets.json"
        print(path_to_json)
        flow = InstalledAppFlow.from_client_secrets_file(path_to_json, SCOPES)
        creds = flow.run_local_server(port=8080, access_type='offline', prompt='consent')
        with open(token_path, 'w') as token:
            token.write(creds.to_json())
    else:
        # Port 8080 is taken, so the local-server flow cannot be started;
        # fall back to whatever token was saved earlier.
        print('In else in load creds')

    try:
        creds = credentials.Credentials.from_authorized_user_file(token_path, SCOPES)
    except FileNotFoundError:
        print("Token file not found. Please re-run the authentication flow.")
        raise

    if isinstance(creds, Credentials):
        print("creds is a valid Credentials object.")
    else:
        print("creds is not a valid Credentials object.")
    return creds
55
+
56
def search_files(service, file_name):
    '''
    helps search for files in the neurobytes google folder

    service: a Drive v3 service object. When None, credentials are loaded and
        a service is built here.
    file_name: exact file name to look for (trashed files are excluded).

    returns: list of {'id', 'name'} dicts for the matches (at most 10); an
        empty list when nothing matches or when any error occurs (the error
        is printed, not raised).
    '''
    items = []
    try:
        if service is None:
            # Only authenticate when the caller did not supply a service;
            # rebuilding it unconditionally discarded the argument and could
            # trigger a second OAuth flow.
            creds = load_credentials()
            print('credentials loaded')
            service = build('drive', 'v3', credentials=creds)
            print('auth with gdrive')

        # Drive query strings are single-quoted: escape backslashes and quotes
        # so names such as "Valentine's Day" do not break the query.
        safe_name = file_name.replace('\\', '\\\\').replace("'", "\\'")

        results = service.files().list(
            pageSize=10,
            fields="nextPageToken, files(id, name)",
            q=f"name='{safe_name}' and trashed=false"
        ).execute()
        items = results.get('files', [])

        if not items:
            print('No files found.')
        else:
            print('Files:')
            for item in items:
                print(u'{0} ({1})'.format(item['name'], item['id']))

    except Exception as e:
        print('An error occured in auth with google')
        print(e)
    return items
86
+
87
def _read_sample(file_path, n):
    """Read the CSV at ``file_path`` and return a random sample of up to ``n`` rows."""
    data = pd.read_csv(file_path)
    print(f"Data loaded successfully from {file_path}")
    # Cap the sample size: sampling without replacement raises ValueError
    # when n exceeds the number of rows.
    sample_size = min(n, len(data))
    # drop=False keeps the original row labels as a column in the result.
    return data.sample(sample_size, replace=False).reset_index(drop=False)


def _download_from_drive(file_id, file_path):
    """Download the Drive file ``file_id`` and write it to ``file_path``."""
    creds = load_credentials()
    print('credentials loaded')
    service = build('drive', 'v3', credentials=creds)
    print('auth with gdrive')
    request = service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request, chunksize=204800)  # Adjust chunk size as needed
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download progress: {0}".format(status.progress() * 100))
    # Write only after the download finished so a failed transfer does not
    # leave a partial file behind that later runs would treat as valid.
    with open(file_path, 'wb') as f:
        f.write(buffer.getvalue())
    print(f"Download of '{file_path}' complete.")


def load_data(file_id, file_name, n=10000):
    """
    Load data from a specified file within the data directory, downloading it
    from Google Drive first when it is not present locally.
    Only a random sample of up to ``n`` rows is returned,
    as the original dataset is very large.
    Args:
        file_id (str): Google Drive id of the file; only used when
            ``file_name`` does not already exist in DATA_DIR.
        file_name (str): The name of the file to load.
        n (int): Number of rows to sample without replacement. Capped at the
            number of rows in the file.

    Returns:
        DataFrame: A pandas DataFrame containing the sampled rows, or None
        when the Drive download or the subsequent read fails (the error is
        printed).
    """
    file_path = os.path.join(DATA_DIR, file_name)
    if os.path.exists(file_path):
        return _read_sample(file_path, n)

    try:
        _download_from_drive(file_id, file_path)
        print('Data accessed from glink')
        return _read_sample(file_path, n)
    except Exception as e:
        print('An error occured in auth with google')
        print(e)
        return None
136
+
137
def upload_data(filename, creds, folder_id='1VxknqmOtEsoCM3R0DQEOYpjMVUOHW24H'):
    """Uploads a file to Google Drive.

    Args:
        filename: path of the local file; its basename becomes the Drive name.
            The file is uploaded with the ``text/csv`` MIME type.
        creds: credentials used to build the Drive v3 service.
        folder_id: id of the Drive folder to upload into. Defaults to the
            folder id that was previously hard-coded here.

    Returns:
        The id of the created Drive file (also printed).
    """
    service = build('drive', 'v3', credentials=creds)
    file_metadata = {
        'name': os.path.basename(filename),
        'parents': [folder_id]
    }
    media = MediaFileUpload(filename, mimetype='text/csv')
    created = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    file_id = created.get('id')
    print(f"File ID: {file_id}")
    return file_id
149
+
150
if __name__ == '__main__':
    # Smoke test: authenticate, build a Drive client and look up the
    # team folder by name.
    drive_service = build('drive', 'v3', credentials=load_credentials())
    search_files(drive_service, "CMPE-258: Team Neurobytes")
mlops/scripts/create_user_preferences/src/document_term_matrix.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import CountVectorizer
3
+ from config import DATA_DIR
4
+ from data_loader import load_data
5
+
6
def create_document_term_matrix(data):
    """Build a document-term matrix from song data.

    The text corpus is produced by ``create_text_corpus(data)`` and counted
    with a default ``CountVectorizer``.

    Returns:
        (dtm, feature_names): the fitted count matrix and the vectorizer's
        output feature names.
    """
    corpus = create_text_corpus(data)
    count_vectorizer = CountVectorizer()
    term_counts = count_vectorizer.fit_transform(corpus)
    vocabulary = count_vectorizer.get_feature_names_out()
    return term_counts, vocabulary
12
+
13
+ from nltk.tokenize import word_tokenize
14
+ from nltk.probability import FreqDist
15
+ import nltk
16
+ nltk.download('punkt')
17
+ nltk.download('stopwords')
18
def create_text_corpus(data, lyric_chars=1000):
    """
    Prepares the text data before input to the document-term matrix.
    It incorporates artist name, song name and the most frequent lyric words.

    Args:
        data: object indexable by 'artist', 'song' and 'text'; 'text' must
            support ``.apply`` (e.g. a pandas DataFrame).
        lyric_chars: currently unused; kept for backward compatibility.
            NOTE(review): presumably meant to truncate lyrics - confirm.

    Returns:
        One string per row: "<artist> <song> <top lyric words>".
    """
    # Load the stop-word list once; looking it up per token re-read the
    # corpus for every word of every song.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    def top_n_words(text, n=5):
        '''
        Applied on the lyrics to shrink weight of lyrics on similarity.
        Returns the n most frequent non-stop-word tokens joined by spaces.
        '''
        # remove filler words
        freq_dist = FreqDist(
            token for token in word_tokenize(text) if token not in stop_words
        )
        # Keep only the n most common words; the limit was previously ignored
        # so every word of the lyrics ended up in the corpus.
        return ' '.join(word for word, _ in freq_dist.most_common(n))

    artist_names = data['artist']
    song_name = data['song']
    lyrics = data['text']
    top_words = lyrics.apply(top_n_words)

    return artist_names + ' ' + song_name + ' ' + top_words
41
+
42
+
43
if __name__ == "__main__":
    # load_data takes (file_id, file_name, n). The Drive file id is only used
    # when the file is not already present in DATA_DIR, so None is sufficient
    # for a local copy; supply a real id to enable the Drive download.
    data = load_data(file_id=None, file_name='millionsong_dataset.zip')
    if data is None:
        raise SystemExit(
            "millionsong_dataset.zip could not be loaded from DATA_DIR or Google Drive"
        )
    dtm, features = create_document_term_matrix(data)
mlops/scripts/create_user_preferences/src/integrate_lastfm.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ # load LASTFM_API_KEY from .env
3
+ import requests
4
+ import os
5
+
6
def fetch_data(api_key, method, params, timeout=10):
    """
    Call a Last.fm API method and return the decoded JSON response.

    Parameters:
        api_key: Last.fm API key sent as the `api_key` query parameter.
        method: API method name, e.g. 'track.getInfo'.
        params: method-specific query parameters; this dict is not modified.
        timeout: seconds to wait for the HTTP request before `requests`
            raises a timeout error.

    Returns:
        The parsed JSON body of the response.
    """
    base_url = "http://ws.audioscrobbler.com/2.0/"
    # Work on a copy so the caller's dict does not silently gain the
    # api_key/method/format entries.
    query = dict(params)
    query['api_key'] = api_key
    query['method'] = method
    query['format'] = 'json'
    response = requests.get(base_url, params=query, timeout=timeout)
    return response.json()
13
+
14
+
15
def get_artist_info(api_key, artist_name):
    """Fetch the 'artist.getInfo' response for `artist_name` via `fetch_data`."""
    return fetch_data(api_key, 'artist.getInfo', {'artist': artist_name})
18
+
19
+
20
def get_track_info(api_key, artist_name, track_name):
    """Fetch the 'track.getInfo' response for the given artist/track via `fetch_data`."""
    query = dict(artist=artist_name, track=track_name)
    return fetch_data(api_key, 'track.getInfo', query)
23
+
24
+
25
def batch_fetch_data(api_key, items, fetch_function, sleep_time=1):
    """
    Run `fetch_function(api_key, *item)` for every item and collect the results.

    Parameters:
        api_key: passed as the first argument of every call.
        items: iterable of argument tuples, unpacked into `fetch_function`.
        fetch_function: callable doing the actual fetch.
        sleep_time: seconds to pause between consecutive calls; 0 or None
            disables the pause.

    Returns:
        List of results in the same order as `items`.
    """
    import time  # local import: this helper is the only user of `time`

    results = []
    for position, item in enumerate(items):
        # Pause between calls only (not before the first one) so the declared
        # throttle is honoured without adding a useless trailing delay.
        if position and sleep_time and sleep_time > 0:
            time.sleep(sleep_time)
        results.append(fetch_function(api_key, *item))
    return results
32
+
33
+ api_key = os.getenv('LASTFM_API_KEY')
34
+
35
+
36
def fetch_lastfm_data(api_key, artist_name, track_name, timeout=10):
    """
    Request 'track.getInfo' from Last.fm for one artist/track pair.

    Parameters:
        api_key: Last.fm API key.
        artist_name: value of the `artist` query parameter.
        track_name: value of the `track` query parameter.
        timeout: seconds to wait for the HTTP request.

    Returns:
        The decoded JSON body, or None when the request fails, the status is
        not 200, the body is blank, or the body is not valid JSON.
    """
    base_url = "http://ws.audioscrobbler.com/2.0/"
    params = {
        'method': 'track.getInfo',
        'api_key': api_key,
        'artist': artist_name,
        'track': track_name,
        'format': 'json'
    }
    # A single network failure must not abort a whole batch: callers already
    # treat None as "track skipped".
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200 or not response.text.strip():
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON.
        return None
50
+
51
+
52
def parse_lastfm_data(data):
    """
    Extract listener count, play count and tag names from a track response.

    Parameters:
        data: decoded 'track.getInfo' response (a dict) or None.

    Returns:
        A dict with 'listeners', 'playcount' (both as found in the response,
        defaulting to '0') and 'tags' (tag names joined by ', '), or None
        when `data` is empty or has no 'track' entry.
    """
    if not data or 'track' not in data:
        return None
    track = data['track']

    # Defensive: 'toptags' is expected to be {'tag': [ {...}, ... ]}, but do
    # not crash if it is some other type or a lone tag arrives as a dict.
    toptags = track.get('toptags')
    tag_entries = toptags.get('tag', []) if isinstance(toptags, dict) else []
    if isinstance(tag_entries, dict):
        tag_entries = [tag_entries]
    elif not isinstance(tag_entries, list):
        tag_entries = []

    return {
        'listeners': track.get('listeners', '0'),
        'playcount': track.get('playcount', '0'),
        'tags': ', '.join(tag['name'] for tag in tag_entries),
    }
61
+
62
+ from tqdm import tqdm
63
+ tqdm.pandas()
64
+
65
+ load_dotenv()
66
+ api_key = os.getenv('LASTFM_API_KEY')
67
+ tracks_skipped = 0
68
+
69
def print_tracks_skipped():
    """Print the current value of the module-level `tracks_skipped` counter."""
    summary = f"Tracks skipped: {tracks_skipped}"
    print(summary)
71
+
72
+
73
def fetch_and_parse(row):
    """
    Fetch and parse Last.fm data for one row with 'artist' and 'song' entries.

    Returns the parsed dict, or None when either the fetch or the parse yields
    nothing; in that case the module-level `tracks_skipped` counter is
    incremented once.
    """
    global tracks_skipped
    raw = fetch_lastfm_data(api_key, row['artist'], row['song'])
    parsed = None if raw is None else parse_lastfm_data(raw)
    if parsed is None:
        tracks_skipped += 1
    return parsed
mlops/scripts/create_user_preferences/src/synthetic_user_data.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from config import DATA_DIR
3
+ import pandas as pd
4
+ import os
5
+ from data_loader import load_credentials, load_data, upload_data
6
+ from cosine_similarity import create_document_term_matrix, calculate_cosine_similarity
7
+ from integrate_lastfm import *
8
+
9
+ creds = load_credentials()
10
+
11
def save_similarity_matrix(matrix, feature_names, filepath):
    """
    Label a square similarity matrix and write it to CSV.

    `feature_names` is used for both the row index and the column names.
    Returns the labelled DataFrame that was written to `filepath`.
    """
    labelled = pd.DataFrame(data=matrix, index=feature_names, columns=feature_names)
    labelled.to_csv(filepath)
    return labelled
15
+
16
def _top_similar_positions(similarities, n):
    """Positions of the `n` highest entries of `similarities` that are not exactly 1.0, best first."""
    # Entries equal to 1.0 are excluded (the song itself / identical text).
    candidates = np.flatnonzero(similarities != 1.0)
    if n <= 0 or len(candidates) < n:
        return np.array([], dtype=int)
    scores = similarities[candidates]
    # argpartition only guarantees that the LAST n slots hold the n largest
    # values, so exactly those slots must be taken.
    top = np.argpartition(scores, -n)[-n:]
    top = top[np.argsort(scores[top])[::-1]]
    # Map back from "position among candidates" to position in the full row.
    return candidates[top]


def generate_user_preferences(similarity_matrix, feature_names, num_users=10, songs_per_user=(50,100), top_similar=(5, 10)):
    """
    Generate synthetic user data based on song similarity.

    For every user a random set of starter songs is drawn; each starter song
    contributes itself followed by its most similar songs.

    Parameters:
        similarity_matrix: square song-to-song similarity matrix.
        feature_names: labels for the rows/columns of the saved matrix.
        num_users: number of synthetic users (ids 0..num_users-1).
        songs_per_user: (low, high) passed to `np.random.randint` for the
            number of starter songs per user (high is exclusive).
        top_similar: (low, high) passed to `np.random.randint` once, for the
            number of similar songs added per starter song.

    Returns:
        Dict mapping user id to a list of integer row positions into the
        similarity matrix.

    Side effects:
        Writes the labelled matrix to DATA_DIR + '/song_similarity_matrix.csv'.
    """
    sim_df = save_similarity_matrix(similarity_matrix, feature_names, DATA_DIR + '/song_similarity_matrix.csv')
    sim_values = sim_df.to_numpy()
    song_count = similarity_matrix.shape[0]
    user_data = {}
    # Drawn once: every user gets the same number of neighbours per starter song.
    top_similar_n = np.random.randint(*top_similar)
    for user_id in range(num_users):
        # Never ask for more distinct starter songs than exist.
        num_songs = min(np.random.randint(*songs_per_user), song_count)
        starter_songs = np.random.choice(song_count, size=num_songs, replace=False)

        preferences = []
        for starter in starter_songs:
            # Record the starter by POSITION, like its neighbours, so every
            # entry of the list indexes the matrix the same way.
            preferences.append(int(starter))
            preferences.extend(_top_similar_positions(sim_values[starter], top_similar_n).tolist())
        if preferences:
            user_data[user_id] = preferences

    return user_data
58
+
59
def transform_mask_to_songs(row_index, array, data):
    """
    Materialize one user's preference list as rows of `data`.

    Parameters:
        row_index: value written into the 'userID' column of the result.
        array: pair whose second element is the list of row positions.
        data: DataFrame to select rows from; it is not modified.

    Returns:
        A deep copy of the selected rows with an added 'userID' column.
    """
    # Force an integer dtype: an empty list (or a list of floats) would
    # otherwise become a float array, which is not a safe positional indexer.
    positions = np.asarray(array[1], dtype=int)
    user_songs = data.iloc[positions, :].copy(deep=True)
    user_songs['userID'] = row_index

    return user_songs
66
+
67
def save_preferences(user_preferences):
    '''
    saves data locally and then uploads to google drive

    The frame's index is written out as a 'songID' column. The caller's
    DataFrame is left untouched; a reshaped copy is what gets saved.
    '''
    filename = DATA_DIR + '/user_preferences.csv'
    # Build a new frame instead of reset_index/rename with inplace=True so the
    # argument is not modified behind the caller's back.
    export = (
        user_preferences
        .reset_index(drop=False)
        .rename(columns={'index': 'songID'})
    )
    export.to_csv(filename, index=False)
    print('Synthetic User Data Saved to Local File: ' + filename)
    upload_data(filename, creds=creds)
    print('Synthetic User Data Saved to Remote Drive.')
78
+
79
def integrate_lastfm(user_preferences):
    """
    Add Last.fm features to the user preference table.

    Every row is passed to `fetch_and_parse`; rows for which it returns
    nothing are dropped, and the returned dicts are expanded into columns.

    Parameters:
        user_preferences: DataFrame of user/song rows; it is not modified.

    Returns:
        A new DataFrame with the surviving rows (index reset) followed by the
        normalized Last.fm columns.
    """
    ### Adding additional LastFM Data for inference with NN Model ###
    # Work on a copy: the original added a column to the caller's frame and
    # then called inplace operations on a filtered slice of it.
    enriched = user_preferences.copy()
    enriched['lastfm_data'] = enriched.progress_apply(fetch_and_parse, axis=1)
    enriched = enriched[enriched['lastfm_data'].notna()].reset_index(drop=True)
    track_details_df = pd.json_normalize(enriched['lastfm_data'])
    # Both frames share a fresh 0..n-1 index, so axis=1 aligns row by row.
    mixed = pd.concat(
        [enriched.drop(columns=['lastfm_data']), track_details_df], axis=1)
    print_tracks_skipped()
    return mixed
89
+
90
+
91
+ if __name__ == "__main__":
92
+ file_id = '1EL4vYhO4A0Cgm8akBgAfDrWOGvtF6Xvo'
93
+ file_name = 'millionsong_dataset.zip'
94
+ data = load_data(file_id=file_id, file_name=file_name)
95
+
96
+ ### Creating the Synthetic user preferences
97
+ dtm, lyric_term_features = create_document_term_matrix(data)
98
+ song_artist_index = list(data.index)
99
+ similarity_matrix = calculate_cosine_similarity(dtm)
100
+ user_preference_masks = generate_user_preferences(similarity_matrix, feature_names=song_artist_index)
101
+ user_preferences = pd.concat([transform_mask_to_songs(row, array, data) for row, array in enumerate(user_preference_masks.items())], axis = 0)
102
+
103
+ ### Connecting user data to additional LastFM features for model inference
104
+ user_preferences = integrate_lastfm(user_preferences)
105
+ save_preferences(user_preferences=user_preferences)
106
+
107
+
108
+
mlops/scripts/train_model/train_model.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Requires the python-dotenv package (install with: pip install python-dotenv).
# The IPython-only "!pip install" line is not valid in a plain .py script.
2
+ # imports
3
+ import pandas as pd
4
+ import h5py
5
+ import os
6
+ from sqlalchemy import create_engine
7
+ import requests
8
+ import time
9
+ from dotenv import load_dotenv
10
+ import pandas as pd
11
+
12
+ df = pd.read_csv('/content/drive/MyDrive/CMPE-258: Team Neurobytes/Neurobytes/db/data/music_data.csv')
13
+ df.dropna(inplace=True)
14
+
15
+ import pandas as pd
16
+ import torch
17
+ from torch.utils.data import DataLoader
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from sklearn.preprocessing import LabelEncoder, MinMaxScaler
21
+ from sklearn.model_selection import train_test_split
22
+ import torch.optim as optim
23
+
24
+ # Encode categorical data
25
+ label_encoders = {}
26
+ unknown_label = 'unknown' # Define an unknown label
27
+
28
+ for column in ['artist_name', 'tags', 'title']:
29
+ le = LabelEncoder()
30
+
31
+ # Get unique categories plus an 'unknown' category
32
+ unique_categories = df[column].unique().tolist()
33
+ # Add 'unknown' to the list of categories
34
+ unique_categories.append(unknown_label)
35
+
36
+ # Fit the LabelEncoder to these categories
37
+ le.fit(unique_categories)
38
+ df[column] = le.transform(df[column].astype(str))
39
+
40
+ # Store the encoder
41
+ label_encoders[column] = le
42
+
43
+
44
+ # Normalize numerical features
45
+ scaler = MinMaxScaler()
46
+ df[['duration', 'listeners', 'playcount']] = scaler.fit_transform(
47
+ df[['duration', 'listeners', 'playcount']])
48
+
49
+ # Split data into features and target
50
+ X = df[['artist_name', 'tags', 'duration', 'listeners', 'playcount']]
51
+ y = df['title']
52
+
53
+ # Split the dataset into training and testing sets
54
+ X_train, X_test, y_train, y_test = train_test_split(
55
+ X, y, test_size=0.2, random_state=42)
56
+
57
class SongRecommender(nn.Module):
    """
    Fully connected classifier that scores every known song title.

    Architecture: input -> 128 -> 256 -> 128 -> num_titles, with ReLU between
    the hidden layers and raw logits as output.
    """

    def __init__(self, input_size=5, num_titles=None):
        """
        Parameters:
            input_size: number of input features per sample (default 5).
            num_titles: size of the output layer. When omitted it is derived
                from the module-level target `y` as the number of unique
                titles plus one for the 'unknown' label.
        """
        super(SongRecommender, self).__init__()
        if num_titles is None:
            # Output size = number of unique titles including 'unknown'
            num_titles = len(y.unique()) + 1
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 128)
        self.output = nn.Linear(128, num_titles)

    def forward(self, x):
        """Return unnormalized class scores for a float batch `x`."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.output(x)
        return x
73
+
74
+
75
+ model = SongRecommender()
76
+ optimizer = optim.Adam(model.parameters(), lr=0.001)
77
+ criterion = nn.CrossEntropyLoss()
78
+
79
def train_model(model, X_train, y_train, X_test, y_test, epochs=50, batch_size=50, optimizer=None, criterion=None):
    """
    Train `model` and print the mean training/validation loss after each epoch.

    Parameters:
        model: network to optimize (updated in place).
        X_train, X_test: feature tables exposing `.values`.
        y_train, y_test: iterables of integer class labels.
        epochs: number of passes over the training data.
        batch_size: mini-batch size for both loaders.
        optimizer: optimizer bound to `model`'s parameters; defaults to
            Adam with lr=0.001 created for the given model.
        criterion: loss function; defaults to `nn.CrossEntropyLoss()`.
    """
    # Build the defaults from the model that was actually passed in, so the
    # optimizer can never be bound to a different network's parameters.
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    train_loader = DataLoader(
        list(zip(X_train.values.astype(float), y_train)), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(
        list(zip(X_test.values.astype(float), y_test)), batch_size=batch_size, shuffle=False)

    for epoch in range(epochs):
        # Re-enter training mode every epoch; the validation phase below
        # switches the model to eval mode.
        model.train()
        train_loss = 0
        for features, labels in train_loader:
            optimizer.zero_grad()
            # The loader already yields tensors; only the dtype is adjusted.
            outputs = model(features.float())
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation phase: no gradients are needed.
        model.eval()
        validation_loss = 0
        with torch.no_grad():
            for features, labels in test_loader:
                outputs = model(features.float())
                loss = criterion(outputs, labels.long())
                validation_loss += loss.item()

        print(f'Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}')
106
train_model(model, X_train, y_train, X_test, y_test)
# save the model
torch.save(model.state_dict(), 'model.pth')
# load the model: a fresh network has random weights, so the saved state
# must be restored into it before it is used for recommendations
model = SongRecommender()
model.load_state_dict(torch.load('model.pth'))
model.eval()
111
def recommend_songs(model, input_features, top_k=5):
    """
    Return the titles of the `top_k` highest-scoring songs for one input song.

    Parameters:
        model: trained network taking a (1, 5) float tensor.
        input_features: mapping with 'artist_name', 'tags', 'duration',
            'listeners' and 'playcount' entries.
        top_k: number of titles to return (capped at the model's output size).

    Returns:
        Array of titles decoded with the module-level 'title' label encoder.

    Uses the module-level `label_encoders` and `scaler` fitted during
    preprocessing.
    """
    def encode(column):
        # Values never seen during fitting fall back to the 'unknown' label.
        encoder = label_encoders[column]
        try:
            return encoder.transform([input_features[column]])[0]
        except ValueError:
            return encoder.transform(['unknown'])[0]

    model.eval()
    print(input_features)
    with torch.no_grad():
        artist_index = encode('artist_name')
        tags_index = encode('tags')

        # Create a DataFrame with feature names so the scaler sees the same
        # column names it was fitted on.
        numeric = pd.DataFrame(
            [[input_features['duration'], input_features['listeners'],
              input_features['playcount']]],
            columns=['duration', 'listeners', 'playcount']
        )
        scaled_features = scaler.transform(numeric)[0]

        features = torch.tensor(
            [artist_index, tags_index, *scaled_features]).float().unsqueeze(0)
        predictions = model(features)
        k = min(top_k, predictions.shape[-1])
        _, top_indices = predictions.topk(k)
        # squeeze(0) drops only the batch axis, so a list is produced even
        # when k == 1.
        recommended_song_ids = top_indices.squeeze(0).tolist()

    return label_encoders['title'].inverse_transform(recommended_song_ids)
142
+
143
+ import requests
144
+
145
def fetch_song_data(api_key, artist_name, track_name, timeout=10):
    """
    Request 'track.getInfo' from Last.fm for one artist/track pair.

    Parameters:
        api_key: Last.fm API key.
        artist_name: value of the `artist` query parameter.
        track_name: value of the `track` query parameter.
        timeout: seconds to wait for the HTTP request.

    Returns:
        The decoded JSON body, or an empty dict when the request fails, the
        status is not 200, or the body is not valid JSON.
    """
    url = "http://ws.audioscrobbler.com/2.0/"
    params = {
        'method': 'track.getInfo',
        'api_key': api_key,
        'artist': artist_name,
        'track': track_name,
        'format': 'json'
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        return {}
    if response.status_code != 200:
        return {}
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON.
        return {}
157
+
158
+
159
def parse_song_data(song_data):
    """
    Flatten a 'track.getInfo' response into the model's input features.

    Parameters:
        song_data: decoded response dict, or an empty/None value.

    Returns:
        Dict with 'artist_name', 'tags' (names joined by ', '), 'duration'
        (float), 'listeners' (int), 'playcount' (int) and 'album' (title or
        'Unknown'); an empty dict when there is no 'track' entry.
    """
    if not song_data or 'track' not in song_data:
        return {}
    track = song_data['track']

    # Defensive: tolerate a non-dict 'toptags' or a lone tag given as a dict.
    toptags = track.get('toptags')
    tag_entries = toptags.get('tag', []) if isinstance(toptags, dict) else []
    if isinstance(tag_entries, dict):
        tag_entries = [tag_entries]
    elif not isinstance(tag_entries, list):
        tag_entries = []

    album = track.get('album')
    album_title = album.get('title', 'Unknown') if isinstance(album, dict) else 'Unknown'

    return {
        'artist_name': track['artist']['name'],
        'tags': ', '.join([tag['name'] for tag in tag_entries]),
        # `or 0` also covers empty strings / None, which float()/int() reject.
        'duration': float(track.get('duration') or 0),
        'listeners': int(track.get('listeners') or 0),
        'playcount': int(track.get('playcount') or 0),
        'album': album_title
    }
171
+ from dotenv import load_dotenv
172
+ import os
173
+
174
load_dotenv()
api_key = os.getenv('LASTFM_API_KEY')

artist_name = 'Lady Gaga'
track_name = 'Poker Face'

# Fetch and parse song data
song_data = fetch_song_data(api_key, artist_name, track_name)
parsed_data = parse_song_data(song_data)

print(song_data)
# if the song is not found, or the tags column is empty, print a message
if not parsed_data or not parsed_data['tags']:
    print("Song not found or tags not available.")
else:
    # In a script the return value is not echoed, so print it explicitly.
    recommendations = recommend_songs(model, parsed_data)
    print("Recommendations:", recommendations)
src/model_evaluation_v2.ipynb ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# **Evaluating the Recommendation Model**"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "/Users/mocha/miniconda3/envs/mamba/envs/neurobytes_music_recommender/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
20
+ " from .autonotebook import tqdm as notebook_tqdm\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "import gradio as gr\n",
26
+ "import torch\n",
27
+ "import torch.nn as nn\n",
28
+ "from joblib import load\n",
29
+ "import sklearn"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "# Define the same neural network model\n",
39
+ "class ImprovedSongRecommender(nn.Module):\n",
40
+ " def __init__(self, input_size, num_titles):\n",
41
+ " super(ImprovedSongRecommender, self).__init__()\n",
42
+ " self.fc1 = nn.Linear(input_size, 128)\n",
43
+ " self.bn1 = nn.BatchNorm1d(128)\n",
44
+ " self.fc2 = nn.Linear(128, 256)\n",
45
+ " self.bn2 = nn.BatchNorm1d(256)\n",
46
+ " self.fc3 = nn.Linear(256, 128)\n",
47
+ " self.bn3 = nn.BatchNorm1d(128)\n",
48
+ " self.output = nn.Linear(128, num_titles)\n",
49
+ " self.dropout = nn.Dropout(0.5)\n",
50
+ "\n",
51
+ " def forward(self, x):\n",
52
+ " x = torch.relu(self.bn1(self.fc1(x)))\n",
53
+ " x = self.dropout(x)\n",
54
+ " x = torch.relu(self.bn2(self.fc2(x)))\n",
55
+ " x = self.dropout(x)\n",
56
+ " x = torch.relu(self.bn3(self.fc3(x)))\n",
57
+ " x = self.dropout(x)\n",
58
+ " x = self.output(x)\n",
59
+ " return x\n",
60
+ "\n",
61
+ "# Load the trained model\n",
62
+ "model_path = \"../models/improved_model.pth\"\n",
63
+ "num_unique_titles = 4855 "
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "model = ImprovedSongRecommender(input_size=2, num_titles=num_unique_titles) \n",
73
+ "model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))\n",
74
+ "model.eval()"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 3,
80
+ "metadata": {},
81
+ "outputs": [
82
+ {
83
+ "name": "stderr",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "/Users/mocha/miniconda3/envs/mamba/envs/neurobytes_music_recommender/lib/python3.8/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator LabelEncoder from version 1.2.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
87
+ "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
88
+ " warnings.warn(\n",
89
+ "/Users/mocha/miniconda3/envs/mamba/envs/neurobytes_music_recommender/lib/python3.8/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator MinMaxScaler from version 1.2.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
90
+ "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
91
+ " warnings.warn(\n"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "# Load the label encoders and scaler\n",
97
+ "label_encoders_path = \"data/new_label_encoders.joblib\"\n",
98
+ "scaler_path = \"data/new_scaler.joblib\"\n",
99
+ "\n",
100
+ "label_encoders = load(label_encoders_path)\n",
101
+ "scaler = load(scaler_path)\n",
102
+ "\n",
103
+ "# Create a mapping from encoded indices to actual song titles\n",
104
+ "index_to_song_title = {index: title for index, title in enumerate(label_encoders['title'].classes_)}\n"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": []
113
+ }
114
+ ],
115
+ "metadata": {
116
+ "kernelspec": {
117
+ "display_name": "base",
118
+ "language": "python",
119
+ "name": "python3"
120
+ },
121
+ "language_info": {
122
+ "codemirror_mode": {
123
+ "name": "ipython",
124
+ "version": 3
125
+ },
126
+ "file_extension": ".py",
127
+ "mimetype": "text/x-python",
128
+ "name": "python",
129
+ "nbconvert_exporter": "python",
130
+ "pygments_lexer": "ipython3",
131
+ "version": "3.8.1"
132
+ }
133
+ },
134
+ "nbformat": 4,
135
+ "nbformat_minor": 2
136
+ }
src/model_training_v2.ipynb CHANGED
@@ -1,36 +1,22 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": []
7
- },
8
- "kernelspec": {
9
- "name": "python3",
10
- "display_name": "Python 3"
11
- },
12
- "language_info": {
13
- "name": "python"
14
- }
15
- },
16
  "cells": [
17
  {
18
  "cell_type": "markdown",
19
- "source": [
20
- "# **Music recommender**"
21
- ],
22
  "metadata": {
23
  "id": "DDADPl-phDUC"
24
- }
 
 
 
25
  },
26
  {
27
  "cell_type": "markdown",
28
- "source": [
29
- "# **Load Data**"
30
- ],
31
  "metadata": {
32
  "id": "E7Cu5Fmqct7J"
33
- }
 
 
 
34
  },
35
  {
36
  "cell_type": "code",
@@ -45,11 +31,7 @@
45
  },
46
  "outputs": [
47
  {
48
- "output_type": "display_data",
49
  "data": {
50
- "text/plain": [
51
- "<IPython.core.display.HTML object>"
52
- ],
53
  "text/html": [
54
  "\n",
55
  " <input type=\"file\" id=\"files-793c32c8-99a6-4873-9585-738e1d4b2ab1\" name=\"files[]\" multiple disabled\n",
@@ -234,13 +216,17 @@
234
  "};\n",
235
  "})(self);\n",
236
  "</script> "
 
 
 
237
  ]
238
  },
239
- "metadata": {}
 
240
  },
241
  {
242
- "output_type": "stream",
243
  "name": "stdout",
 
244
  "text": [
245
  "Saving music_data.csv to music_data.csv\n",
246
  " title \\\n",
@@ -293,9 +279,7 @@
293
  },
294
  {
295
  "cell_type": "code",
296
- "source": [
297
- "df.head()"
298
- ],
299
  "metadata": {
300
  "colab": {
301
  "base_uri": "https://localhost:8080/",
@@ -304,40 +288,14 @@
304
  "id": "9E3in0U3dK5I",
305
  "outputId": "c1d5362a-6a33-4543-ff4d-4e11cf8220ec"
306
  },
307
- "execution_count": null,
308
  "outputs": [
309
  {
310
- "output_type": "execute_result",
311
  "data": {
312
- "text/plain": [
313
- " title \\\n",
314
- "0 100 Club 1996 ''We Love You Beatles'' - Live \n",
315
- "1 Yo Quiero Contigo \n",
316
- "4 Emerald \n",
317
- "6 Karma \n",
318
- "7 Money Blues \n",
319
- "\n",
320
- " release artist_name duration \\\n",
321
- "0 Sex Pistols - The Interviews Sex Pistols 88.73751 \n",
322
- "1 Sentenciados - Platinum Edition Baby Rasta & Gringo 167.36608 \n",
323
- "4 Emerald Bedrock 501.86404 \n",
324
- "6 The Diary Of Alicia Keys Alicia Keys 255.99955 \n",
325
- "7 Slidetime Joanna Connor 243.66975 \n",
326
- "\n",
327
- " artist_familiarity artist_hotttnesss year listeners playcount \\\n",
328
- "0 0.731184 0.549204 0 172 210 \n",
329
- "1 0.610186 0.355320 0 9753 16911 \n",
330
- "4 0.654039 0.390625 2004 973 2247 \n",
331
- "6 0.933916 0.778674 2003 250304 1028356 \n",
332
- "7 0.479218 0.332857 0 429 1008 \n",
333
- "\n",
334
- " tags \n",
335
- "0 The Beatles, title is a full sentence \n",
336
- "1 Reggaeton, alexis y fido, Eliana, mis videos, ... \n",
337
- "4 dance \n",
338
- "6 rnb, soul, Alicia Keys, female vocalists, Karma \n",
339
- "7 guitar girl, blues "
340
- ],
341
  "text/html": [
342
  "\n",
343
  " <div id=\"df-b9e5c35d-1534-4ad7-8661-887b39a472e9\" class=\"colab-df-container\">\n",
@@ -650,31 +608,48 @@
650
  " </div>\n",
651
  " </div>\n"
652
  ],
653
- "application/vnd.google.colaboratory.intrinsic+json": {
654
- "type": "dataframe",
655
- "variable_name": "df",
656
- "summary": "{\n \"name\": \"df\",\n \"rows\": 5063,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4854,\n \"samples\": [\n \"I Wish I Had A Girl\",\n \"Jump [Jacques Lu Cont Edit]\",\n \"Mulin' Around\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"release\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4187,\n \"samples\": [\n \"Le Bordel Magnifique\",\n \"Charlotte's Web (OST)\",\n \"X.O. Experience\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"artist_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2461,\n \"samples\": [\n \"Lee Ritenour\",\n \"Pennywise\",\n \"Anneli Drecker\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"duration\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 107.73289375974717,\n \"min\": 1.04444,\n \"max\": 1815.2224,\n \"num_unique_values\": 3939,\n \"samples\": [\n 294.24281,\n 240.79628,\n 115.53914\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"artist_familiarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14886096792686204,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2474,\n \"samples\": [\n 0.787098355481,\n 0.481771820142,\n 0.374024633035\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"artist_hotttnesss\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1347303774485448,\n \"min\": 0.0,\n \"max\": 1.08250255673,\n \"num_unique_values\": 2398,\n \"samples\": [\n 0.376018761952,\n 0.355667956383,\n 0.289970666912\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"year\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 917,\n \"min\": 0,\n \"max\": 2010,\n \"num_unique_values\": 69,\n \"samples\": [\n 1979,\n 0,\n 1965\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"listeners\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 150513,\n \"min\": 0,\n \"max\": 2451482,\n \"num_unique_values\": 3914,\n \"samples\": [\n 781546,\n 6216,\n 396579\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"playcount\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1115103,\n \"min\": 0,\n \"max\": 23182516,\n \"num_unique_values\": 4422,\n \"samples\": [\n 62736,\n 1305,\n 17033\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tags\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4583,\n \"samples\": [\n \"dance, 90s, trance, House, jungle\",\n \"country, favorite songs, classic country, linedance, Martina McBride\",\n \"90s, heavy metal, thrash metal, metal, punk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
657
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
658
  },
 
659
  "metadata": {},
660
- "execution_count": 2
661
  }
 
 
 
662
  ]
663
  },
664
  {
665
  "cell_type": "code",
666
- "source": [
667
- "# Display basic information about the dataset\n",
668
- "print(df.info())\n",
669
- "\n",
670
- "# Display summary statistics for numerical columns\n",
671
- "print(df.describe())\n",
672
- "\n",
673
- "# Display unique values for categorical columns\n",
674
- "print(\"Unique values in 'title':\", df['title'].nunique())\n",
675
- "print(\"Unique values in 'artist_name':\", df['artist_name'].nunique())\n",
676
- "print(\"Unique values in 'tags':\", df['tags'].nunique())"
677
- ],
678
  "metadata": {
679
  "colab": {
680
  "base_uri": "https://localhost:8080/"
@@ -682,11 +657,10 @@
682
  "id": "b_sSacbdHcn6",
683
  "outputId": "f745b028-fd97-4b19-b9f0-9e041621e5d3"
684
  },
685
- "execution_count": null,
686
  "outputs": [
687
  {
688
- "output_type": "stream",
689
  "name": "stdout",
 
690
  "text": [
691
  "<class 'pandas.core.frame.DataFrame'>\n",
692
  "Index: 5063 entries, 0 to 9530\n",
@@ -730,19 +704,48 @@
730
  "Unique values in 'tags': 4583\n"
731
  ]
732
  }
 
 
 
 
 
 
 
 
 
 
 
 
733
  ]
734
  },
735
  {
736
  "cell_type": "markdown",
737
- "source": [
738
- "# **Preprocessing**"
739
- ],
740
  "metadata": {
741
  "id": "wPVFDtk9g9ox"
742
- }
 
 
 
743
  },
744
  {
745
  "cell_type": "code",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
746
  "source": [
747
  "import pandas as pd\n",
748
  "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
@@ -790,27 +793,32 @@
790
  "joblib.dump(scaler, \"/content/new_scaler.joblib\")\n",
791
  "\n",
792
  "print(\"Label encoders and scaler saved successfully.\")\n"
793
- ],
 
 
 
 
794
  "metadata": {
795
  "colab": {
796
  "base_uri": "https://localhost:8080/"
797
  },
798
- "id": "3fsU1IvylyZg",
799
- "outputId": "c2ba3adc-c077-454a-94de-ca9bb0ba4807"
800
  },
801
- "execution_count": null,
802
  "outputs": [
803
  {
804
- "output_type": "stream",
805
  "name": "stdout",
 
806
  "text": [
807
- "Label encoders and scaler saved successfully.\n"
 
 
 
 
 
808
  ]
809
  }
810
- ]
811
- },
812
- {
813
- "cell_type": "code",
814
  "source": [
815
  "from sklearn.model_selection import train_test_split\n",
816
  "\n",
@@ -846,41 +854,41 @@
846
  "# Print the maximum values after clipping\n",
847
  "print(\"Maximum value in y_train after clipping:\", y_train.max())\n",
848
  "print(\"Maximum value in y_test after clipping:\", y_test.max())\n"
849
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
850
  "metadata": {
851
  "colab": {
852
  "base_uri": "https://localhost:8080/"
853
  },
854
- "id": "JBWZWp_8Jr82",
855
- "outputId": "73a312c1-3615-4a87-965b-c2fc41fc50e7"
856
  },
857
- "execution_count": null,
858
  "outputs": [
859
  {
860
- "output_type": "stream",
861
  "name": "stdout",
 
862
  "text": [
863
- "Data split into training and testing sets.\n",
864
- "Maximum value in y_train: 4854\n",
865
- "Maximum value in y_test: 4850\n",
866
- "Number of unique titles: 4855\n",
867
- "Maximum value in y_train after clipping: 4854\n",
868
- "Maximum value in y_test after clipping: 4850\n"
869
  ]
870
  }
871
- ]
872
- },
873
- {
874
- "cell_type": "markdown",
875
- "source": [
876
- "# **Training**"
877
  ],
878
- "metadata": {
879
- "id": "syYhdUbxgA-K"
880
- }
881
- },
882
- {
883
- "cell_type": "code",
884
  "source": [
885
  "import torch\n",
886
  "import torch.nn as nn\n",
@@ -981,41 +989,36 @@
981
  "torch.save(model.state_dict(), model_path)\n",
982
  "\n",
983
  "print(\"Improved model trained and saved successfully.\")\n"
984
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
985
  "metadata": {
986
  "colab": {
987
  "base_uri": "https://localhost:8080/"
988
  },
989
- "id": "aaR1IGymKQq2",
990
- "outputId": "9e5115a5-1a75-4672-a0b3-4fdd314e1a79"
991
  },
992
- "execution_count": null,
993
  "outputs": [
994
  {
995
- "output_type": "stream",
996
  "name": "stdout",
 
997
  "text": [
998
- "Epoch 1, Training Loss: 8.921830113728841, Validation Loss: 8.836441385979747\n",
999
- "Epoch 2, Training Loss: 8.331391870239635, Validation Loss: 9.148561271966672\n",
1000
- "Epoch 3, Training Loss: 7.494005516429007, Validation Loss: 10.484928570541681\n",
1001
- "Epoch 4, Training Loss: 6.704833826606657, Validation Loss: 11.745069999320835\n",
1002
- "Early stopping triggered\n",
1003
- "Improved model trained and saved successfully.\n"
1004
  ]
1005
  }
1006
- ]
1007
- },
1008
- {
1009
- "cell_type": "markdown",
1010
- "source": [
1011
- "# **Testing**"
1012
  ],
1013
- "metadata": {
1014
- "id": "g4hJVlNXf5Vu"
1015
- }
1016
- },
1017
- {
1018
- "cell_type": "code",
1019
  "source": [
1020
  "import torch\n",
1021
  "from joblib import load\n",
@@ -1095,27 +1098,28 @@
1095
  "\n",
1096
  "recommendations = recommend_songs(tags, artist_name)\n",
1097
  "print(\"Recommendations:\", recommendations)\n"
1098
- ],
 
 
 
 
1099
  "metadata": {
1100
  "colab": {
1101
  "base_uri": "https://localhost:8080/"
1102
  },
1103
- "id": "KwqV-HnCOvtz",
1104
- "outputId": "d412ce92-3ab8-4f3d-df83-22ef9e857203"
1105
  },
1106
- "execution_count": null,
1107
  "outputs": [
1108
  {
1109
- "output_type": "stream",
1110
  "name": "stdout",
 
1111
  "text": [
1112
- "Recommendations: ['Betrayal Is A Symptom', 'The Earth Will Shake', 'Saturday', 'Firehouse Rock', 'Breathe Easy']\n"
 
1113
  ]
1114
  }
1115
- ]
1116
- },
1117
- {
1118
- "cell_type": "code",
1119
  "source": [
1120
  "import torch\n",
1121
  "from joblib import load\n",
@@ -1202,25 +1206,22 @@
1202
  "\n",
1203
  "recommendations = recommend_songs(tags, artist_name)\n",
1204
  "print(\"Recommendations:\", recommendations)\n"
1205
- ],
1206
- "metadata": {
1207
- "colab": {
1208
- "base_uri": "https://localhost:8080/"
1209
- },
1210
- "id": "3HzLKv5mPxOv",
1211
- "outputId": "62b37d04-4857-44fb-b5c4-8ead55db9b1a"
1212
- },
1213
- "execution_count": null,
1214
- "outputs": [
1215
- {
1216
- "output_type": "stream",
1217
- "name": "stdout",
1218
- "text": [
1219
- "Recommendations: ['Betrayal Is A Symptom', 'Carnival (from \"Black Orpheus\")', 'Saturday', 'The Earth Will Shake', 'Start!']\n",
1220
- "Recommendations: ['Old Friends', 'Betrayal Is A Symptom', 'Between Love & Hate', 'Carnival (from \"Black Orpheus\")', 'Satin Doll']\n"
1221
- ]
1222
- }
1223
  ]
1224
  }
1225
- ]
1226
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "cells": [
3
  {
4
  "cell_type": "markdown",
 
 
 
5
  "metadata": {
6
  "id": "DDADPl-phDUC"
7
+ },
8
+ "source": [
9
+ "# **Music recommender**"
10
+ ]
11
  },
12
  {
13
  "cell_type": "markdown",
 
 
 
14
  "metadata": {
15
  "id": "E7Cu5Fmqct7J"
16
+ },
17
+ "source": [
18
+ "# **Load Data**"
19
+ ]
20
  },
21
  {
22
  "cell_type": "code",
 
31
  },
32
  "outputs": [
33
  {
 
34
  "data": {
 
 
 
35
  "text/html": [
36
  "\n",
37
  " <input type=\"file\" id=\"files-793c32c8-99a6-4873-9585-738e1d4b2ab1\" name=\"files[]\" multiple disabled\n",
 
216
  "};\n",
217
  "})(self);\n",
218
  "</script> "
219
+ ],
220
+ "text/plain": [
221
+ "<IPython.core.display.HTML object>"
222
  ]
223
  },
224
+ "metadata": {},
225
+ "output_type": "display_data"
226
  },
227
  {
 
228
  "name": "stdout",
229
+ "output_type": "stream",
230
  "text": [
231
  "Saving music_data.csv to music_data.csv\n",
232
  " title \\\n",
 
279
  },
280
  {
281
  "cell_type": "code",
282
+ "execution_count": null,
 
 
283
  "metadata": {
284
  "colab": {
285
  "base_uri": "https://localhost:8080/",
 
288
  "id": "9E3in0U3dK5I",
289
  "outputId": "c1d5362a-6a33-4543-ff4d-4e11cf8220ec"
290
  },
 
291
  "outputs": [
292
  {
 
293
  "data": {
294
+ "application/vnd.google.colaboratory.intrinsic+json": {
295
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 5063,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4854,\n \"samples\": [\n \"I Wish I Had A Girl\",\n \"Jump [Jacques Lu Cont Edit]\",\n \"Mulin' Around\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"release\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4187,\n \"samples\": [\n \"Le Bordel Magnifique\",\n \"Charlotte's Web (OST)\",\n \"X.O. Experience\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"artist_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2461,\n \"samples\": [\n \"Lee Ritenour\",\n \"Pennywise\",\n \"Anneli Drecker\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"duration\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 107.73289375974717,\n \"min\": 1.04444,\n \"max\": 1815.2224,\n \"num_unique_values\": 3939,\n \"samples\": [\n 294.24281,\n 240.79628,\n 115.53914\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"artist_familiarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14886096792686204,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2474,\n \"samples\": [\n 0.787098355481,\n 0.481771820142,\n 0.374024633035\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"artist_hotttnesss\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1347303774485448,\n \"min\": 0.0,\n \"max\": 1.08250255673,\n \"num_unique_values\": 2398,\n \"samples\": [\n 0.376018761952,\n 0.355667956383,\n 0.289970666912\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"year\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 917,\n \"min\": 0,\n \"max\": 2010,\n \"num_unique_values\": 69,\n \"samples\": [\n 1979,\n 0,\n 1965\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"listeners\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 150513,\n \"min\": 0,\n \"max\": 2451482,\n \"num_unique_values\": 3914,\n \"samples\": [\n 781546,\n 6216,\n 396579\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"playcount\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1115103,\n \"min\": 0,\n \"max\": 23182516,\n \"num_unique_values\": 4422,\n \"samples\": [\n 62736,\n 1305,\n 17033\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tags\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4583,\n \"samples\": [\n \"dance, 90s, trance, House, jungle\",\n \"country, favorite songs, classic country, linedance, Martina McBride\",\n \"90s, heavy metal, thrash metal, metal, punk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
296
+ "type": "dataframe",
297
+ "variable_name": "df"
298
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  "text/html": [
300
  "\n",
301
  " <div id=\"df-b9e5c35d-1534-4ad7-8661-887b39a472e9\" class=\"colab-df-container\">\n",
 
608
  " </div>\n",
609
  " </div>\n"
610
  ],
611
+ "text/plain": [
612
+ " title \\\n",
613
+ "0 100 Club 1996 ''We Love You Beatles'' - Live \n",
614
+ "1 Yo Quiero Contigo \n",
615
+ "4 Emerald \n",
616
+ "6 Karma \n",
617
+ "7 Money Blues \n",
618
+ "\n",
619
+ " release artist_name duration \\\n",
620
+ "0 Sex Pistols - The Interviews Sex Pistols 88.73751 \n",
621
+ "1 Sentenciados - Platinum Edition Baby Rasta & Gringo 167.36608 \n",
622
+ "4 Emerald Bedrock 501.86404 \n",
623
+ "6 The Diary Of Alicia Keys Alicia Keys 255.99955 \n",
624
+ "7 Slidetime Joanna Connor 243.66975 \n",
625
+ "\n",
626
+ " artist_familiarity artist_hotttnesss year listeners playcount \\\n",
627
+ "0 0.731184 0.549204 0 172 210 \n",
628
+ "1 0.610186 0.355320 0 9753 16911 \n",
629
+ "4 0.654039 0.390625 2004 973 2247 \n",
630
+ "6 0.933916 0.778674 2003 250304 1028356 \n",
631
+ "7 0.479218 0.332857 0 429 1008 \n",
632
+ "\n",
633
+ " tags \n",
634
+ "0 The Beatles, title is a full sentence \n",
635
+ "1 Reggaeton, alexis y fido, Eliana, mis videos, ... \n",
636
+ "4 dance \n",
637
+ "6 rnb, soul, Alicia Keys, female vocalists, Karma \n",
638
+ "7 guitar girl, blues "
639
+ ]
640
  },
641
+ "execution_count": 2,
642
  "metadata": {},
643
+ "output_type": "execute_result"
644
  }
645
+ ],
646
+ "source": [
647
+ "df.head()"
648
  ]
649
  },
650
  {
651
  "cell_type": "code",
652
+ "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
653
  "metadata": {
654
  "colab": {
655
  "base_uri": "https://localhost:8080/"
 
657
  "id": "b_sSacbdHcn6",
658
  "outputId": "f745b028-fd97-4b19-b9f0-9e041621e5d3"
659
  },
 
660
  "outputs": [
661
  {
 
662
  "name": "stdout",
663
+ "output_type": "stream",
664
  "text": [
665
  "<class 'pandas.core.frame.DataFrame'>\n",
666
  "Index: 5063 entries, 0 to 9530\n",
 
704
  "Unique values in 'tags': 4583\n"
705
  ]
706
  }
707
+ ],
708
+ "source": [
709
+ "# Display basic information about the dataset\n",
710
+ "print(df.info())\n",
711
+ "\n",
712
+ "# Display summary statistics for numerical columns\n",
713
+ "print(df.describe())\n",
714
+ "\n",
715
+ "# Display unique values for categorical columns\n",
716
+ "print(\"Unique values in 'title':\", df['title'].nunique())\n",
717
+ "print(\"Unique values in 'artist_name':\", df['artist_name'].nunique())\n",
718
+ "print(\"Unique values in 'tags':\", df['tags'].nunique())"
719
  ]
720
  },
721
  {
722
  "cell_type": "markdown",
 
 
 
723
  "metadata": {
724
  "id": "wPVFDtk9g9ox"
725
+ },
726
+ "source": [
727
+ "# **Preprocessing**"
728
+ ]
729
  },
730
  {
731
  "cell_type": "code",
732
+ "execution_count": null,
733
+ "metadata": {
734
+ "colab": {
735
+ "base_uri": "https://localhost:8080/"
736
+ },
737
+ "id": "3fsU1IvylyZg",
738
+ "outputId": "c2ba3adc-c077-454a-94de-ca9bb0ba4807"
739
+ },
740
+ "outputs": [
741
+ {
742
+ "name": "stdout",
743
+ "output_type": "stream",
744
+ "text": [
745
+ "Label encoders and scaler saved successfully.\n"
746
+ ]
747
+ }
748
+ ],
749
  "source": [
750
  "import pandas as pd\n",
751
  "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
 
793
  "joblib.dump(scaler, \"/content/new_scaler.joblib\")\n",
794
  "\n",
795
  "print(\"Label encoders and scaler saved successfully.\")\n"
796
+ ]
797
+ },
798
+ {
799
+ "cell_type": "code",
800
+ "execution_count": null,
801
  "metadata": {
802
  "colab": {
803
  "base_uri": "https://localhost:8080/"
804
  },
805
+ "id": "JBWZWp_8Jr82",
806
+ "outputId": "73a312c1-3615-4a87-965b-c2fc41fc50e7"
807
  },
 
808
  "outputs": [
809
  {
 
810
  "name": "stdout",
811
+ "output_type": "stream",
812
  "text": [
813
+ "Data split into training and testing sets.\n",
814
+ "Maximum value in y_train: 4854\n",
815
+ "Maximum value in y_test: 4850\n",
816
+ "Number of unique titles: 4855\n",
817
+ "Maximum value in y_train after clipping: 4854\n",
818
+ "Maximum value in y_test after clipping: 4850\n"
819
  ]
820
  }
821
+ ],
 
 
 
822
  "source": [
823
  "from sklearn.model_selection import train_test_split\n",
824
  "\n",
 
854
  "# Print the maximum values after clipping\n",
855
  "print(\"Maximum value in y_train after clipping:\", y_train.max())\n",
856
  "print(\"Maximum value in y_test after clipping:\", y_test.max())\n"
857
+ ]
858
+ },
859
+ {
860
+ "cell_type": "markdown",
861
+ "metadata": {
862
+ "id": "syYhdUbxgA-K"
863
+ },
864
+ "source": [
865
+ "# **Training**"
866
+ ]
867
+ },
868
+ {
869
+ "cell_type": "code",
870
+ "execution_count": null,
871
  "metadata": {
872
  "colab": {
873
  "base_uri": "https://localhost:8080/"
874
  },
875
+ "id": "aaR1IGymKQq2",
876
+ "outputId": "9e5115a5-1a75-4672-a0b3-4fdd314e1a79"
877
  },
 
878
  "outputs": [
879
  {
 
880
  "name": "stdout",
881
+ "output_type": "stream",
882
  "text": [
883
+ "Epoch 1, Training Loss: 8.921830113728841, Validation Loss: 8.836441385979747\n",
884
+ "Epoch 2, Training Loss: 8.331391870239635, Validation Loss: 9.148561271966672\n",
885
+ "Epoch 3, Training Loss: 7.494005516429007, Validation Loss: 10.484928570541681\n",
886
+ "Epoch 4, Training Loss: 6.704833826606657, Validation Loss: 11.745069999320835\n",
887
+ "Early stopping triggered\n",
888
+ "Improved model trained and saved successfully.\n"
889
  ]
890
  }
 
 
 
 
 
 
891
  ],
 
 
 
 
 
 
892
  "source": [
893
  "import torch\n",
894
  "import torch.nn as nn\n",
 
989
  "torch.save(model.state_dict(), model_path)\n",
990
  "\n",
991
  "print(\"Improved model trained and saved successfully.\")\n"
992
+ ]
993
+ },
994
+ {
995
+ "cell_type": "markdown",
996
+ "metadata": {
997
+ "id": "g4hJVlNXf5Vu"
998
+ },
999
+ "source": [
1000
+ "# **Testing**"
1001
+ ]
1002
+ },
1003
+ {
1004
+ "cell_type": "code",
1005
+ "execution_count": null,
1006
  "metadata": {
1007
  "colab": {
1008
  "base_uri": "https://localhost:8080/"
1009
  },
1010
+ "id": "KwqV-HnCOvtz",
1011
+ "outputId": "d412ce92-3ab8-4f3d-df83-22ef9e857203"
1012
  },
 
1013
  "outputs": [
1014
  {
 
1015
  "name": "stdout",
1016
+ "output_type": "stream",
1017
  "text": [
1018
+ "Recommendations: ['Betrayal Is A Symptom', 'The Earth Will Shake', 'Saturday', 'Firehouse Rock', 'Breathe Easy']\n"
 
 
 
 
 
1019
  ]
1020
  }
 
 
 
 
 
 
1021
  ],
 
 
 
 
 
 
1022
  "source": [
1023
  "import torch\n",
1024
  "from joblib import load\n",
 
1098
  "\n",
1099
  "recommendations = recommend_songs(tags, artist_name)\n",
1100
  "print(\"Recommendations:\", recommendations)\n"
1101
+ ]
1102
+ },
1103
+ {
1104
+ "cell_type": "code",
1105
+ "execution_count": null,
1106
  "metadata": {
1107
  "colab": {
1108
  "base_uri": "https://localhost:8080/"
1109
  },
1110
+ "id": "3HzLKv5mPxOv",
1111
+ "outputId": "62b37d04-4857-44fb-b5c4-8ead55db9b1a"
1112
  },
 
1113
  "outputs": [
1114
  {
 
1115
  "name": "stdout",
1116
+ "output_type": "stream",
1117
  "text": [
1118
+ "Recommendations: ['Betrayal Is A Symptom', 'Carnival (from \"Black Orpheus\")', 'Saturday', 'The Earth Will Shake', 'Start!']\n",
1119
+ "Recommendations: ['Old Friends', 'Betrayal Is A Symptom', 'Between Love & Hate', 'Carnival (from \"Black Orpheus\")', 'Satin Doll']\n"
1120
  ]
1121
  }
1122
+ ],
 
 
 
1123
  "source": [
1124
  "import torch\n",
1125
  "from joblib import load\n",
 
1206
  "\n",
1207
  "recommendations = recommend_songs(tags, artist_name)\n",
1208
  "print(\"Recommendations:\", recommendations)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1209
  ]
1210
  }
1211
+ ],
1212
+ "metadata": {
1213
+ "colab": {
1214
+ "provenance": []
1215
+ },
1216
+ "kernelspec": {
1217
+ "display_name": "Python 3",
1218
+ "name": "python3"
1219
+ },
1220
+ "language_info": {
1221
+ "name": "python",
1222
+ "version": "3.8.1"
1223
+ }
1224
+ },
1225
+ "nbformat": 4,
1226
+ "nbformat_minor": 0
1227
+ }