diff --git "a/experiment.ipynb" "b/experiment.ipynb" --- "a/experiment.ipynb" +++ "b/experiment.ipynb" @@ -16,22 +16,28 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "from __future__ import annotations\n", - "\n", "from collections import Counter\n", "\n", "import joblib\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", + "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import StratifiedKFold, train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.tree import DecisionTreeClassifier\n", "from tqdm.notebook import tqdm\n", "from wordcloud import WordCloud\n", "\n", "from app.constants import CACHE_DIR\n", - "from app.data import load_data, tokenize" + "from app.data import load_data, tokenize\n", + "from app.model import _get_vectorizer" ] }, { @@ -93,27 +99,27 @@ " \n", " \n", " 0\n", - " Cooking microwave pizzas, yummy\n", + " MC, happy mother`s day to your mom ;).. love yah\n", " 1\n", " \n", " \n", " 1\n", - " I love the humor, I just reworded it. Like sa...\n", - " 1\n", + " A year from now is graduation....i am pretty s...\n", + " 0\n", " \n", " \n", " 2\n", - " That sucks to hear. I hate days like that\n", - " 0\n", + " Great for organising my work life balance\n", + " 1\n", " \n", " \n", " 3\n", - " Umm yeah. That`s probably a pretty good note ...\n", + " remember the guy who 1st #tweetbud you! ~> _2...\n", " 1\n", " \n", " \n", " 4\n", - " That would panic me a little! Maybe you can ...\n", + " She! Maybe that was our first mistake. Not e...\n", " 0\n", " \n", " \n", @@ -122,11 +128,11 @@ ], "text/plain": [ " text sentiment\n", - "0 Cooking microwave pizzas, yummy 1\n", - "1 I love the humor, I just reworded it. 
Like sa... 1\n", - "2 That sucks to hear. I hate days like that 0\n", - "3 Umm yeah. That`s probably a pretty good note ... 1\n", - "4 That would panic me a little! Maybe you can ... 0" + "0 MC, happy mother`s day to your mom ;).. love yah 1\n", + "1 A year from now is graduation....i am pretty s... 0\n", + "2 Great for organising my work life balance 1\n", + "3 remember the guy who 1st #tweetbud you! ~> _2... 1\n", + "4 She! Maybe that was our first mistake. Not e... 0" ] }, "execution_count": 4, @@ -150,8 +156,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "Cleaning: 100%|██████████| 19583/19583 [00:06<00:00, 3243.25doc/s]\n", - "Lemmatization: 100%|██████████| 19583/19583 [01:08<00:00, 284.25doc/s]\n" + "Cleaning: 100%|██████████| 3276/3276 [00:02<00:00, 1205.57doc/s]\n", + "Lemmatization: 100%|██████████| 3276/3276 [00:06<00:00, 508.76doc/s] \n" ] }, { @@ -183,33 +189,33 @@ " \n", " \n", " 0\n", - " Cooking microwave pizzas, yummy\n", + " MC, happy mother`s day to your mom ;).. love yah\n", " 1\n", - " cook microwave pizza yummy\n", + " happy mother day mom love yah\n", " \n", " \n", " 1\n", - " I love the humor, I just reworded it. Like sa...\n", - " 1\n", - " love humor reword like say group therapy inste...\n", + " A year from now is graduation....i am pretty s...\n", + " 0\n", + " year graduationi pretty sure ready\n", " \n", " \n", " 2\n", - " That sucks to hear. I hate days like that\n", - " 0\n", - " suck hear hate day like\n", + " Great for organising my work life balance\n", + " 1\n", + " great organise work life balance\n", " \n", " \n", " 3\n", - " Umm yeah. That`s probably a pretty good note ...\n", + " remember the guy who 1st #tweetbud you! ~> _2...\n", " 1\n", - " umm yeah probably pretty good note self eeeeee...\n", + " remember guy help flwrs smile\n", " \n", " \n", " 4\n", - " That would panic me a little! Maybe you can ...\n", + " She! Maybe that was our first mistake. 
Not e...\n", " 0\n", - " panic little maybe read orbitron gym like dowh...\n", + " maybe mistake cool brown nose moment\n", " \n", " \n", "\n", @@ -217,18 +223,18 @@ ], "text/plain": [ " text sentiment \\\n", - "0 Cooking microwave pizzas, yummy 1 \n", - "1 I love the humor, I just reworded it. Like sa... 1 \n", - "2 That sucks to hear. I hate days like that 0 \n", - "3 Umm yeah. That`s probably a pretty good note ... 1 \n", - "4 That would panic me a little! Maybe you can ... 0 \n", + "0 MC, happy mother`s day to your mom ;).. love yah 1 \n", + "1 A year from now is graduation....i am pretty s... 0 \n", + "2 Great for organising my work life balance 1 \n", + "3 remember the guy who 1st #tweetbud you! ~> _2... 1 \n", + "4 She! Maybe that was our first mistake. Not e... 0 \n", "\n", - " tokens \n", - "0 cook microwave pizza yummy \n", - "1 love humor reword like say group therapy inste... \n", - "2 suck hear hate day like \n", - "3 umm yeah probably pretty good note self eeeeee... \n", - "4 panic little maybe read orbitron gym like dowh... " + " tokens \n", + "0 happy mother day mom love yah \n", + "1 year graduationi pretty sure ready \n", + "2 great organise work life balance \n", + "3 remember guy help flwrs smile \n", + "4 maybe mistake cool brown nose moment " ] }, "execution_count": 5, @@ -259,20 +265,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_, ax = plt.subplots(figsize=(6, 4))\n", "\n", @@ -293,34 +288,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a885e681eaf14751b11088566e643a3e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/19583 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Gather all the unique words in the dataset\n", "word_freq = Counter()\n", @@ -348,34 +318,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "04f9a50519654e7188f59c62645572ff", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/19583 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Gather all the unique tokens in the dataset\n", "token_freq = Counter()\n", @@ -403,48 +348,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "283ee0b586574489bf14a8ef0105ef78", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/9105 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_, ax = plt.subplots(2, 1, figsize=(20, 20))\n", "\n", @@ -465,7 +371,109 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Vectorization" + "### Token frequency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_, ax = plt.subplots(figsize=(6, 4))\n", + "\n", + 
"dataset[\"text\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"Before Tokenization\")\n", + "dataset[\"tokens\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"After Tokenization\")\n", + "\n", + "ax.set_xlabel(\"Number of tokens\")\n", + "ax.set_ylabel(\"Count\")\n", + "ax.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vocabulary size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Vocabulary size before tokenization: {len(word_freq)}\")\n", + "print(f\"Vocabulary size after tokenization: {len(token_freq)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vectorization\n", + "\n", + "The `count` vectorizer is a simple vectorizer that counts the number of times a token appears in a document. The `tfidf` does the same as `count` but also normalizes the counts by the inverse document frequency. The `hashing` vectorizer is a memory efficient vectorizer that uses a hash function to map tokens to features. The `hashing` vectorizer does not store the vocabulary in memory, which makes it possible to vectorize large datasets." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define vectorizers\n",
+    "vectorizers = {\n",
+    "    \"hashing\": _get_vectorizer(\"hashing\", n_features=2**20),\n",
+    "    \"count\": _get_vectorizer(\"count\", 20_000),\n",
+    "    \"tfidf\": _get_vectorizer(\"tfidf\", 20_000),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "hashing: (3276, 1048576)\n",
+      "count: (3276, 1084)\n",
+      "tfidf: (3276, 1084)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Fit and vectorize the tokens (NOTE(review): fitted on every row, including those later held out for testing)\n",
+    "token_list = dataset[\"tokens\"].str.split().tolist()\n",
+    "X = {name: vectorizer.fit_transform(token_list) for name, vectorizer in vectorizers.items()}\n",
+    "\n",
+    "# Display the shape of the vectorized data\n",
+    "for name, data in X.items():\n",
+    "    print(f\"{name}: {data.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ability' 'able' 'absolutely' 'access' 'accomplish' 'account' 'ace'\n",
+      " 'active' 'activity' 'actually']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the first 10 features learned by the count vectorizer\n",
+    "features = vectorizers[\"count\"].get_feature_names_out()[:10]\n",
+    "print(features)"
    ]
   },
   {
@@ -475,6 +483,74 @@
     "## Classification"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define classifiers with their search grids (NOTE(review): the voting grid alone spans 20 * 20 * 10 = 4000 combinations)\n",
+    "classifiers = [\n",
+    "    (LogisticRegression(max_iter=1000, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n",
+    "    (LinearSVC(max_iter=10000, dual=False, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n",
+    "    (KNeighborsClassifier(), {\"n_neighbors\": np.arange(1, 10)}),\n",
+    "    (DecisionTreeClassifier(random_state=SEED), {\"max_depth\": np.arange(1, 10)}),\n",
+    "    (RandomForestClassifier(random_state=SEED), {\"n_estimators\": np.arange(10, 500, 50)}),\n",
+    "    (GradientBoostingClassifier(random_state=SEED), {\"n_estimators\": np.arange(100, 500, 25)}),\n",
+    "    (\n",
+    "        VotingClassifier(\n",
+    "            estimators=[\n",
+    "                (\"lr\", LogisticRegression(max_iter=1000, random_state=SEED)),\n",
+    "                (\"svc\", LinearSVC(max_iter=10000, dual=False, random_state=SEED)),\n",
+    "                (\"rf\", RandomForestClassifier(random_state=SEED)),\n",
+    "            ],\n",
+    "        ),\n",
+    "        {\n",
+    "            \"lr__C\": np.logspace(-3, 3, 20),\n",
+    "            \"svc__C\": np.logspace(-3, 3, 20),\n",
+    "            \"rf__n_estimators\": np.arange(10, 500, 50),\n",
+    "        },\n",
+    "    ),\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stratified train/test split: the shared seed and labels give every vectorizer the same rows and class balance\n",
+    "X_split, labels = {}, dataset[\"sentiment\"]\n",
+    "for name, data in X.items():\n",
+    "    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=SEED)\n",
+    "    X_split[name] = (X_train, X_test, y_train, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the cross-validation strategy\n",
+    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},