diff --git "a/experiment.ipynb" "b/experiment.ipynb" --- "a/experiment.ipynb" +++ "b/experiment.ipynb" @@ -16,22 +16,28 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "from __future__ import annotations\n", - "\n", "from collections import Counter\n", "\n", "import joblib\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", + "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import StratifiedKFold, train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.tree import DecisionTreeClassifier\n", "from tqdm.notebook import tqdm\n", "from wordcloud import WordCloud\n", "\n", "from app.constants import CACHE_DIR\n", - "from app.data import load_data, tokenize" + "from app.data import load_data, tokenize\n", + "from app.model import _get_vectorizer" ] }, { @@ -93,27 +99,27 @@ " \n", " \n", " 0\n", - " Cooking microwave pizzas, yummy\n", + " MC, happy mother`s day to your mom ;).. love yah\n", " 1\n", " \n", " \n", " 1\n", - " I love the humor, I just reworded it. Like sa...\n", - " 1\n", + " A year from now is graduation....i am pretty s...\n", + " 0\n", " \n", " \n", " 2\n", - " That sucks to hear. I hate days like that\n", - " 0\n", + " Great for organising my work life balance\n", + " 1\n", " \n", " \n", " 3\n", - " Umm yeah. That`s probably a pretty good note ...\n", + " remember the guy who 1st #tweetbud you! ~> _2...\n", " 1\n", " \n", " \n", " 4\n", - " That would panic me a little! Maybe you can ...\n", + " She! Maybe that was our first mistake. Not e...\n", " 0\n", " \n", " \n", @@ -122,11 +128,11 @@ ], "text/plain": [ " text sentiment\n", - "0 Cooking microwave pizzas, yummy 1\n", - "1 I love the humor, I just reworded it. 
Like sa... 1\n", - "2 That sucks to hear. I hate days like that 0\n", - "3 Umm yeah. That`s probably a pretty good note ... 1\n", - "4 That would panic me a little! Maybe you can ... 0" + "0 MC, happy mother`s day to your mom ;).. love yah 1\n", + "1 A year from now is graduation....i am pretty s... 0\n", + "2 Great for organising my work life balance 1\n", + "3 remember the guy who 1st #tweetbud you! ~> _2... 1\n", + "4 She! Maybe that was our first mistake. Not e... 0" ] }, "execution_count": 4, @@ -150,8 +156,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "Cleaning: 100%|██████████| 19583/19583 [00:06<00:00, 3243.25doc/s]\n", - "Lemmatization: 100%|██████████| 19583/19583 [01:08<00:00, 284.25doc/s]\n" + "Cleaning: 100%|██████████| 3276/3276 [00:02<00:00, 1205.57doc/s]\n", + "Lemmatization: 100%|██████████| 3276/3276 [00:06<00:00, 508.76doc/s] \n" ] }, { @@ -183,33 +189,33 @@ " \n", " \n", " 0\n", - " Cooking microwave pizzas, yummy\n", + " MC, happy mother`s day to your mom ;).. love yah\n", " 1\n", - " cook microwave pizza yummy\n", + " happy mother day mom love yah\n", " \n", " \n", " 1\n", - " I love the humor, I just reworded it. Like sa...\n", - " 1\n", - " love humor reword like say group therapy inste...\n", + " A year from now is graduation....i am pretty s...\n", + " 0\n", + " year graduationi pretty sure ready\n", " \n", " \n", " 2\n", - " That sucks to hear. I hate days like that\n", - " 0\n", - " suck hear hate day like\n", + " Great for organising my work life balance\n", + " 1\n", + " great organise work life balance\n", " \n", " \n", " 3\n", - " Umm yeah. That`s probably a pretty good note ...\n", + " remember the guy who 1st #tweetbud you! ~> _2...\n", " 1\n", - " umm yeah probably pretty good note self eeeeee...\n", + " remember guy help flwrs smile\n", " \n", " \n", " 4\n", - " That would panic me a little! Maybe you can ...\n", + " She! Maybe that was our first mistake. 
Not e...\n", " 0\n", - " panic little maybe read orbitron gym like dowh...\n", + " maybe mistake cool brown nose moment\n", " \n", " \n", "\n", @@ -217,18 +223,18 @@ ], "text/plain": [ " text sentiment \\\n", - "0 Cooking microwave pizzas, yummy 1 \n", - "1 I love the humor, I just reworded it. Like sa... 1 \n", - "2 That sucks to hear. I hate days like that 0 \n", - "3 Umm yeah. That`s probably a pretty good note ... 1 \n", - "4 That would panic me a little! Maybe you can ... 0 \n", + "0 MC, happy mother`s day to your mom ;).. love yah 1 \n", + "1 A year from now is graduation....i am pretty s... 0 \n", + "2 Great for organising my work life balance 1 \n", + "3 remember the guy who 1st #tweetbud you! ~> _2... 1 \n", + "4 She! Maybe that was our first mistake. Not e... 0 \n", "\n", - " tokens \n", - "0 cook microwave pizza yummy \n", - "1 love humor reword like say group therapy inste... \n", - "2 suck hear hate day like \n", - "3 umm yeah probably pretty good note self eeeeee... \n", - "4 panic little maybe read orbitron gym like dowh... 
" + " tokens \n", + "0 happy mother day mom love yah \n", + "1 year graduationi pretty sure ready \n", + "2 great organise work life balance \n", + "3 remember guy help flwrs smile \n", + "4 maybe mistake cool brown nose moment " ] }, "execution_count": 5, @@ -259,20 +265,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAi4AAAFzCAYAAAAZsoJrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtUUlEQVR4nO3de1RVdd7H8Q8XuXg5B68ghUKaF8o0tVGsbEwSi1paTGWSWpGWgaVmJjPeairU8pKXR3J68jKjj1ZTjqmRhKml5IXGvOStRtNSIFM4YQnI2c8fDXt5kkwRO/z0/VrrrOXZv+/+ne8+q50ff2effXwsy7IEAABgAF9vNwAAAHCuCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGP4e7uBS4Xb7dbhw4dVp04d+fj4eLsdAACMYVmWfvjhB4WHh8vX9+xrKgSXKnL48GFFRER4uw0AAIx16NAhXXnllWetIbhUkTp16kj6+U13OBxe7gYAAHO4XC5FRETYf5eeDcGlipR/PORwOAguAABUwrlcasHFuQAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBr9VhPMWOWqFt1tAFTswId7bLQDAOWHFBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYw6vBZd26dbrrrrsUHh4uHx8fLV261GPcsiyNHTtWjRs3VnBwsGJjY7Vv3z6PmmPHjikxMVEOh0MhISFKSkpSUVGRR822bdt08803KygoSBEREZo0adIZvbz11ltq1aqVgoKC1KZNG61cubLKjxcAAFwYrwaXEydOqG3btpo1a1aF45MmTdL06dOVnp6ujRs3qlatWoqLi9PJkyftmsTERO3cuVOZmZlavny51q1bp0GDBtnjLpdLPXr0UNOmTZWTk6OXX35Z48eP15w5c+yaDRs26IEHHlBSUpL+/e9/q3fv3urdu7d27Nhx8Q4eAACcNx/LsixvNyFJPj4+evfdd9W7d29JP6+2hIeH6+mnn9aIESMkSYWFhQoNDdW8efPUp08f7dq1S9HR0dq8ebM6duwoScrIyNAdd9yhb775RuHh4Zo9e7b+8pe/KDc3VwEBAZKkUaNGaenSpdq9e7ck6f7779eJEye0fPlyu5/OnTurXbt2Sk9PP6f+XS6XnE6nCgsL5XA4quptqZb4raJLD79VBMCbzufv0Gp7jcv+/fuVm5ur2NhYe5vT6VSnTp2UnZ0tScrOzlZISIgdWiQpNjZWvr6+2rhxo13TtWtXO7RIUlxcnPbs2aPjx4/bNae/TnlN+etUpLi4WC6Xy+MBAAAurmobXHJzcyVJoaGhHttDQ0PtsdzcXDVq1Mhj3N/fX/Xq1fOoqWiO01/j12rKxyuSlpYmp9NpPyIiIs73EAEAwHmqtsGluktNTVVhYaH9OHTokLdbAgDgkldtg0tYWJgkKS8vz2N7Xl6ePRYWFqb8/HyP8VOnTunYsWMeNRXNcfpr/F
pN+XhFAgMD5XA4PB4AAODiqrbBJSoqSmFhYcrKyrK3uVwubdy4UTExMZKkmJgYFRQUKCcnx65ZvXq13G63OnXqZNesW7dOpaWldk1mZqZatmypunXr2jWnv055TfnrAACA6sGrwaWoqEhbt27V1q1bJf18Qe7WrVt18OBB+fj4aOjQoXrhhRe0bNkybd++Xf3791d4eLj9zaPWrVurZ8+eGjhwoDZt2qT169crJSVFffr0UXh4uCSpb9++CggIUFJSknbu3KklS5bo1Vdf1fDhw+0+nnrqKWVkZGjy5MnavXu3xo8fry1btiglJeX3fksAAMBZ+Hvzxbds2aJu3brZz8vDxIABAzRv3jyNHDlSJ06c0KBBg1RQUKCbbrpJGRkZCgoKsvdZuHChUlJS1L17d/n6+iohIUHTp0+3x51Op1atWqXk5GR16NBBDRo00NixYz3u9dKlSxctWrRIo0eP1p///GddffXVWrp0qa699trf4V0AAADnqtrcx8V03McFJuM+LgC86ZK4jwsAAMAvEVwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMfy93QAAoGpFjlrh7RZQhQ5MiPd2C9UKKy4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGCMah1cysrKNGbMGEVFRSk4OFjNmjXTX//6V1mWZddYlqWxY8eqcePGCg4OVmxsrPbt2+cxz7Fjx5SYmCiHw6GQkBAlJSWpqKjIo2bbtm26+eabFRQUpIiICE2aNOl3OUYAAHDuqnVwmThxombPnq2ZM2dq165dmjhxoiZNmqQZM2bYNZMmTdL06dOVnp6ujRs3qlatWoqLi9PJkyftmsTERO3cuVOZmZlavny51q1bp0GDBtnjLpdLPXr0UNOmTZWTk6OXX35Z48eP15w5c37X4wUAAGfn7+0GzmbDhg3q1auX4uPjJUmRkZH6v//7P23atEnSz6st06ZN0+jRo9WrVy9J0oIFCxQaGqqlS5eqT58+2rVrlzIyMrR582Z17NhRkjRjxgzdcccdeuWVVxQeHq6FCxeqpKREb7zxhgICAnTNNddo69atmjJlikfAAQAA3lWtV1y6dOmirKws7d27V5L0+eef65NPPtHtt98uSdq/f79yc3MVGxtr7+N0OtWpUydlZ2dLkrKzsxUSEmKHFkmKjY2Vr6+vNm7caNd07dpVAQEBdk1cXJz27Nmj48ePV9hbcXGxXC6XxwMAAFxc1XrFZdSoUXK5XGrVqpX8/PxUVlamF198UYmJiZKk3NxcSVJoaKjHfqGhofZYbm6uGjVq5DHu7++vevXqedRERUWdMUf5WN26dc/oLS0tTc8991wVHCUAADhX1XrF5c0339TChQu1aNEiffbZZ5o/f75eeeUVzZ8/39utKTU1VYWFhfbj0KFD3m4JAIBLXrVecXnmmWc0atQo9enTR5LUpk0bff3110pLS9OAAQMUFhYmScrLy1Pjxo3t/fLy8tSuXTtJUlhYmPLz8z3mPXXqlI4dO2bvHxYWpry8PI+a8uflNb8UGBiowMDACz9IAABwzqr1isuPP/4oX1/PFv38/OR2uyVJUVFRCgsLU1ZWlj3ucrm0ceNGxcTESJJiYmJUUFCgnJwcu2b16tVyu93q1KmTXbNu3TqVlpbaNZmZmWrZsmWFHxMBAADvqNbB5a677tKLL76oFStW6MCBA3r33Xc1ZcoU3X333ZIkHx8fDR06VC+88IKWLV
um7du3q3///goPD1fv3r0lSa1bt1bPnj01cOBAbdq0SevXr1dKSor69Omj8PBwSVLfvn0VEBCgpKQk7dy5U0uWLNGrr76q4cOHe+vQAQBABar1R0UzZszQmDFj9MQTTyg/P1/h4eF67LHHNHbsWLtm5MiROnHihAYNGqSCggLddNNNysjIUFBQkF2zcOFCpaSkqHv37vL19VVCQoKmT59ujzudTq1atUrJycnq0KGDGjRooLFjx/JVaAAAqhkf6/Tb0KLSXC6XnE6nCgsL5XA4vN3ORRU5aoW3W0AVOzAh3tstoApxjl5aLofz83z+Dq3WHxUBAACcjuACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMSoVXK666ip9//33Z2wvKCjQVVdddcFNne7bb7/Vgw8+qPr16ys4OFht2rTRli1b7HHLsjR27Fg1btxYwcHBio2N1b59+zzmOHbsmBITE+VwOBQSEqKkpCQVFRV51Gzbtk0333yzgoKCFBERoUmTJlXpcQAAgAtXqeBy4MABlZWVnbG9uLhY33777QU3Ve748eO68cYbVaNGDb3//vv64osvNHnyZNWtW9eumTRpkqZPn6709HRt3LhRtWrVUlxcnE6ePGnXJCYmaufOncrMzNTy5cu1bt06DRo0yB53uVzq0aOHmjZtqpycHL388ssaP3685syZU2XHAgAALpz/+RQvW7bM/vMHH3wgp9NpPy8rK1NWVpYiIyOrrLmJEycqIiJCc+fOtbdFRUXZf7YsS9OmTdPo0aPVq1cvSdKCBQsUGhqqpUuXqk+fPtq1a5cyMjK0efNmdezYUZI0Y8YM3XHHHXrllVcUHh6uhQsXqqSkRG+88YYCAgJ0zTXXaOvWrZoyZYpHwAEAAN51XsGld+/ekiQfHx8NGDDAY6xGjRqKjIzU5MmTq6y5ZcuWKS4uTvfee6/Wrl2rK664Qk888YQGDhwoSdq/f79yc3MVGxtr7+N0OtWpUydlZ2erT58+ys7OVkhIiB1aJCk2Nla+vr7auHGj7r77bmVnZ6tr164KCAiwa+Li4jRx4kQdP37cY4WnXHFxsYqLi+3nLperyo4bAABU7Lw+KnK73XK73WrSpIny8/Pt5263W8XFxdqzZ4/uvPPOKmvuP//5j2bPnq2rr75aH3zwgQYPHqwnn3xS8+fPlyTl5uZKkkJDQz32Cw0Ntcdyc3PVqFEjj3F/f3/Vq1fPo6aiOU5/jV9KS0uT0+m0HxERERd4tAAA4LdU6hqX/fv3q0GDBlXdyxncbrfat2+vl156Sddff70GDRqkgQMHKj09/aK/9m9JTU1VYWGh/Th06JC3WwIA4JJ3Xh8VnS4rK0tZWVn2ysvp3njjjQtuTJIaN26s6Ohoj22tW7fWP//5T0lSWFiYJCkvL0+NGze2a/Ly8tSuXTu7Jj8/32OOU6dO6dixY/b+YWFhysvL86gpf15e80uBgYEKDAys5JEBAIDKqNSKy3PPPacePXooKytLR48e1fHjxz0eVeXGG2/Unj17PLbt3btXTZs2lfTzhbphYWHKysqyx10ulzZu3KiYmBhJUkxMjAoKCpSTk2PXrF69Wm63W506dbJr1q1bp9LSUrsmMzNTLVu2rPD6FgAA4B2VWnFJT0/XvHnz1K9fv6rux8OwYcPUpUsXvfTSS7rvvvu0adMmzZkzx/
6aso+Pj4YOHaoXXnhBV199taKiojRmzBiFh4fbFxK3bt1aPXv2tD9iKi0tVUpKivr06aPw8HBJUt++ffXcc88pKSlJzz77rHbs2KFXX31VU6dOvajHBwAAzk+lgktJSYm6dOlS1b2c4YYbbtC7776r1NRUPf/884qKitK0adOUmJho14wcOVInTpzQoEGDVFBQoJtuukkZGRkKCgqyaxYuXKiUlBR1795dvr6+SkhI0PTp0+1xp9OpVatWKTk5WR06dFCDBg00duxYvgoNAEA142NZlnW+Oz377LOqXbu2xowZczF6MpLL5ZLT6VRhYaEcDoe327moIket8HYLqGIHJsR7uwVUIc7RS8vlcH6ez9+hlVpxOXnypObMmaMPP/xQ1113nWrUqOExPmXKlMpMCwAAcFaVCi7btm2zv7WzY8cOjzEfH58LbgoAAKAilQouH330UVX3AQAA8Jsq9XVoAAAAb6jUiku3bt3O+pHQ6tWrK90QAADAr6lUcCm/vqVcaWmptm7dqh07dpzx44sAAABVpVLB5dduzDZ+/HgVFRVdUEMAAAC/pkqvcXnwwQer7HeKAAAAfqlKg0t2drbHHWsBAACqUqU+Krrnnns8nluWpSNHjmjLli3cTRcAAFw0lQouTqfT47mvr69atmyp559/Xj169KiSxgAAAH6pUsFl7ty5Vd0HAADAb6pUcCmXk5OjXbt2SZKuueYaXX/99VXSFAAAQEUqFVzy8/PVp08frVmzRiEhIZKkgoICdevWTYsXL1bDhg2rskcAAABJlfxW0ZAhQ/TDDz9o586dOnbsmI4dO6YdO3bI5XLpySefrOoeAQAAJFVyxSUjI0MffvihWrdubW+Ljo7WrFmzuDgXAABcNJVacXG73apRo8YZ22vUqCG3233BTQEAAFSkUsHl1ltv1VNPPaXDhw/b27799lsNGzZM3bt3r7LmAAAATlep4DJz5ky5XC5FRkaqWbNmatasmaKiouRyuTRjxoyq7hEAAEBSJa9xiYiI0GeffaYPP/xQu3fvliS1bt1asbGxVdocAADA6c5rxWX16tWKjo6Wy+WSj4+PbrvtNg0ZMkRDhgzRDTfcoGuuuUYff/zxxeoVAABc5s4ruEybNk0DBw6Uw+E4Y8zpdOqxxx7TlClTqqw5AACA051XcPn888/Vs2fPXx3v0aOHcnJyLrgpAACAipxXcMnLy6vwa9Dl/P399d13311wUwAAABU5r+ByxRVXaMeOHb86vm3bNjVu3PiCmwIAAKjIeQWXO+64Q2PGjNHJkyfPGPvpp580btw43XnnnVXWHAAAwOnO6+vQo0eP1jvvvKMWLVooJSVFLVu2lCTt3r1bs2bNUllZmf7yl79clEYBAADOK7iEhoZqw4YNGjx4sFJTU2VZliTJx8dHcXFxmjVrlkJDQy9KowAAAOd9A7qmTZtq5cqVOn78uL788ktZlqWrr75adevWvRj9AQAA2Cp151xJqlu3rm644Yaq7AUAAOCsKvVbRQAAAN5AcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwhlHBZcKECfLx8dHQoUPtbSdPnlRycrLq16+v2rVrKyEhQXl5eR77HTx4UPHx8apZs6YaNWqkZ555RqdOnfKoWbNmjdq3b6/AwEA1b95c8+bN+x2OCAAAnA9jgsvmzZv12muv6brrrvPYPmzYML333nt66623tHbtWh0+fFj33HOPPV5WVqb4+HiVlJRow4YNmj9/vubNm6exY8faNfv371d8fLy6deumrVu3aujQoXr00Uf1wQcf/G7HBwAAfpsRwaWoqEiJiYn629/+prp169rbCwsL9b//+7+aMmWKbr31VnXo0EFz587Vhg0b9Omnn0qSVq1apS+++EL/+Mc/1K5dO91+++3661//qlmzZqmkpESSlJ
6erqioKE2ePFmtW7dWSkqK/vSnP2nq1KleOV4AAFAxI4JLcnKy4uPjFRsb67E9JydHpaWlHttbtWqlJk2aKDs7W5KUnZ2tNm3aKDQ01K6Ji4uTy+XSzp077Zpfzh0XF2fPUZHi4mK5XC6PBwAAuLj8vd3Ab1m8eLE+++wzbd68+Yyx3NxcBQQEKCQkxGN7aGiocnNz7ZrTQ0v5ePnY2WpcLpd++uknBQcHn/HaaWlpeu655yp9XAAA4PxV6xWXQ4cO6amnntLChQsVFBTk7XY8pKamqrCw0H4cOnTI2y0BAHDJq9bBJScnR/n5+Wrfvr38/f3l7++vtWvXavr06fL391doaKhKSkpUUFDgsV9eXp7CwsIkSWFhYWd8y6j8+W/VOByOCldbJCkwMFAOh8PjAQAALq5qHVy6d++u7du3a+vWrfajY8eOSkxMtP9co0YNZWVl2fvs2bNHBw8eVExMjCQpJiZG27dvV35+vl2TmZkph8Oh6Ohou+b0OcpryucAAADVQ7W+xqVOnTq69tprPbbVqlVL9evXt7cnJSVp+PDhqlevnhwOh4YMGaKYmBh17txZktSjRw9FR0erX79+mjRpknJzczV69GglJycrMDBQkvT4449r5syZGjlypB555BGtXr1ab775plasWPH7HjAAADirah1czsXUqVPl6+urhIQEFRcXKy4uTv/zP/9jj/v5+Wn58uUaPHiwYmJiVKtWLQ0YMEDPP/+8XRMVFaUVK1Zo2LBhevXVV3XllVfq9ddfV1xcnDcOCQAA/Aofy7IsbzdxKXC5XHI6nSosLLzkr3eJHMVK1KXmwIR4b7eAKsQ5emm5HM7P8/k7tFpf4wIAAHA6ggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGqNbBJS0tTTfccIPq1KmjRo0aqXfv3tqzZ49HzcmTJ5WcnKz69eurdu3aSkhIUF5enkfNwYMHFR8fr5o1a6pRo0Z65plndOrUKY+aNWvWqH379goMDFTz5s01b968i314AADgPFXr4LJ27VolJyfr008/VWZmpkpLS9WjRw+dOHHCrhk2bJjee+89vfXWW1q7dq0OHz6se+65xx4vKytTfHy8SkpKtGHDBs2fP1/z5s3T2LFj7Zr9+/crPj5e3bp109atWzV06FA9+uij+uCDD37X4wUAAGfnY1mW5e0mztV3332nRo0aae3ateratasKCwvVsGFDLVq0SH/6058kSbt371br1q2VnZ2tzp076/3339edd96pw4cPKzQ0VJKUnp6uZ599Vt99950CAgL07LPPasWKFdqxY4f9Wn369FFBQYEyMjLOqTeXyyWn06nCwkI5HI6qP/hqJHLUCm+3gCp2YEK8t1tAFeIcvbRcDufn+fwdWq1XXH6psLBQklSvXj1JUk5OjkpLSxUbG2vXtGrVSk2aNFF2drYkKTs7W23atLFDiyTFxcXJ5XJp586dds3pc5TXlM9RkeLiYrlcLo8HAAC4uIwJLm63W0OHDtWNN96oa6+9VpKUm5urgIAAhYSEeNSGhoYqNzfXrjk9tJSPl4+drcblcumnn36qsJ+0tDQ5nU77ERERccHHCAAAzs6Y4JKcnKwdO3Zo8eLF3m5FkpSamqrCwkL7cejQIW+3BADAJc/f2w2ci5SUFC1fvlzr1q3TlVdeaW8PCwtTSUmJCg
oKPFZd8vLyFBYWZtds2rTJY77ybx2dXvPLbyLl5eXJ4XAoODi4wp4CAwMVGBh4wccGAADOXbVecbEsSykpKXr33Xe1evVqRUVFeYx36NBBNWrUUFZWlr1tz549OnjwoGJiYiRJMTEx2r59u/Lz8+2azMxMORwORUdH2zWnz1FeUz4HAACoHqr1iktycrIWLVqkf/3rX6pTp459TYrT6VRwcLCcTqeSkpI0fPhw1atXTw6HQ0OGDFFMTIw6d+4sSerRo4eio6PVr18/TZo0Sbm5uRo9erSSk5PtFZPHH39cM2fO1MiRI/XII49o9erVevPNN7ViBVfmAwBQnVTrFZfZs2ersLBQf/zjH9W4cWP7sWTJErtm6tSpuvPOO5WQkKCuXbsqLCxM77zzjj3u5+en5cuXy8/PTzExMXrwwQfVv39/Pf/883ZNVFSUVqxYoczMTLVt21aTJ0/W66+/rri4uN/1eAEAwNkZdR+X6oz7uMBkl8N9Ii4nnKOXlsvh/Lxk7+MCAAAubwQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFw+YVZs2YpMjJSQUFB6tSpkzZt2uTtlgAAwH8RXE6zZMkSDR8+XOPGjdNnn32mtm3bKi4uTvn5+d5uDQAAiODiYcqUKRo4cKAefvhhRUdHKz09XTVr1tQbb7zh7dYAAIAkf283UF2UlJQoJydHqamp9jZfX1/FxsYqOzv7jPri4mIVFxfbzwsLCyVJLpfr4jfrZe7iH73dAqrY5fDf7eWEc/TScjmcn+XHaFnWb9YSXP7r6NGjKisrU2hoqMf20NBQ7d69+4z6tLQ0Pffcc2dsj4iIuGg9AheLc5q3OwDway6n8/OHH36Q0+k8aw3BpZJSU1M1fPhw+7nb7daxY8dUv359+fj4eLEzVAWXy6WIiAgdOnRIDofD2+0A+AXO0UuLZVn64YcfFB4e/pu1BJf/atCggfz8/JSXl+exPS8vT2FhYWfUBwYGKjAw0GNbSEjIxWwRXuBwOPifIlCNcY5eOn5rpaUcF+f+V0BAgDp06KCsrCx7m9vtVlZWlmJiYrzYGQAAKMeKy2mGDx+uAQMGqGPHjvrDH/6gadOm6cSJE3r44Ye93RoAABDBxcP999+v7777TmPHjlVubq7atWunjIyMMy7YxaUvMDBQ48aNO+PjQADVA+fo5cvHOpfvHgEAAFQDXOMCAACMQXABAADGILgAAABjEFyAKhIZGalp06Z5uw3gkrZmzRr5+PiooKDgrHWcj5cugguM8NBDD8nHx0cTJkzw2L506dLf/U7F8+bNq/Bmg5s3b9agQYN+116A6qr8nPXx8VFAQICaN2+u559/XqdOnbqgebt06aIjR47YNyvjfLz8EFxgjKCgIE2cOFHHjx/3disVatiwoWrWrOntNoBqo2fPnjpy5Ij27dunp59+WuPHj9fLL798QXMGBAQoLCzsN//Bwvl46SK4wBixsbEKCwtTWlrar9Z88sknuvnmmxUcHKyIiAg9+eSTOnHihD1+5MgRxcfHKzg4WFFRUVq0aNEZS8pTpkxRmzZtVKtWLUVEROiJJ55QUVGRpJ+XqR9++GEVFhba/5ocP368JM+l6b59++r+++/36K20tFQNGjTQggULJP18Z+a0tDRFRUUpODhYbdu21dtvv10F7xRQPQQGBiosLExNmz
bV4MGDFRsbq2XLlun48ePq37+/6tatq5o1a+r222/Xvn377P2+/vpr3XXXXapbt65q1aqla665RitXrpTk+VER5+PlieACY/j5+emll17SjBkz9M0335wx/tVXX6lnz55KSEjQtm3btGTJEn3yySdKSUmxa/r376/Dhw9rzZo1+uc//6k5c+YoPz/fYx5fX19Nnz5dO3fu1Pz587V69WqNHDlS0s/L1NOmTZPD4dCRI0d05MgRjRgx4oxeEhMT9d5779mBR5I++OAD/fjjj7r77rsl/fwL4wsWLFB6erp27typYcOG6cEHH9TatWur5P0Cqpvg4GCVlJTooYce0pYtW7Rs2TJlZ2fLsizdcccdKi0tlSQlJyeruLhY69at0/bt2zVx4kTVrl37jPk4Hy9TFmCAAQMGWL169bIsy7I6d+5sPfLII5ZlWda7775rlf9nnJSUZA0aNMhjv48//tjy9fW1fvrpJ2vXrl2WJGvz5s32+L59+yxJ1tSpU3/1td966y2rfv369vO5c+daTqfzjLqmTZva85SWlloNGjSwFixYYI8/8MAD1v33329ZlmWdPHnSqlmzprVhwwaPOZKSkqwHHnjg7G8GYIDTz1m3221lZmZagYGBVu/evS1J1vr16+3ao0ePWsHBwdabb75pWZZltWnTxho/fnyF83700UeWJOv48eOWZXE+Xo645T+MM3HiRN16661n/Mvq888/17Zt27Rw4UJ7m2VZcrvd2r9/v/bu3St/f3+1b9/eHm/evLnq1q3rMc+HH36otLQ07d69Wy6XS6dOndLJkyf1448/nvNn5v7+/rrvvvu0cOFC9evXTydOnNC//vUvLV68WJL05Zdf6scff9Rtt93msV9JSYmuv/7683o/gOpq+fLlql27tkpLS+V2u9W3b1/dc889Wr58uTp16mTX1a9fXy1bttSuXbskSU8++aQGDx6sVatWKTY2VgkJCbruuusq3Qfn46WF4ALjdO3aVXFxcUpNTdVDDz1kby8qKtJjjz2mJ5988ox9mjRpor179/7m3AcOHNCdd96pwYMH68UXX1S9evX0ySefKCkpSSUlJed1sV9iYqJuueUW5efnKzMzU8HBwerZs6fdqyStWLFCV1xxhcd+/PYKLhXdunXT7NmzFRAQoPDwcPn7+2vZsmW/ud+jjz6quLg4rVixQqtWrVJaWpomT56sIUOGVLoXzsdLB8EFRpowYYLatWunli1b2tvat2+vL774Qs2bN69wn5YtW+rUqVP697//rQ4dOkj6+V9ap39LKScnR263W5MnT5av78+XgL355pse8wQEBKisrOw3e+zSpYsiIiK0ZMkSvf/++7r33ntVo0YNSVJ0dLQCAwN18OBB3XLLLed38IAhatWqdcb52Lp1a506dUobN25Uly5dJEnff/+99uzZo+joaLsuIiJCjz/+uB5//HGlpqbqb3/7W4XBhfPx8kNwgZHatGmjxMRETZ8+3d727LPPqnPnzkpJSdGjjz6qWrVq6YsvvlBmZqZmzpypVq1aKTY2VoMGDdLs2bNVo0YNPf300woODra/Wtm8eXOVlpZqxowZuuuuu7R+/Xqlp6d7vHZkZKSKioqUlZWltm3bqmbNmr+6EtO3b1+lp6dr7969+uijj+ztderU0YgRIzRs2DC53W7ddNNNKiws1Pr16+VwODRgwICL8K4B3nf11VerV69eGjhwoF577TXVqVNHo0aN0hVXXKFevXpJkoYOHarbb79dLVq00PHjx/XRRx+pdevWFc7H+XgZ8vZFNsC5OP1Cv3L79++3AgICrNP/M960aZN12223WbVr17Zq1aplXXfdddaLL75ojx8+fNi6/fbbrcDAQKtp06bWokWLrEaNGlnp6el2zZQpU6zGjRtbwcHBVlxcnLVgwQKPiwEty7Ief/xxq379+pYka9y4cZZleV4MWO6LL76wJFlNmza13G63x5jb7bamTZtmtWzZ0qpRo4bVsGFDKy
4uzlq7du2FvVlANVDROVvu2LFjVr9+/Syn02mfZ3v37rXHU1JSrGbNmlmBgYFWw4YNrX79+llHjx61LOvMi3Mti/PxcuNjWZblxdwEeNU333yjiIgIffjhh+revbu32wEA/AaCCy4rq1evVlFRkdq0aaMjR45o5MiR+vbbb7V37177824AQPXFNS64rJSWlurPf/6z/vOf/6hOnTrq0qWLFi5cSGgBAEOw4gIAAIzBLf8BAIAxCC4AAMAYBBcAAGAMggsAADAGwQXAJWvNmjXy8fFRQUGBt1sBUEUILgAuuu+++06DBw9WkyZNFBgYqLCwMMXFxWn9+vVV9hp//OMfNXToUI9tXbp00ZEjR+R0OqvsdSrroYceUu/evb3dBmA87uMC4KJLSEhQSUmJ5s+fr6uuukp5eXnKysrS999/f1FfNyAgQGFhYRf1NQD8zrz5ewMALn3Hjx+3JFlr1qw5a01SUpLVoEEDq06dOla3bt2srVu32uPjxo2z2rZtay1YsMBq2rSp5XA4rPvvv99yuVyWZf38uziSPB779+8/43dt5s6dazmdTuu9996zWrRoYQUHB1sJCQnWiRMnrHnz5llNmza1QkJCrCFDhlinTp2yX//kyZPW008/bYWHh1s1a9a0/vCHP1gfffSRPV4+b0ZGhtWqVSurVq1aVlxcnHX48GG7/1/2d/r+AM4dHxUBuKhq166t2rVra+nSpSouLq6w5t5771V+fr7ef/995eTkqH379urevbuOHTtm13z11VdaunSpli9fruXLl2vt2rWaMGGCJOnVV19VTEyMBg4cqCNHjujIkSOKiIio8LV+/PFHTZ8+XYsXL1ZGRobWrFmju+++WytXrtTKlSv197//Xa+99prefvtte5+UlBRlZ2dr8eLF2rZtm+6991717NlT+/bt85j3lVde0d///netW7dOBw8e1IgRIyRJI0aM0H333aeePXva/XXp0uWC31vgsuTt5ATg0vf2229bdevWtYKCgqwuXbpYqamp1ueff25ZlmV9/PHHlsPhsE6ePOmxT7NmzazXXnvNsqyfVyxq1qxpr7BYlmU988wzVqdOneznt9xyi/XUU095zFHRiosk68svv7RrHnvsMatmzZrWDz/8YG+Li4uzHnvsMcuyLOvrr7+2/Pz8rG+//dZj7u7du1upqam/Ou+sWbOs0NBQ+/nZfi0ZwLnjGhcAF11CQoLi4+P18ccf69NPP9X777+vSZMm6fXXX9eJEydUVFSk+vXre+zz008/6auvvrKfR0ZGqk6dOvbzxo0bKz8//7x7qVmzppo1a2Y/Dw0NVWRkpGrXru2xrXzu7du3q6ysTC1atPCYp7i42KPnX85b2f4AnB3BBcDvIigoSLfddptuu+02jRkzRo8++qjGjRunJ554Qo0bN9aaNWvO2CckJMT+8y9/CNPHx0dut/u8+6honrPNXVRUJD8/P+Xk5MjPz8+j7vSwU9EcFj8FB1Q5ggsAr4iOjtbSpUvVvn175ebmyt/fX5GRkZWeLyAgQGVlZVXX4H9df/31KisrU35+vm6++eZKz3Ox+gMuN1ycC+Ci+v7773XrrbfqH//4h7Zt26b9+/frrbfe0qRJk9SrVy/FxsYqJiZGvXv31qpVq3TgwAFt2LBBf/nLX7Rly5Zzfp3IyEht3LhRBw4c0NGjRyu1GlORFi1aKDExUf3799c777yj/fv3a9OmTUpLS9OKFSvOq79t27Zpz549Onr0qEpLS6ukP+ByQ3ABcFHVrl1bnTp10tSpU9W1a1dde+21GjNmjAYOHKiZM2fKx8dHK1euVNeuXfXwww+rRYsW6tOnj77++muFhoae8+uMGDFCfn5+io6OVsOGDXXw4MEqO4a5c+eqf//+evrpp9WyZUv17t1bmzdvVpMmTc55joEDB6ply5bq2LGjGjZsWKU33wMuJz4WH8ICAABDsOICAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBB
cAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDH+H3RDbpqsrEWNAAAAAElFTkSuQmCC", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_, ax = plt.subplots(figsize=(6, 4))\n", "\n", @@ -293,34 +288,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a885e681eaf14751b11088566e643a3e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/19583 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Gather all the unique words in the dataset\n", "word_freq = Counter()\n", @@ -348,34 +318,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "04f9a50519654e7188f59c62645572ff", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/19583 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Gather all the unique tokens in the dataset\n", "token_freq = Counter()\n", @@ -403,48 +348,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "283ee0b586574489bf14a8ef0105ef78", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/9105 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "_, ax = plt.subplots(2, 1, figsize=(20, 20))\n", "\n", @@ -465,7 +371,109 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Vectorization" + "### Token frequency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_, ax = plt.subplots(figsize=(6, 4))\n", + "\n", + 
"dataset[\"text\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"Before Tokenization\")\n", + "dataset[\"tokens\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"After Tokenization\")\n", + "\n", + "ax.set_xlabel(\"Number of tokens\")\n", + "ax.set_ylabel(\"Count\")\n", + "ax.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vocabulary size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Vocabulary size before tokenization: {len(word_freq)}\")\n", + "print(f\"Vocabulary size after tokenization: {len(token_freq)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vectorization\n", + "\n", + "The `count` vectorizer is a simple vectorizer that counts the number of times a token appears in a document. The `tfidf` vectorizer does the same as `count` but also weights the counts by the inverse document frequency, so tokens that appear in many documents contribute less. The `hashing` vectorizer is a memory-efficient vectorizer that uses a hash function to map tokens to features. It does not store the vocabulary in memory, which makes it possible to vectorize large datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Define vectorizers\n", + "vectorizers = {\n", + " \"hashing\": _get_vectorizer(\"hashing\", n_features=2**20),\n", + " \"count\": _get_vectorizer(\"count\", 20_000),\n", + " \"tfidf\": _get_vectorizer(\"tfidf\", 20_000),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hashing: (3276, 1048576)\n", + "count: (3276, 1084)\n", + "tfidf: (3276, 1084)\n" + ] + } + ], + "source": [ + "# Fit and vectorize the tokens\n", + "token_list = dataset[\"tokens\"].str.split().tolist()\n", + "X = {name: vectorizer.fit_transform(token_list) for name, vectorizer in vectorizers.items()}\n", + "\n", + "# Display the shape of the vectorized data\n", + "for name, data in X.items():\n", + " print(f\"{name}: {data.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ability' 'able' 'absolutely' 'access' 'accomplish' 'account' 'ace'\n", + " 'active' 'activity' 'actually']\n" + ] + } + ], + "source": [ + "# Print the first 10 feature names learned by the count vectorizer\n", + "features = vectorizers[\"count\"].get_feature_names_out()[:10]\n", + "print(features)" ] }, { @@ -475,6 +483,74 @@ "## Classification" ] }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Define classifiers\n", + "# NOTE(review): SEED is not defined or imported in any cell visible in this diff - confirm it is set before this cell runs\n", + "classifiers = [\n", + " (LogisticRegression(max_iter=1000, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n", + " (LinearSVC(max_iter=10000, dual=False, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n", + " (KNeighborsClassifier(), {\"n_neighbors\": np.arange(1, 10)}),\n", + " (DecisionTreeClassifier(random_state=SEED), {\"max_depth\": np.arange(1, 10)}),\n", + " 
(RandomForestClassifier(random_state=SEED), {\"n_estimators\": np.arange(10, 500, 50)}),\n", + " (GradientBoostingClassifier(random_state=SEED), {\"n_estimators\": np.arange(100, 500, 25)}),\n", + " (\n", + " VotingClassifier(\n", + " estimators=[\n", + " (\"lr\", LogisticRegression(max_iter=1000, random_state=SEED)),\n", + " (\"svc\", LinearSVC(max_iter=10000, dual=False, random_state=SEED)),\n", + " (\"rf\", RandomForestClassifier(random_state=SEED)),\n", + " ],\n", + " ),\n", + " {\n", + " \"lr__C\": np.logspace(-3, 3, 20),\n", + " \"svc__C\": np.logspace(-3, 3, 20),\n", + " \"rf__n_estimators\": np.arange(10, 500, 50),\n", + " },\n", + " ),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Split the data into training and testing sets\n", + "X_split = {}\n", + "for name, data in X.items():\n", + " X_train, X_test, y_train, y_test = train_test_split(data, dataset[\"sentiment\"], test_size=0.2, random_state=SEED)\n", + " X_split[name] = (X_train, X_test, y_train, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the cross-validation strategy\n", + "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {},