{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sentiment Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports, constants and setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import time\n", "import tracemalloc\n", "import warnings\n", "from collections import Counter\n", "\n", "import joblib\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier\n", "from sklearn.exceptions import ConvergenceWarning\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score\n", "from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.neural_network import MLPClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.tree import DecisionTreeClassifier\n", "from tqdm.notebook import tqdm\n", "from wordcloud import WordCloud\n", "\n", "from app.constants import CACHE_DIR, DATA_DIR\n", "from app.data import load_data, tokenize\n", "from app.model import _get_vectorizer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "tqdm.pandas()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "SEED = 42\n", "CACHE = joblib.Memory(CACHE_DIR, verbose=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data loading" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsentiment
0The impact of educational reforms remains unce...1
1Critics argue that recent improvements in the ...0
2Innovative teaching methods have led to unexpe...1
3Despite budget constraints, the school has man...1
4The true effectiveness of online learning plat...0
\n", "
" ], "text/plain": [ " text sentiment\n", "0 The impact of educational reforms remains unce... 1\n", "1 Critics argue that recent improvements in the ... 0\n", "2 Innovative teaching methods have led to unexpe... 1\n", "3 Despite budget constraints, the school has man... 1\n", "4 The true effectiveness of online learning plat... 0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load data and convert to pandas DataFrame\n", "text_data, label_data = load_data(\"test\")\n", "dataset = pd.DataFrame({\"text\": text_data, \"sentiment\": label_data})\n", "dataset.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Cleaning: 100%|██████████| 209/209 [00:01<00:00, 119.42doc/s]\n", "Lemmatization: 100%|██████████| 209/209 [00:00<00:00, 395.78doc/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsentimenttokens
0The impact of educational reforms remains unce...1impact educational reform remain uncertain des...
1Critics argue that recent improvements in the ...0critic argue recent improvement school system ...
2Innovative teaching methods have led to unexpe...1innovative teaching method lead unexpected cha...
3Despite budget constraints, the school has man...1despite budget constraint school manage mainta...
4The true effectiveness of online learning plat...0true effectiveness online learning platform ma...
\n", "
" ], "text/plain": [ " text sentiment \\\n", "0 The impact of educational reforms remains unce... 1 \n", "1 Critics argue that recent improvements in the ... 0 \n", "2 Innovative teaching methods have led to unexpe... 1 \n", "3 Despite budget constraints, the school has man... 1 \n", "4 The true effectiveness of online learning plat... 0 \n", "\n", " tokens \n", "0 impact educational reform remain uncertain des... \n", "1 critic argue recent improvement school system ... \n", "2 innovative teaching method lead unexpected cha... \n", "3 despite budget constraint school manage mainta... \n", "4 true effectiveness online learning platform ma... " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tokenize text data\n", "tokens = tokenize(dataset[\"text\"].tolist(), batch_size=1024, n_jobs=2, show_progress=True)\n", "dataset[\"tokens\"] = tokens.apply(\" \".join)\n", "dataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data exploration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sentiment distribution" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAhwAAAFzCAYAAAB1tNBuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlWUlEQVR4nO3de1zUdb7H8TfIXWC8NkCLQua1TDNbg9paFUNze+jGo3KzUtfUTDQ1142zqdWpUE8pR2Ol9ux62aPHLmdzzYxSvFWSF6zMMrVW01UumcIIBiLzPX90nHUSLJGvA/h6Ph7zeDS/Gx94+MuXP34z42eMMQIAALDI39cDAACAxo/gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUBvh6gPnC73Tpy5IgiIiLk5+fn63EAAGgwjDE6ceKEYmJi5O9f83UMgkPSkSNHFBsb6+sxAABosA4dOqSf/exnNa4nOCRFRERI+v6HFRkZ6eNpAABoOFwul2JjYz1/l9aE4JA8v0aJjIwkOAAAqIUfuyWBm0YBAIB1BAcAALCO4AAAANYRHAAAwDqCAwAAWEdwAAAA6wgOAABgHcEBAACsIzgAAIB1BAcAALCO4AAAANbxWSqXmbjH3/L1CKhDB2YO9PUIAPCTcIUDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHU+DY5NmzbpzjvvVExMjPz8/LRixQqv9cYYTZ8+XdHR0QoNDVVSUpL27dvntc2xY8c0dOhQRUZGqlmzZho5cqRKS0sv4XcBAAB+jE+Do6ysTN26dVNmZma162fPnq158+YpKytLW7ZsUdOmTZWcnKzy8nLPNkOHDtVnn32mNWvWaNWqVdq0aZNGjx59qb4FAADwEwT48osPGDBAAwYMqHadMUYZGRl64oknNGjQIEnSkiVL5HQ6tWLFCg0ZMkS7d+9Wdna2tm3bpp49e0qS5s+frzvuuEPPP/+8YmJiLtn3AgAAalZv7+HYv3+/CgoKlJSU5FnmcDjUq1cv5ebmSpJyc3PVrFkzT2xIUlJSkvz9/bVly5Yaj11RUSGXy+X1AAAA9tTb4CgoKJAkOZ1Or+VOp9OzrqCgQFdccYXX+oCAALVo0cKzTXXS09PlcDg8j9jY2DqeHgAAnK3eBodNaWlpKikp8TwOHTrk65EAAGjU6m1wREVFSZIKCwu9lhcWFnrWRUVFqaioyGv96dOndezYMc821QkODlZkZKTXAwAA2FNvgyM+Pl5RUVHKycnxLHO5XNqyZYsSEhIkSQkJCSouLlZeXp5nm3Xr1sntdqtXr16XfGYAAFA9n75KpbS0VF9++aXn+f79+/Xxxx+rRYsWatOmjSZOnKhnnnlG7du3V3x8vKZNm6aYmBgNHjxYktS5c2f1799fo0aNUlZWliorK5WamqohQ4bwChUAAOoRnwbH9u3b1bt3b8/zyZMnS5KGDRumRYsWaerUqSorK9Po0aNVXFysW265RdnZ2QoJCfHss3TpUqWmpqpv377y9/dXSkqK5s2bd8m/FwAAUDM/Y4zx9RC+5nK55HA4VFJS0ujv54h7/C1fj4A6dGDmQF+PAOAy91P/Dq2393AAAIDGg+AAAADWERwAAMA6ggMAAFjn01epAAD+hZu6Gx9u7P4XrnAAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwA
AsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHX1Ojiqqqo0bdo0xcfHKzQ0VO3atdO///u/yxjj2cYYo+nTpys6OlqhoaFKSkrSvn37fDg1AAD4oXodHLNmzdKCBQv04osvavfu3Zo1a5Zmz56t+fPne7aZPXu25s2bp6ysLG3ZskVNmzZVcnKyysvLfTg5AAA4W4CvBzifzZs3a9CgQRo4cKAkKS4uTv/zP/+jrVu3Svr+6kZGRoaeeOIJDRo0SJK0ZMkSOZ1OrVixQkOGDPHZ7AAA4F/q9RWOxMRE5eTkaO/evZKkTz75RO+//74GDBggSdq/f78KCgqUlJTk2cfhcKhXr17Kzc2t8bgVFRVyuVxeDwAAYE+9vsLx+OOPy+VyqVOnTmrSpImqqqr07LPPaujQoZKkgoICSZLT6fTaz+l0etZVJz09XU899ZS9wQEAgJd6fYXj1Vdf1dKlS7Vs2TLt2LFDixcv1vPPP6/Fixdf1HHT0tJUUlLieRw6dKiOJgYAANWp11c4fve73+nxxx/33IvRtWtXff3110pPT9ewYcMUFRUlSSosLFR0dLRnv8LCQnXv3r3G4wYHBys4ONjq7AAA4F/q9RWOkydPyt/fe8QmTZrI7XZLkuLj4xUVFaWcnBzPepfLpS1btighIeGSzgoAAGpWr69w3HnnnXr22WfVpk0bXXPNNfroo480Z84c/fa3v5Uk+fn5aeLEiXrmmWfUvn17xcfHa9q0aYqJidHgwYN9OzwAAPCo18Exf/58TZs2TY888oiKiooUExOjMWPGaPr06Z5tpk6dqrKyMo0ePVrFxcW65ZZblJ2drZCQEB9ODgAAzuZnzn7bzsuUy+WSw+FQSUmJIiMjfT2OVXGPv+XrEVCHDswc6OsRUIc4Pxufy+Ec/al/h9brezgAAEDjQHAAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHW1Co6rrrpK33777TnLi4uLddVVV130UAAAoHGpVXAcOHBAVVVV5yyvqKjQ4cOHL3ooAADQuARcyMYrV670/Pc777wjh8PheV5VVaWcnBzFxcXV2XAAAKBxuKDgGDx4sCTJz89Pw4YN81oXGBiouLg4vfDCC3U2HAAAaBwuKDjcbrckKT4+Xtu2bVOrVq2sDAUAABqXCwqOM/bv31/XcwAAgEasVsEhSTk5OcrJyVFRUZHnyscZf/nLXy56MAAA0HjUKjieeuopPf300+rZs6eio6Pl5+dX13MBAIBGpFbBkZWVpUWLFumBBx6o63kAAEAjVKv34Th16pQSExPrehYAANBI1So4HnroIS1btqyuZwEAAI1UrX6lUl5erpdffllr167Vddddp8DAQK/1c+bMqZPhAABA41CrKxw7d+5U9+7d5e/vr127dumjjz7yPD7++OM6HfDw4cO6//771bJlS4WGhqpr167avn27Z70xRtOnT1d0dLRCQ0OVlJSkffv21ekMAADg4tTqCsf69evreo5qHT9+XDfffLN69+6tt99+W61bt9a+ffvUvHlzzzazZ8/WvHnztHjxYsXHx2vatGlKTk7W559/rpCQkEsyJwAAOL9avw/HpTBr1izFxsZq4cKFnmXx8fGe/zbGKCMjQ0888YQGDRokSVqyZImcTqdWrFihIUOGXPKZAQDAuWoVHL179z7ve2+sW7eu1gOdbeXKlUpOTtbdd9+tjRs36sorr9QjjzyiUaNGSfr+HU8LCgqUlJTk2cfhcKh
Xr17Kzc2tMTgqKipUUVHhee5yuepkXgAAUL1a3cPRvXt3devWzfPo0qWLTp06pR07dqhr1651Ntw//vEPLViwQO3bt9c777yjsWPHasKECVq8eLEkqaCgQJLkdDq99nM6nZ511UlPT5fD4fA8YmNj62xmAABwrlpd4Zg7d261y5988kmVlpZe1EBnc7vd6tmzp5577jlJ0vXXX69du3YpKyvrnE+rvRBpaWmaPHmy57nL5SI6AACwqFZXOGpy//331+nnqERHR6tLly5eyzp37qyDBw9KkqKioiRJhYWFXtsUFhZ61lUnODhYkZGRXg8AAGBPnQZHbm5unb4y5Oabb9aePXu8lu3du1dt27aV9P0NpFFRUcrJyfGsd7lc2rJlixISEupsDgAAcHFq9SuVu+66y+u5MUb5+fnavn27pk2bVieDSdKkSZOUmJio5557Tvfcc4+2bt2ql19+WS+//LIkyc/PTxMnTtQzzzyj9u3be14WGxMTo8GDB9fZHAAA4OLUKjgcDofXc39/f3Xs2FFPP/20br/99joZTJJuvPFGvfHGG0pLS9PTTz+t+Ph4ZWRkaOjQoZ5tpk6dqrKyMo0ePVrFxcW65ZZblJ2dzXtwAABQj/gZY4yvh/A1l8slh8OhkpKSRn8/R9zjb/l6BNShAzMH+noE1CHOz8bncjhHf+rfoRf1xl95eXnavXu3JOmaa67R9ddffzGHAwAAjVStgqOoqEhDhgzRhg0b1KxZM0lScXGxevfureXLl6t169Z1OSMAAGjgavUqlfHjx+vEiRP67LPPdOzYMR07dky7du2Sy+XShAkT6npGAADQwNXqCkd2drbWrl2rzp07e5Z16dJFmZmZdXrTKAAAaBxqdYXD7XYrMDDwnOWBgYFyu90XPRQAAGhcahUcffr00aOPPqojR454lh0+fFiTJk1S375962w4AADQONQqOF588UW5XC7FxcWpXbt2ateuneLj4+VyuTR//vy6nhEAADRwtbqHIzY2Vjt27NDatWv1xRdfSPr+M07O/ph4AACAMy7oCse6devUpUsXuVwu+fn5qV+/fho/frzGjx+vG2+8Uddcc43ee+89W7MCAIAG6oKCIyMjQ6NGjar2ncQcDofGjBmjOXPm1NlwAACgcbig4Pjkk0/Uv3//GtfffvvtysvLu+ihAABA43JBwVFYWFjty2HPCAgI0DfffHPRQwEAgMblgoLjyiuv1K5du2pcv3PnTkVHR1/0UAAAoHG5oOC44447NG3aNJWXl5+z7rvvvtOMGTP0q1/9qs6GAwAAjcMFvSz2iSee0N/+9jd16NBBqamp6tixoyTpiy++UGZmpqqqqvSHP/zByqAAAKDhuqDgcDqd2rx5s8aOHau0tDQZYyRJfn5+Sk5OVmZmppxOp5VBAQBAw3XBb/zVtm1brV69WsePH9eXX34pY4zat2+v5s2b25gPAAA0ArV6p1FJat68uW688ca6nAUAADRStfosFQAAgAtBcAAAAOsIDgAAYB3BAQAArCM4AACAdQQHAACwjuAAAADWERwAAMA6ggMAAFhHcAAAAOsIDgAAYB3BAQAArCM4AACAdQQHAACwjuAAAADWERwAAMA6ggMAAFhHcAAAAOsIDgAAYB3BAQAArCM4AACAdQQHAACwjuAAAADWNajgmDlzpvz8/DRx4kTPsvLyco0bN04tW7ZUeHi4UlJSVFhY6LshAQDAORpMcGzbtk0vvfSSrrvuOq/lkyZN0ptvvqnXXntNGzdu1JEjR3TXXXf5aEoAAFCdBhEcpaWlGjp0qP70pz+pefPmnuUlJSX685//rDlz5qhPnz664YYbtHDhQm3evFkffvihDycGAABnaxDBMW7cOA0cOFBJSUley/Py8lRZWem1vFOnTmrTpo1yc3NrPF5FRYVcLpfXAwAA2BPg6wF+zPLly7Vjxw5t27btnHUFBQUKCgpSs2bNvJY7nU4VFBTUeMz09HQ99dRTdT0qAAC
oQb2+wnHo0CE9+uijWrp0qUJCQursuGlpaSopKfE8Dh06VGfHBgAA56rXwZGXl6eioiL16NFDAQEBCggI0MaNGzVv3jwFBATI6XTq1KlTKi4u9tqvsLBQUVFRNR43ODhYkZGRXg8AAGBPvf6VSt++ffXpp596LRsxYoQ6deqk3//+94qNjVVgYKBycnKUkpIiSdqzZ48OHjyohIQEX4wMAACqUa+DIyIiQtdee63XsqZNm6ply5ae5SNHjtTkyZPVokULRUZGavz48UpISNBNN93ki5EBAEA16nVw/BRz586Vv7+/UlJSVFFRoeTkZP3xj3/09VgAAOAsDS44NmzY4PU8JCREmZmZyszM9M1AAADgR9Xrm0YBAEDjQHAAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYV6+DIz09XTfeeKMiIiJ0xRVXaPDgwdqzZ4/XNuXl5Ro3bpxatmyp8PBwpaSkqLCw0EcTAwCA6tTr4Ni4caPGjRunDz/8UGvWrFFlZaVuv/12lZWVebaZNGmS3nzzTb322mvauHGjjhw5orvuusuHUwMAgB8K8PUA55Odne31fNGiRbriiiuUl5enW2+9VSUlJfrzn/+sZcuWqU+fPpKkhQsXqnPnzvrwww910003+WJsAADwA/X6CscPlZSUSJJatGghScrLy1NlZaWSkpI823Tq1Elt2rRRbm5ujcepqKiQy+XyegAAAHsaTHC43W5NnDhRN998s6699lpJUkFBgYKCgtSsWTOvbZ1OpwoKCmo8Vnp6uhwOh+cRGxtrc3QAAC57DSY4xo0bp127dmn58uUXfay0tDSVlJR4HocOHaqDCQEAQE3q9T0cZ6SmpmrVqlXatGmTfvazn3mWR0VF6dSpUyouLva6ylFYWKioqKgajxccHKzg4GCbIwMAgLPU6yscxhilpqbqjTfe0Lp16xQfH++1/oYbblBgYKBycnI8y/bs2aODBw8qISHhUo8LAABqUK+vcIwbN07Lli3T3//+d0VERHjuy3A4HAoNDZXD4dDIkSM1efJktWjRQpGRkRo/frwSEhJ4hQoAAPVIvQ6OBQsWSJJ++ctfei1fuHChhg8fLkmaO3eu/P39lZKSooqKCiUnJ+uPf/zjJZ4UAACcT70ODmPMj24TEhKizMxMZWZmXoKJAABAbdTrezgAAEDjQHAAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABY12iCIzMzU3FxcQoJCVGvXr20detWX48EAAD+X6MIjldeeUWTJ0/WjBkztGPHDnXr1k3JyckqKiry9WgAAECNJDjmzJmjUaNGacSIEerSpYuysrIUFhamv/zlL74eDQAASArw9QAX69SpU8rLy1NaWppnmb+/v5KSkpSbm1vtPhUVFaqoqPA8LykpkSS5XC67w9YD7oqTvh4Bdehy+DN7OeH8bHwuh3P0zPdojDnvdg0+OI4ePaqqqio5nU6
v5U6nU1988UW1+6Snp+upp546Z3lsbKyVGQFbHBm+ngDA+VxO5+iJEyfkcDhqXN/gg6M20tLSNHnyZM9zt9utY8eOqWXLlvLz8/PhZKgLLpdLsbGxOnTokCIjI309DoCzcH42PsYYnThxQjExMefdrsEHR6tWrdSkSRMVFhZ6LS8sLFRUVFS1+wQHBys4ONhrWbNmzWyNCB+JjIzkf2hAPcX52bic78rGGQ3+ptGgoCDdcMMNysnJ8Sxzu93KyclRQkKCDycDAABnNPgrHJI0efJkDRs2TD179tTPf/5zZWRkqKysTCNGjPD1aAAAQI0kOO6991598803mj59ugoKCtS9e3dlZ2efcyMpLg/BwcGaMWPGOb82A+B7nJ+XLz/zY69jAQAAuEgN/h4OAABQ/xEcAADAOoIDAABYR3DgshcXF6eMjAxfjwE0ahs2bJCfn5+Ki4vPux3nY+NFcMCq4cOHy8/PTzNnzvRavmLFikv+rq6LFi2q9g3etm3bptGjR1/SWYD66sw56+fnp6CgIF199dV6+umndfr06Ys6bmJiovLz8z1vEMX5ePkhOGBdSEiIZs2apePHj/t6lGq1bt1aYWFhvh4DqDf69++v/Px87du3T4899piefPJJ/cd//MdFHTMoKEhRUVE/+g8NzsfGi+CAdUlJSYqKilJ6enqN27z//vv6xS9+odDQUMXGxmrChAkqKyvzrM/Pz9fAgQMVGhqq+Ph4LVu27JxLr3PmzFHXrl3VtGlTxcbG6pFHHlFpaamk7y/njhgxQiUlJZ5/vT355JOSvC/h3nfffbr33nu9ZqusrFSrVq20ZMkSSd+/k216erri4+MVGhqqbt266fXXX6+DnxRQPwQHBysqKkpt27bV2LFjlZSUpJUrV+r48eN68MEH1bx5c4WFhWnAgAHat2+fZ7+vv/5ad955p5o3b66mTZvqmmuu0erVqyV5/0qF8/HyRHDAuiZNmui5557T/Pnz9c9//vOc9V999ZX69++vlJQU7dy5U6+88oref/99paamerZ58MEHdeTIEW3YsEH/+7//q5dffllFRUVex/H399e8efP02WefafHixVq3bp2mTp0q6fvLuRkZGYqMjFR+fr7y8/M1ZcqUc2YZOnSo3nzzTU+oSNI777yjkydP6te//rWk7z9teMmSJcrKytJnn32mSZMm6f7779fGjRvr5OcF1DehoaE6deqUhg8fru3bt2vlypXKzc2VMUZ33HGHKisrJUnjxo1TRUWFNm3apE8//VSzZs1SeHj4OcfjfLxMGcCiYcOGmUGDBhljjLnpppvMb3/7W2OMMW+88YY588dv5MiRZvTo0V77vffee8bf39989913Zvfu3UaS2bZtm2f9vn37jCQzd+7cGr/2a6+9Zlq2bOl5vnDhQuNwOM7Zrm3btp7jVFZWmlatWpklS5Z41v/mN78x9957rzHGmPLychMWFmY2b97sdYyRI0ea3/zmN+f/YQANwNnnrNvtNmvWrDHBwcFm8ODBRpL54IMPPNsePXrUhIaGmldffdUYY0zXrl3Nk08+We1x169fbySZ48ePG2M4Hy9HjeKtzdEwzJo1S3369DnnXzKffPKJdu7cqaVLl3qWGWPkdru1f/9+7d27VwEBAerRo4dn/dVXX63mzZt7HWft2rVKT0/XF198IZfLpdOnT6u8vFwnT578yb8TDggI0D333KOlS5fqgQceUFlZmf7+979r+fLlkqQvv/xSJ0+eVL9+/bz2O3XqlK6//voL+nkA9dWqVasUHh6uyspKud1u3Xfffbrrrru0atUq9erVy7Ndy5Yt1bFjR+3evVuSNGHCBI0dO1bvvvuukpKSlJKSouuuu67Wc3A+Ni4EBy6ZW2+9VcnJyUpLS9Pw4cM9y0tLSzVmzBhNmDDhnH3atGmjvXv3/uixDxw4oF/96lcaO3asnn32WbVo0ULvv/++Ro4cqVOnTl3QTWhDhw7VbbfdpqKiIq1Zs0a
hoaHq37+/Z1ZJeuutt3TllVd67cdnQ6Cx6N27txYsWKCgoCDFxMQoICBAK1eu/NH9HnroISUnJ+utt97Su+++q/T0dL3wwgsaP358rWfhfGw8CA5cUjNnzlT37t3VsWNHz7IePXro888/19VXX13tPh07dtTp06f10Ucf6YYbbpD0/b9szn7VS15entxut1544QX5+39/a9Krr77qdZygoCBVVVX96IyJiYmKjY3VK6+8orffflt33323AgMDJUldunRRcHCwDh48qNtuu+3CvnmggWjatOk552Pnzp11+vRpbdmyRYmJiZKkb7/9Vnv27FGXLl0828XGxurhhx/Www8/rLS0NP3pT3+qNjg4Hy8/BAcuqa5du2ro0KGaN2+eZ9nvf/973XTTTUpNTdVDDz2kpk2b6vPPP9eaNWv04osvqlOnTkpKStLo0aO1YMECBQYG6rHHHlNoaKjnJXZXX321KisrNX/+fN1555364IMPlJWV5fW14+LiVFpaqpycHHXr1k1hYWE1Xvm47777lJWVpb1792r9+vWe5REREZoyZYomTZokt9utW265RSUlJfrggw8UGRmpYcOGWfipAb7Xvn17DRo0SKNGjdJLL72kiIgIPf7447ryyis1aNAgSdLEiRM1YMAAdejQQcePH9f69evVuXPnao/H+XgZ8vVNJGjczr4B7Yz9+/eboKAgc/Yfv61bt5p+/fqZ8PBw07RpU3PdddeZZ5991rP+yJEjZsCAASY4ONi0bdvWLFu2zFxxxRUmKyvLs82cOXNMdHS0CQ0NNcnJyWbJkiVeN6kZY8zDDz9sWrZsaSSZGTNmGGO8b1I74/PPPzeSTNu2bY3b7fZa53a7TUZGhunYsaMJDAw0rVu3NsnJyWbjxo0X98MC6oHqztkzjh07Zh544AHjcDg859nevXs961NTU027du1McHCwad26tXnggQfM0aNHjTHn3jRqDOfj5YaPp0eD9M9//lOxsbFau3at+vbt6+txAAA/guBAg7Bu3TqVlpaqa9euys/P19SpU3X48GHt3bvX8/tcAED9xT0caBAqKyv1b//2b/rHP/6hiIgIJSYmaunSpcQGADQQXOEAAADW8dbmAADAOoIDAABYR3AAAADrCA4AAGAdwQGg3tmwYYP8/PxUXFzs61EA1BGCA0CNvvnmG40dO1Zt2rRRcHCwoqKilJycrA8++KDOvsYvf/lLTZw40WtZYmKi8vPz5XA46uzr1Nbw4cM1ePBgX48BNHi8DweAGqWkpOjUqVNavHixrrrqKhUWFionJ0fffvut1a8bFBSkqKgoq18DwCXmy/dVB1B/HT9+3EgyGzZsOO82I0eONK1atTIRERGmd+/e5uOPP/asnzFjhunWrZtZsmSJadu2rYmMjDT33nuvcblcxpjvP7dDktdj//7953zuxsKFC43D4TBvvvmm6dChgwkNDTUpKSmmrKzMLFq0yLRt29Y0a9bMjB8/3pw+fdrz9cvLy81jjz1mYmJiTFhYmPn5z39u1q9f71l/5rjZ2dmmU6dOpmnTpiY5OdkcOXLEM/8P5zt7fwA/Hb9SAVCt8PBwhYeHa8WKFaqoqKh2m7vvvltFRUV6++23lZeXpx49eqhv3746duyYZ5uvvvpKK1as0KpVq7Rq1Spt3LhRM2fOlCT953/+pxISEjRq1Cjl5+crPz9fsbGx1X6tkydPat68eVq+fLmys7O1YcMG/frXv9bq1au1evVq/fWvf9VLL72k119/3bNPamqqcnNztXz5cu3cuVN33323+vfvr3379nkd9/nnn9df//pXbdq0SQcPHtSUKVMkSVOmTNE999yj/v37e+Y789HsAC6Qr4sHQP31+uuvm+bNm5uQkBCTmJho0tLSzCeffGKMMea9994zkZGRpry83Gufdu3amZdeeskY8/0VgrCwMM8VDWOM+d3vfmd69erleX7bbbeZRx991OsY1V3hkGS+/PJLzzZjxowxYWFh5sS
JE55lycnJZsyYMcYYY77++mvTpEkTc/jwYa9j9+3b16SlpdV43MzMTON0Oj3Pz/fpqQB+Ou7hAFCjlJQUDRw4UO+9954+/PBDvf3225o9e7b+67/+S2VlZSotLVXLli299vnuu+/01VdfeZ7HxcUpIiLC8zw6OlpFRUUXPEtYWJjatWvnee50OhUXF6fw8HCvZWeO/emnn6qqqkodOnTwOk5FRYXXzD88bm3nA3B+BAeA8woJCVG/fv3Ur18/TZs2TQ899JBmzJihRx55RNHR0dqwYcM5+zRr1szz3z/8gD0/Pz+53e4LnqO645zv2KWlpWrSpIny8vLUpEkTr+3OjpTqjmH4iCmgzhEcAC5Ily5dtGLFCvXo0UMFBQUKCAhQXFxcrY8XFBSkqqqquhvw/11//fWqqqpSUVGRfvGLX9T6OLbmAy433DQKoFrffvut+vTpo//+7//Wzp07tX//fr322muaPXu2Bg0apKSkJCUkJGjw4MF69913deDAAW3evFl/+MMftH379p/8deLi4rRlyxYdOHBAR48erdXVj+p06NBBQ4cO1YMPPqi//e1v2r9/v7Zu3ar09HS99dZbFzTfzp07tWfPHh09elSVlZV1Mh9wuSE4AFQrPDxcvXr10ty5c3Xrrbfq2muv1bRp0zRq1Ci9+OKL8vPz0+rVq3XrrbdqxIgR6tChg4YMGaKvv/5aTqfzJ3+dKVOmqEmTJurSpYtat26tgwcP1tn3sHDhQj344IN67LHH1LFjRw0ePFjbtm1TmzZtfvIxRo0apY4dO6pnz55q3bp1nb7pGXA58TP8shIAAFjGFQ4AAGAdwQEAAKwjOAAAgHUEBwAAsI7gAAAA1hEcAADAOoIDAABYR3AAAADrCA4AAGAdwQEAAKwjOAAAgHUEBwAAsO7/AEFyyRxAFnsmAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "_, ax = plt.subplots(figsize=(6, 4))\n", "\n", "# value_counts() orders by frequency; sort by label (0, 1) so the bars line up with the tick labels below\n", "dataset[\"sentiment\"].value_counts().sort_index().plot(kind=\"bar\", ax=ax)\n", "ax.set_xticklabels([\"Negative\", \"Positive\"], rotation=0)\n", "ax.set_xlabel(\"Sentiment\")\n", "ax.set_ylabel(\"Count\")\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word cloud (before tokenization)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b4a25bdbb50c49b585c563f23cd13cef", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/209 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Gather all the unique words in the dataset\n", "word_freq = Counter()\n", "dataset[\"text\"].str.lower().str.split().progress_apply(word_freq.update)\n", "\n", "# Now get the most common words\n", "common_words = word_freq.most_common(100)\n", "\n", "# Create a word cloud of the most common words\n", "wrd_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_words))\n", "\n", "# Display the word cloud\n", "plt.figure(figsize=(20, 20))\n", "plt.imshow(wrd_cloud, interpolation=\"bilinear\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word cloud (after tokenization)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9a560aa6aa15497690e4b28504a6ae44", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/209 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Gather all the unique tokens in the dataset\n", "token_freq = Counter()\n", "dataset[\"tokens\"].str.split().progress_apply(token_freq.update)\n", "\n", "# Now get the most common tokens\n", "common_tokens = 
token_freq.most_common(100)\n", "\n", "# Create a word cloud of the most common tokens\n", "tkn_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_tokens))\n", "\n", "# Display the word cloud\n", "plt.figure(figsize=(20, 20))\n", "plt.imshow(tkn_cloud, interpolation=\"bilinear\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Token association" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5db39c26bec14882a8412f5ff70e9906", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/96 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "_, axes = plt.subplots(2, 1, figsize=(20, 20))\n", "\n", "# The sentiment column is compared against the enumerate index, so 0 is drawn as Negative and 1 as Positive\n", "for label, sentiment in enumerate([\"Negative\", \"Positive\"]):\n", "    freq = Counter()\n", "    # Select rows and column in a single .loc call instead of chained indexing\n", "    dataset.loc[dataset[\"sentiment\"] == label, \"tokens\"].str.split().progress_apply(freq.update)\n", "    most_common = freq.most_common(100)\n", "\n", "    cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(most_common))\n", "    axes[label].imshow(cloud, interpolation=\"bilinear\")\n", "    axes[label].axis(\"off\")\n", "    axes[label].set_title(sentiment)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Token frequency" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "_, ax = plt.subplots(figsize=(6, 4))\n", "\n", "dataset[\"text\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"Before Tokenization\")\n", "dataset[\"tokens\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"After Tokenization\")\n", "\n", "ax.set_xlabel(\"Number of tokens\")\n", "ax.set_ylabel(\"Count\")\n", "ax.legend()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vocabulary size" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Vocabulary size before tokenization: 947\n", "Vocabulary size after tokenization: 689\n" ] } ], "source": [ "print(f\"Vocabulary size before tokenization: {len(word_freq)}\")\n", "print(f\"Vocabulary size after tokenization: {len(token_freq)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Vectorization\n", "\n", "The `count` vectorizer counts how many times each token appears in a document. The `tfidf` vectorizer starts from the same counts but weights them by inverse document frequency, so tokens that occur in many documents contribute less. The `hashing` vectorizer is memory-efficient: it maps tokens to feature indices with a hash function instead of storing a vocabulary in memory, which makes it possible to vectorize large datasets." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Define vectorizers\n", "vectorizers = {\n", " \"hashing\": _get_vectorizer(\"hashing\", n_features=2**20),\n", " \"count\": _get_vectorizer(\"count\", 20_000),\n", " \"tfidf\": _get_vectorizer(\"tfidf\", 20_000),\n", "}" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hashing: (209, 1048576)\n", "count: (209, 82)\n", "tfidf: (209, 82)\n" ] } ], "source": [ "# Fit and vectorize the tokens\n", "token_list = dataset[\"tokens\"].str.split().tolist()\n", "X = {name: vectorizer.fit_transform(token_list) for name, vectorizer in vectorizers.items()}\n", "\n", "# Display the shape of the vectorized data\n", "for name, data in X.items():\n", " print(f\"{name}: {data.shape}\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "address, allocate, asset, athlete, challenge, citizen, community, corruption, crucial, crucial role\n" ] } ], "source": [ "# Print the first 10 feature names of the count vectorizer\n", "features = vectorizers[\"count\"].get_feature_names_out()[:10]\n", "print(\", \".join(features))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Plot top 20 features of tfidf vectorizer\n", "features = vectorizers[\"tfidf\"].get_feature_names_out()\n", "terms = pd.DataFrame(X[\"tfidf\"].toarray(), columns=features)\n", "\n", "_, ax = plt.subplots(figsize=(6, 4))\n", "terms.sum().sort_values(ascending=False).head(20).plot(kind=\"bar\", ax=ax)\n", "ax.set_xlabel(\"Term\")\n", "ax.set_ylabel(\"Frequency\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Define classifiers\n", "classifiers = [\n", " (LogisticRegression(max_iter=1000, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n", " (SVC(random_state=SEED), {\"C\": np.logspace(-3, 3, 20), \"gamma\": np.logspace(-3, 3, 20)}),\n", " (KNeighborsClassifier(), {\"n_neighbors\": np.arange(1, 12)}),\n", " (DecisionTreeClassifier(random_state=SEED), {\"max_depth\": np.arange(1, 12)}),\n", " (RandomForestClassifier(random_state=SEED), {\"n_estimators\": np.arange(10, 500, 50)}),\n", " (AdaBoostClassifier(algorithm=\"SAMME\", random_state=SEED), {\"n_estimators\": np.arange(10, 500, 50)}),\n", " (GradientBoostingClassifier(random_state=SEED), {\"n_estimators\": np.arange(100, 500, 25)}),\n", " (\n", " VotingClassifier(\n", " estimators=[\n", " (\"lr\", LogisticRegression(max_iter=1000, random_state=SEED)),\n", " (\"svc\", SVC(random_state=SEED)),\n", " (\"rf\", RandomForestClassifier(random_state=SEED)),\n", " ],\n", " ),\n", " {\n", " \"lr__C\": np.logspace(-3, 3, 20),\n", " \"svc__C\": np.logspace(-3, 3, 20),\n", " \"svc__gamma\": np.logspace(-3, 3, 20),\n", " \"rf__n_estimators\": np.arange(10, 500, 50),\n", " },\n", " ),\n", " (\n", " MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=1000, random_state=SEED),\n", " {\"learning_rate_init\": np.logspace(-3, 0, 20), \"batch_size\": [32, 64, 128]},\n", " ),\n", "]" ] }, { "cell_type": "code", 
"execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Split the data into training and testing sets (using TF-IDF vectorizer)\n", "X_train, X_test, y_train, y_test = train_test_split(X[\"tfidf\"], dataset[\"sentiment\"], test_size=0.2, random_state=SEED)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Define the cross-validation strategy\n", "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b7251857c51f41699ab2a750855875ac", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/9 [00:00 np.ndarray:\n", " group_names = [\"True Neg\", \"False Pos\", \"False Neg\", \"True Pos\"]\n", " group_percentages = [f\"{value:.2%}\" for value in cm.flatten() / cm.sum()]\n", " labels = [f\"{v1}\\n{v2}\" for v1, v2 in zip(group_names, group_percentages)]\n", " return np.asarray(labels).reshape(2, 2)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ab4d2fe86927402bbcb49bcd0e19eca7", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/9 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig = plt.figure(figsize=(20, 20))\n", "N_COLS = 3\n", "# Ceiling division: the grid stays large enough when the classifier count is not a multiple of N_COLS\n", "N_ROWS = -(-len(estimators) // N_COLS)\n", "\n", "results = pd.DataFrame(columns=[\"Classifier\", \"Accuracy\", \"Precision\", \"Recall\", \"F1\", \"ROC AUC\", \"Time\", \"Memory\"])\n", "for i, (clf, tim, mem) in enumerate(tqdm(estimators, unit=\"clf\")):\n", "    name = clf.__class__.__name__\n", "\n", "    # Predict\n", "    y_pred = clf.predict(X_test)\n", "\n", "    # ROC AUC ranks samples, so score it on continuous outputs whenever the estimator exposes them.\n", "    # With hard 0/1 predictions it collapses to balanced accuracy; that is kept only as a last resort.\n", "    if hasattr(clf, \"predict_proba\"):\n", "        y_score = clf.predict_proba(X_test)[:, 1]  # assumes clf.classes_ == [0, 1], i.e. column 1 is the positive class\n", "    elif hasattr(clf, \"decision_function\"):\n", "        y_score = clf.decision_function(X_test)\n", "    else:\n", "        y_score = y_pred\n", "\n", "    # Calculate metrics\n", "    accuracy = accuracy_score(y_test, y_pred)\n", "    precision = precision_score(y_test, y_pred)\n", "    recall = recall_score(y_test, y_pred)\n", "    f1 = f1_score(y_test, y_pred)\n", "    roc_auc = roc_auc_score(y_test, y_score)\n", "    cm = confusion_matrix(y_test, y_pred)\n", "\n", "    # Plot confusion matrix\n", "    ax = plt.subplot(N_ROWS, N_COLS, i + 1, aspect=\"equal\")\n", "    ax.grid(False)\n", "    ax.set_title(f\"{name} | Accuracy: {accuracy:.2%}\")\n", "\n", "    labels = [\"Negative\", \"Positive\"]\n", "    sns.heatmap(\n", "        cm,\n", "        xticklabels=labels if i // N_COLS == N_ROWS - 1 else [],\n", "        yticklabels=labels if i % N_COLS == 0 else [],\n", "        annot=cm_annotations(cm),\n", "        square=True,\n", "        cbar=False,\n", "        cmap=\"viridis\",\n", "        linewidths=0.5,\n", "        fmt=\"\",\n", "        ax=ax,\n", "    )\n", "\n", "    # Save results (memory is stored in KB)\n", "    results.loc[i] = [name, accuracy, precision, recall, f1, roc_auc, tim, mem // 1024]\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Sort the results by accuracy\n", "results = results.sort_values(\"Accuracy\", ascending=False).reset_index(drop=True)\n", "\n", "# Save the results to CSV\n", "output_results = results.copy()\n", "output_results.columns = output_results.columns.str.lower().str.replace(\" \", \"_\")\n", "output_results = output_results.rename(columns={\"time\": \"time_seconds\", \"memory\": \"memory_kb\"})\n", "output_results.to_csv(DATA_DIR / \"proto_results.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 ClassifierAccuracyPrecisionRecallF1ROC AUCTimeMemory
0RandomForestClassifier80.95%78.26%85.71%81.82%80.95%97.70s504KB
1VotingClassifier76.19%76.19%76.19%76.19%76.19%69.08s1288KB
2LogisticRegression73.81%72.73%76.19%74.42%73.81%0.96s310KB
3SVC71.43%73.68%66.67%70.00%71.43%0.56s244KB
4MLPClassifier71.43%73.68%66.67%70.00%71.43%20.38s1208KB
5AdaBoostClassifier69.05%65.38%80.95%72.34%69.05%93.89s643KB
6GradientBoostingClassifier69.05%65.38%80.95%72.34%69.05%46.88s899KB
7KNeighborsClassifier64.29%68.75%52.38%59.46%64.29%0.46s293KB
8DecisionTreeClassifier61.90%57.14%95.24%71.43%61.90%0.53s324KB
\n" ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Display the results\n", "results.style.format(\n", " {\n", " \"Accuracy\": \"{:.2%}\",\n", " \"Precision\": \"{:.2%}\",\n", " \"Recall\": \"{:.2%}\",\n", " \"F1\": \"{:.2%}\",\n", " \"ROC AUC\": \"{:.2%}\",\n", " \"Time\": \"{:.2f}s\",\n", " \"Memory\": \"{}KB\",\n", " },\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }