{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentiment Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports, constants and setup"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"import joblib\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import StratifiedKFold, train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from tqdm.notebook import tqdm\n",
"from wordcloud import WordCloud\n",
"\n",
"from app.constants import CACHE_DIR\n",
"from app.data import load_data, tokenize\n",
"from app.model import _get_vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tqdm.pandas()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"SEED = 42\n",
"CACHE = joblib.Memory(CACHE_DIR, verbose=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data loading"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" MC, happy mother`s day to your mom ;).. love yah | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" A year from now is graduation....i am pretty s... | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" Great for organising my work life balance | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" remember the guy who 1st #tweetbud you! ~> _2... | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" She! Maybe that was our first mistake. Not e... | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text sentiment\n",
"0 MC, happy mother`s day to your mom ;).. love yah 1\n",
"1 A year from now is graduation....i am pretty s... 0\n",
"2 Great for organising my work life balance 1\n",
"3 remember the guy who 1st #tweetbud you! ~> _2... 1\n",
"4 She! Maybe that was our first mistake. Not e... 0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load data and convert to pandas DataFrame\n",
"text_data, label_data = load_data(\"test\")\n",
"dataset = pd.DataFrame({\"text\": text_data, \"sentiment\": label_data})\n",
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Cleaning: 100%|██████████| 3276/3276 [00:02<00:00, 1205.57doc/s]\n",
"Lemmatization: 100%|██████████| 3276/3276 [00:06<00:00, 508.76doc/s] \n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" sentiment | \n",
" tokens | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" MC, happy mother`s day to your mom ;).. love yah | \n",
" 1 | \n",
" happy mother day mom love yah | \n",
"
\n",
" \n",
" 1 | \n",
" A year from now is graduation....i am pretty s... | \n",
" 0 | \n",
" year graduationi pretty sure ready | \n",
"
\n",
" \n",
" 2 | \n",
" Great for organising my work life balance | \n",
" 1 | \n",
" great organise work life balance | \n",
"
\n",
" \n",
" 3 | \n",
" remember the guy who 1st #tweetbud you! ~> _2... | \n",
" 1 | \n",
" remember guy help flwrs smile | \n",
"
\n",
" \n",
" 4 | \n",
" She! Maybe that was our first mistake. Not e... | \n",
" 0 | \n",
" maybe mistake cool brown nose moment | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text sentiment \\\n",
"0 MC, happy mother`s day to your mom ;).. love yah 1 \n",
"1 A year from now is graduation....i am pretty s... 0 \n",
"2 Great for organising my work life balance 1 \n",
"3 remember the guy who 1st #tweetbud you! ~> _2... 1 \n",
"4 She! Maybe that was our first mistake. Not e... 0 \n",
"\n",
" tokens \n",
"0 happy mother day mom love yah \n",
"1 year graduationi pretty sure ready \n",
"2 great organise work life balance \n",
"3 remember guy help flwrs smile \n",
"4 maybe mistake cool brown nose moment "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tokenize text data\n",
"tokens = tokenize(dataset[\"text\"].tolist(), batch_size=1024, n_jobs=2, show_progress=True)\n",
"dataset[\"tokens\"] = tokens.apply(\" \".join)\n",
"dataset.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data exploration"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sentiment distribution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_, ax = plt.subplots(figsize=(6, 4))\n",
"\n",
"dataset[\"sentiment\"].value_counts().plot(kind=\"bar\", ax=ax)\n",
"ax.set_xticklabels([\"Negative\", \"Positive\"], rotation=0)\n",
"ax.set_xlabel(\"Sentiment\")\n",
"ax.set_ylabel(\"Count\")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Word cloud (before tokenization)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gather all the unique words in the dataset\n",
"word_freq = Counter()\n",
"dataset[\"text\"].str.lower().str.split().progress_apply(word_freq.update)\n",
"\n",
"# Now get the most common words\n",
"common_words = word_freq.most_common(100)\n",
"\n",
"# Create a word cloud of the most common words\n",
"wrd_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_words))\n",
"\n",
"# Display the word cloud\n",
"plt.figure(figsize=(20, 20))\n",
"plt.imshow(wrd_cloud, interpolation=\"bilinear\")\n",
"plt.axis(\"off\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Word cloud (after tokenization)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gather all the unique tokens in the dataset\n",
"token_freq = Counter()\n",
"dataset[\"tokens\"].str.split().progress_apply(token_freq.update)\n",
"\n",
"# Now get the most common tokens\n",
"common_tokens = token_freq.most_common(100)\n",
"\n",
"# Create a word cloud of the most common tokens\n",
"tkn_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_tokens))\n",
"\n",
"# Display the word cloud\n",
"plt.figure(figsize=(20, 20))\n",
"plt.imshow(tkn_cloud, interpolation=\"bilinear\")\n",
"plt.axis(\"off\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Token association"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_, ax = plt.subplots(2, 1, figsize=(20, 20))\n",
"\n",
"for i, sentiment in enumerate([\"Negative\", \"Positive\"]):\n",
" freq = Counter()\n",
" dataset[dataset[\"sentiment\"] == i][\"tokens\"].str.split().progress_apply(freq.update)\n",
" most_common = freq.most_common(100)\n",
"\n",
" cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(most_common))\n",
" ax[i].imshow(cloud, interpolation=\"bilinear\")\n",
" ax[i].axis(\"off\")\n",
" ax[i].set_title(sentiment)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Token frequency"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_, ax = plt.subplots(figsize=(6, 4))\n",
"\n",
"dataset[\"text\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"Before Tokenization\")\n",
"dataset[\"tokens\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"After Tokenization\")\n",
"\n",
"ax.set_xlabel(\"Number of tokens\")\n",
"ax.set_ylabel(\"Count\")\n",
"ax.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Vocabulary size"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Vocabulary size before tokenization: {len(word_freq)}\")\n",
"print(f\"Vocabulary size after tokenization: {len(token_freq)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vectorization\n",
"\n",
"The `count` vectorizer is a simple vectorizer that counts the number of times a token appears in a document. The `tfidf` does the same as `count` but also normalizes the counts by the inverse document frequency. The `hashing` vectorizer is a memory efficient vectorizer that uses a hash function to map tokens to features. The `hashing` vectorizer does not store the vocabulary in memory, which makes it possible to vectorize large datasets."
]
},
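{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the next cell is a minimal sketch of the three options using scikit-learn directly. `_get_vectorizer` presumably wraps classes like these; its internals and the exact parameters shown are assumptions, not taken from `app.model`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reference sketch: the three vectorizer flavours in plain scikit-learn.\n",
"# NOTE: _get_vectorizer presumably wraps classes like these; its internals are assumed.\n",
"from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer\n",
"\n",
"docs = [\"happy mother day mom love yah\", \"great organise work life balance\"]\n",
"\n",
"counts = CountVectorizer(max_features=20_000).fit_transform(docs)  # raw term counts\n",
"tfidf = TfidfVectorizer(max_features=20_000).fit_transform(docs)  # counts re-weighted by IDF\n",
"hashed = HashingVectorizer(n_features=2**20).fit_transform(docs)  # stateless; no stored vocabulary\n",
"\n",
"print(counts.shape, tfidf.shape, hashed.shape)"
]
},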
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Define vectorizers\n",
"vectorizers = {\n",
" \"hashing\": _get_vectorizer(\"hashing\", n_features=2**20),\n",
" \"count\": _get_vectorizer(\"count\", 20_000),\n",
" \"tfidf\": _get_vectorizer(\"tfidf\", 20_000),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hashing: (3276, 1048576)\n",
"count: (3276, 1084)\n",
"tfidf: (3276, 1084)\n"
]
}
],
"source": [
"# Fit and vectorize the tokens\n",
"token_list = dataset[\"tokens\"].str.split().tolist()\n",
"X = {name: vectorizer.fit_transform(token_list) for name, vectorizer in vectorizers.items()}\n",
"\n",
"# Display the shape of the vectorized data\n",
"for name, data in X.items():\n",
" print(f\"{name}: {data.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ability' 'able' 'absolutely' 'access' 'accomplish' 'account' 'ace'\n",
" 'active' 'activity' 'actually']\n"
]
}
],
"source": [
"# Print the first 10 features of count and tfidf vectorizers\n",
"features = vectorizers[\"count\"].get_feature_names_out()[:10]\n",
"print(features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Classification"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Define classifiers\n",
"classifiers = [\n",
" (LogisticRegression(max_iter=1000, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n",
" (LinearSVC(max_iter=10000, dual=False, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n",
" (KNeighborsClassifier(), {\"n_neighbors\": np.arange(1, 10)}),\n",
" (DecisionTreeClassifier(random_state=SEED), {\"max_depth\": np.arange(1, 10)}),\n",
" (RandomForestClassifier(random_state=SEED), {\"n_estimators\": np.arange(10, 500, 50)}),\n",
" (GradientBoostingClassifier(random_state=SEED), {\"n_estimators\": np.arange(100, 500, 25)}),\n",
" (\n",
" VotingClassifier(\n",
" estimators=[\n",
" (\"lr\", LogisticRegression(max_iter=1000, random_state=SEED)),\n",
" (\"svc\", LinearSVC(max_iter=10000, dual=False, random_state=SEED)),\n",
" (\"rf\", RandomForestClassifier(random_state=SEED)),\n",
" ],\n",
" ),\n",
" {\n",
" \"lr__C\": np.logspace(-3, 3, 20),\n",
" \"svc__C\": np.logspace(-3, 3, 20),\n",
" \"rf__n_estimators\": np.arange(10, 500, 50),\n",
" },\n",
" ),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Split the data into training and testing sets\n",
"X_split = {}\n",
"for name, data in X.items():\n",
" X_train, X_test, y_train, y_test = train_test_split(data, dataset[\"sentiment\"], test_size=0.2, random_state=SEED)\n",
" X_split[name] = (X_train, X_test, y_train, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Define the cross-validation strategy\n",
"kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Search"
]
},
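{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is a minimal sketch of the intended search, assuming a randomized search over each vectorizer/classifier pair defined above. The use of `RandomizedSearchCV`, the `n_iter` budget and the `accuracy` scoring are assumptions, not part of the original pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal search sketch (assumption: a randomized search over the grids defined above).\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"\n",
"results = {}\n",
"for vec_name, (X_train, X_test, y_train, y_test) in X_split.items():\n",
"    for clf, params in tqdm(classifiers, desc=vec_name):\n",
"        search = RandomizedSearchCV(\n",
"            clf,\n",
"            params,\n",
"            n_iter=10,  # per-model search budget; an assumption\n",
"            cv=kfold,\n",
"            scoring=\"accuracy\",\n",
"            random_state=SEED,\n",
"            n_jobs=-1,\n",
"        )\n",
"        search.fit(X_train, y_train)\n",
"        results[(vec_name, type(clf).__name__)] = {\n",
"            \"cv_score\": search.best_score_,\n",
"            \"test_score\": search.best_estimator_.score(X_test, y_test),\n",
"            \"estimator\": search.best_estimator_,\n",
"        }\n",
"\n",
"pd.DataFrame(results).T.sort_values(\"cv_score\", ascending=False)"
]
},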
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
}
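,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is a minimal evaluation sketch that assumes the `results` dictionary produced by the search sketch above: it selects the pipeline with the best cross-validation score and reports precision, recall, F1 and a confusion matrix on the held-out split."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal evaluation sketch, building on the `results` from the search sketch above.\n",
"from sklearn.metrics import ConfusionMatrixDisplay, classification_report\n",
"\n",
"# Pick the (vectorizer, classifier) pair with the best cross-validation score\n",
"best_key = max(results, key=lambda k: results[k][\"cv_score\"])\n",
"best_model = results[best_key][\"estimator\"]\n",
"_, X_test, _, y_test = X_split[best_key[0]]\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"print(f\"Best pipeline: {best_key[0]} + {best_key[1]}\")\n",
"print(classification_report(y_test, y_pred, target_names=[\"Negative\", \"Positive\"]))\n",
"\n",
"ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=[\"Negative\", \"Positive\"])\n",
"plt.show()"
]
}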
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}