{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sentiment Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports, constants and setup" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from collections import Counter\n", "\n", "import joblib\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import StratifiedKFold, train_test_split\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.svm import LinearSVC\n", "from sklearn.tree import DecisionTreeClassifier\n", "from tqdm.notebook import tqdm\n", "from wordcloud import WordCloud\n", "\n", "from app.constants import CACHE_DIR\n", "from app.data import load_data, tokenize\n", "from app.model import _get_vectorizer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "tqdm.pandas()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "SEED = 42\n", "CACHE = joblib.Memory(CACHE_DIR, verbose=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data loading" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsentiment
0MC, happy mother`s day to your mom ;).. love yah1
1A year from now is graduation....i am pretty s...0
2Great for organising my work life balance1
3remember the guy who 1st #tweetbud you! ~> _2...1
4She! Maybe that was our first mistake. Not e...0
\n", "
" ], "text/plain": [ " text sentiment\n", "0 MC, happy mother`s day to your mom ;).. love yah 1\n", "1 A year from now is graduation....i am pretty s... 0\n", "2 Great for organising my work life balance 1\n", "3 remember the guy who 1st #tweetbud you! ~> _2... 1\n", "4 She! Maybe that was our first mistake. Not e... 0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load data and convert to pandas DataFrame\n", "text_data, label_data = load_data(\"test\")\n", "dataset = pd.DataFrame({\"text\": text_data, \"sentiment\": label_data})\n", "dataset.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Cleaning: 100%|██████████| 3276/3276 [00:02<00:00, 1205.57doc/s]\n", "Lemmatization: 100%|██████████| 3276/3276 [00:06<00:00, 508.76doc/s] \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsentimenttokens
0MC, happy mother`s day to your mom ;).. love yah1happy mother day mom love yah
1A year from now is graduation....i am pretty s...0year graduationi pretty sure ready
2Great for organising my work life balance1great organise work life balance
3remember the guy who 1st #tweetbud you! ~> _2...1remember guy help flwrs smile
4She! Maybe that was our first mistake. Not e...0maybe mistake cool brown nose moment
\n", "
" ], "text/plain": [ " text sentiment \\\n", "0 MC, happy mother`s day to your mom ;).. love yah 1 \n", "1 A year from now is graduation....i am pretty s... 0 \n", "2 Great for organising my work life balance 1 \n", "3 remember the guy who 1st #tweetbud you! ~> _2... 1 \n", "4 She! Maybe that was our first mistake. Not e... 0 \n", "\n", " tokens \n", "0 happy mother day mom love yah \n", "1 year graduationi pretty sure ready \n", "2 great organise work life balance \n", "3 remember guy help flwrs smile \n", "4 maybe mistake cool brown nose moment " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tokenize text data\n", "tokens = tokenize(dataset[\"text\"].tolist(), batch_size=1024, n_jobs=2, show_progress=True)\n", "dataset[\"tokens\"] = tokens.apply(\" \".join)\n", "dataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data exploration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sentiment distribution" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "_, ax = plt.subplots(figsize=(6, 4))\n", "\n", "dataset[\"sentiment\"].value_counts().plot(kind=\"bar\", ax=ax)\n", "ax.set_xticklabels([\"Negative\", \"Positive\"], rotation=0)\n", "ax.set_xlabel(\"Sentiment\")\n", "ax.set_ylabel(\"Count\")\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word cloud (before tokenization)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Gather all the unique words in the dataset\n", "word_freq = Counter()\n", "dataset[\"text\"].str.lower().str.split().progress_apply(word_freq.update)\n", "\n", "# Now get the most common words\n", "common_words = word_freq.most_common(100)\n", "\n", "# Create a word cloud of the most common words\n", "wrd_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_words))\n", "\n", "# Display the word cloud\n", "plt.figure(figsize=(20, 20))\n", "plt.imshow(wrd_cloud, interpolation=\"bilinear\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word cloud (after tokenization)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Gather all the unique tokens in the dataset\n", "token_freq = Counter()\n", "dataset[\"tokens\"].str.split().progress_apply(token_freq.update)\n", "\n", "# Now get the most common tokens\n", "common_tokens = token_freq.most_common(100)\n", "\n", "# Create a word cloud of the most common tokens\n", "tkn_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_tokens))\n", "\n", "# Display the word cloud\n", "plt.figure(figsize=(20, 20))\n", "plt.imshow(tkn_cloud, interpolation=\"bilinear\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Token association" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "_, ax = plt.subplots(2, 1, figsize=(20, 20))\n", "\n", "for i, sentiment in enumerate([\"Negative\", \"Positive\"]):\n", " freq = Counter()\n", " dataset[dataset[\"sentiment\"] == i][\"tokens\"].str.split().progress_apply(freq.update)\n", " most_common = freq.most_common(100)\n", "\n", " cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(most_common))\n", " ax[i].imshow(cloud, interpolation=\"bilinear\")\n", 
" ax[i].axis(\"off\")\n", " ax[i].set_title(sentiment)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Token frequency" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "_, ax = plt.subplots(figsize=(6, 4))\n", "\n", "dataset[\"text\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"Before Tokenization\")\n", "dataset[\"tokens\"].str.split().str.len().plot(kind=\"hist\", bins=50, ax=ax, alpha=0.5, label=\"After Tokenization\")\n", "\n", "ax.set_xlabel(\"Number of tokens\")\n", "ax.set_ylabel(\"Count\")\n", "ax.legend()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vocabulary size" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Vocabulary size before tokenization: {len(word_freq)}\")\n", "print(f\"Vocabulary size after tokenization: {len(token_freq)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Vectorization\n", "\n", "The `count` vectorizer is a simple vectorizer that counts the number of times a token appears in a document. The `tfidf` does the same as `count` but also normalizes the counts by the inverse document frequency. The `hashing` vectorizer is a memory efficient vectorizer that uses a hash function to map tokens to features. The `hashing` vectorizer does not store the vocabulary in memory, which makes it possible to vectorize large datasets." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Define vectorizers\n", "vectorizers = {\n", " \"hashing\": _get_vectorizer(\"hashing\", n_features=2**20),\n", " \"count\": _get_vectorizer(\"count\", 20_000),\n", " \"tfidf\": _get_vectorizer(\"tfidf\", 20_000),\n", "}" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hashing: (3276, 1048576)\n", "count: (3276, 1084)\n", "tfidf: (3276, 1084)\n" ] } ], "source": [ "# Fit and vectorize the tokens\n", "token_list = dataset[\"tokens\"].str.split().tolist()\n", "X = {name: vectorizer.fit_transform(token_list) for name, vectorizer in vectorizers.items()}\n", "\n", "# Display the shape of the vectorized data\n", "for name, data in X.items():\n", " print(f\"{name}: {data.shape}\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['ability' 'able' 'absolutely' 'access' 'accomplish' 'account' 'ace'\n", " 'active' 'activity' 'actually']\n" ] } ], "source": [ "# Print the first 10 features of count and tfidf vectorizers\n", "features = vectorizers[\"count\"].get_feature_names_out()[:10]\n", "print(features)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Define classifiers\n", "classifiers = [\n", " (LogisticRegression(max_iter=1000, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n", " (LinearSVC(max_iter=10000, dual=False, random_state=SEED), {\"C\": np.logspace(-3, 3, 20)}),\n", " (KNeighborsClassifier(), {\"n_neighbors\": np.arange(1, 10)}),\n", " (DecisionTreeClassifier(random_state=SEED), {\"max_depth\": np.arange(1, 10)}),\n", " (RandomForestClassifier(random_state=SEED), {\"n_estimators\": np.arange(10, 500, 50)}),\n", " (GradientBoostingClassifier(random_state=SEED), {\"n_estimators\": np.arange(100, 500, 25)}),\n", 
" (\n", " VotingClassifier(\n", " estimators=[\n", " (\"lr\", LogisticRegression(max_iter=1000, random_state=SEED)),\n", " (\"svc\", LinearSVC(max_iter=10000, dual=False, random_state=SEED)),\n", " (\"rf\", RandomForestClassifier(random_state=SEED)),\n", " ],\n", " ),\n", " {\n", " \"lr__C\": np.logspace(-3, 3, 20),\n", " \"svc__C\": np.logspace(-3, 3, 20),\n", " \"rf__n_estimators\": np.arange(10, 500, 50),\n", " },\n", " ),\n", "]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Split the data into training and testing sets\n", "X_split = {}\n", "for name, data in X.items():\n", " X_train, X_test, y_train, y_test = train_test_split(data, dataset[\"sentiment\"], test_size=0.2, random_state=SEED)\n", " X_split[name] = (X_train, X_test, y_train, y_test)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Define the cross-validation strategy\n", "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Search" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }