{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sentiment Analysis" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from __future__ import annotations\n", "\n", "from typing import TYPE_CHECKING\n", "\n", "if TYPE_CHECKING:\n", " from sklearn.base import BaseEstimator\n", "\n", "import re\n", "import warnings\n", "from functools import cache\n", "\n", "import matplotlib.pyplot as plt\n", "import nltk\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from joblib import Memory\n", "from nltk.corpus import stopwords\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.model_selection import RandomizedSearchCV, train_test_split\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.svm import SVC\n", "\n", "from app.constants import CACHE_DIR, SENTIMENT140_PATH\n", "from app.model import TextCleaner, TextLemmatizer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "SEED = 42\n", "MAX_FEATURES = 20000" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/tymec/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package stopwords to /home/tymec/nltk_data...\n", "[nltk_data] Unzipping corpora/stopwords.zip.\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download(\"wordnet\")\n", "nltk.download(\"stopwords\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | target | \n", "id | \n", "date | \n", "flag | \n", "user | \n", "text | \n", "sentiment | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "1467810369 | \n", "Mon Apr 06 22:19:45 PDT 2009 | \n", "NO_QUERY | \n", "_TheSpecialOne_ | \n", "@switchfoot http://twitpic.com/2y1zl - Awww, t... | \n", "negative | \n", "
1 | \n", "0 | \n", "1467810672 | \n", "Mon Apr 06 22:19:49 PDT 2009 | \n", "NO_QUERY | \n", "scotthamilton | \n", "is upset that he can't update his Facebook by ... | \n", "negative | \n", "
2 | \n", "0 | \n", "1467810917 | \n", "Mon Apr 06 22:19:53 PDT 2009 | \n", "NO_QUERY | \n", "mattycus | \n", "@Kenichan I dived many times for the ball. Man... | \n", "negative | \n", "
3 | \n", "0 | \n", "1467811184 | \n", "Mon Apr 06 22:19:57 PDT 2009 | \n", "NO_QUERY | \n", "ElleCTF | \n", "my whole body feels itchy and like its on fire | \n", "negative | \n", "
4 | \n", "0 | \n", "1467811193 | \n", "Mon Apr 06 22:19:57 PDT 2009 | \n", "NO_QUERY | \n", "Karoli | \n", "@nationwideclass no, it's not behaving at all.... | \n", "negative | \n", "
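{ "cell_type": "markdown", "metadata": {}, "source": [ "Before modelling, it is worth checking the class balance. The cell below is a small sketch that assumes `sentiment` was derived as above; the full Sentiment140 corpus is roughly evenly split between negative and positive tweets." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Sketch: inspect the class balance of the derived sentiment column\n",
"df[\"sentiment\"].value_counts()" ] },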
\n", " | word | \n", "count | \n", "
---|---|---|
0 | \n", "i | \n", "750749 | \n", "
1 | \n", "to | \n", "564469 | \n", "
2 | \n", "the | \n", "520036 | \n", "
3 | \n", "a | \n", "377506 | \n", "
4 | \n", "my | \n", "314024 | \n", "
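{ "cell_type": "markdown", "metadata": {}, "source": [ "The most frequent tokens (*i*, *to*, *the*, *a*, *my*) are all English stop words, which is why the vectorizer below is handed NLTK's stop-word list. As a quick sketch (reusing the `word_counts` frame from the previous cell), filtering them out surfaces content-bearing words instead:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Sketch: drop NLTK stop words from the frequency table\n",
"stop_words = set(stopwords.words(\"english\"))\n",
"word_counts[~word_counts[\"word\"].isin(stop_words)].head()" ] },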
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Pipeline" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [
"Pipeline(memory=Memory(location=.cache),\n",
"         steps=[('clean', TextCleaner()), ('lemma', TextLemmatizer()),\n",
"                ('vectorize',\n",
"                 CountVectorizer(max_features=20000, ngram_range=(1, 2),\n",
"                                 stop_words=['i', 'me', 'my', 'myself', 'we',\n",
"                                             'our', 'ours', 'ourselves', 'you',\n",
"                                             \"you're\", \"you've\", \"you'll\",\n",
"                                             \"you'd\", 'your', 'yours',\n",
"                                             'yourself', 'yourselves', 'he',\n",
"                                             'him', 'his', 'himself', 'she',\n",
"                                             \"she's\", 'her', 'hers', 'herself',\n",
"                                             'it', \"it's\", 'its', 'itself', ...])),\n",
"                ('tfidf', TfidfTransformer())],\n",
"         verbose=True)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Feature-extraction pipeline: clean -> lemmatize -> bag of 1-2-grams -> tf-idf.\n",
"# NOTE: source reconstructed from the printed repr below; intermediate fit\n",
"# results are cached on disk via joblib so repeated runs can reuse them.\n",
"pipeline = Pipeline(\n",
"    memory=Memory(CACHE_DIR, verbose=0),\n",
"    steps=[\n",
"        (\"clean\", TextCleaner()),\n",
"        (\"lemma\", TextLemmatizer()),\n",
"        (\n",
"            \"vectorize\",\n",
"            CountVectorizer(\n",
"                max_features=MAX_FEATURES,\n",
"                ngram_range=(1, 2),\n",
"                stop_words=stopwords.words(\"english\"),\n",
"            ),\n",
"        ),\n",
"        (\"tfidf\", TfidfTransformer()),\n",
"    ],\n",
"    verbose=True,\n",
")\n",
"pipeline" ] },
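{ "cell_type": "markdown", "metadata": {}, "source": [ "As a minimal sketch of how the pipeline gets used (the split size and classifier settings below are illustrative, not tuned values, and it assumes `TextCleaner`/`TextLemmatizer` accept an iterable of raw strings): vectorize the tweets, fit one of the imported classifiers, and inspect the confusion matrix." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Sketch: train/test split, fit features and classifier, plot the confusion matrix\n",
"x_train, x_test, y_train, y_test = train_test_split(\n",
"    df[\"text\"], df[\"sentiment\"], test_size=0.2, random_state=SEED\n",
")\n",
"\n",
"features_train = pipeline.fit_transform(x_train)\n",
"features_test = pipeline.transform(x_test)\n",
"\n",
"clf = LogisticRegression(max_iter=1000, random_state=SEED)\n",
"clf.fit(features_train, y_train)\n",
"\n",
"cm = confusion_matrix(y_test, clf.predict(features_test))\n",
"sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
"plt.xlabel(\"predicted\")\n",
"plt.ylabel(\"actual\")\n",
"plt.show()" ] },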