# Workshop: Sentiment Analysis

<div>
<img src="https://lh3.googleusercontent.com/pw/ADCreHdzakFbNdHwBE1ZrwOiNCQibViWOir9DF9Dv4fbZEdWpx4mzFOT_RxkUGLTyDW7fQ0OwEyNQwqllupbvm0WiU0RNuFs-kWx1fTIvjiSkPGE5m64PilOIeApxQLwX_rl-JU7uYT-ROxdppIsJimCeos=w406-h451-s-no-gm?authuser=0" width="390"/>
</div>

In [1]:
ls

[0m[01;34msample_data[0m/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd "/content/drive/MyDrive/689-WorkShop/Ass13-SemtimentAna"

/content/drive/MyDrive/689-WorkShop/Ass13-SemtimentAna


In [4]:
ls

imdb_reviews.csv  WorkshopSentimentsAna-65130700309.ipynb  WorkshopSentimentsAna-std.ipynb


In [5]:
!pip install nltk
!pip install transformers



## Rule-Based Approaches

- **Lexicon-Based Methods**: Use sentiment lexicons or dictionaries that contain words annotated with their sentiment polarity (positive, negative, neutral).
- **Pattern Matching**: Identify sentiment based on predefined patterns or rules in the text.


In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Resources needed by the rule-based section below.
nltk.download('stopwords')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' models; fetch both so the notebook runs on either
# old or new NLTK versions (nltk is installed unpinned above).
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
text = "I had a good experience with the product. Highly recommended!"

In [11]:
tokens = word_tokenize(text.lower())

In [12]:
print(tokens)

['i', 'had', 'a', 'good', 'experience', 'with', 'the', 'product', '.', 'highly', 'recommended', '!']


In [13]:
stop_words = set(stopwords.words('english'))

In [14]:
tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  #alnum = alphanumeric

In [15]:
print(tokens)

['good', 'experience', 'product', 'highly', 'recommended']


In [16]:
# Sample positive and negative words
positive_words = set(['good', 'awesome', 'excellent', 'happy', 'positive'])
negative_words = set(['bad', 'terrible', 'poor', 'unhappy', 'negative'])

def rule_based_sentiment_analysis(text):
    """Label ``text`` as 'Positive', 'Negative' or 'Neutral' using word counts.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not alphanumeric or that are English stopwords are discarded. The
    score is the number of remaining tokens found in ``positive_words`` minus
    the number found in ``negative_words``.

    Args:
        text: The string to classify.

    Returns:
        'Positive' if the score is above zero, 'Negative' if it is below
        zero, and 'Neutral' when the two counts are equal.
    """
    # Lower-case first so matching against the lexicons is case-insensitive
    raw_tokens = word_tokenize(text.lower())

    # Keep only alphanumeric tokens that are not English stopwords
    english_stops = set(stopwords.words('english'))
    kept = []
    for tok in raw_tokens:
        if tok.isalnum() and tok not in english_stops:
            kept.append(tok)

    # Score = (# positive hits) - (# negative hits)
    positive_hits = len([tok for tok in kept if tok in positive_words])
    negative_hits = len([tok for tok in kept if tok in negative_words])
    balance = positive_hits - negative_hits

    # Map the sign of the score to a label
    if balance > 0:
        return 'Positive'
    if balance < 0:
        return 'Negative'
    return 'Neutral'

# Example usage
text_to_analyze = "I had a good experience with the product. Highly recommended!"
sentiment_result = rule_based_sentiment_analysis(text_to_analyze)
print(f"Sentiment: {sentiment_result}")

Sentiment: Positive


## Machine Learning Approaches

### Import packages

In [17]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix



### TF-IDF vectorizer


<div>
<img src="https://www.kdnuggets.com/wp-content/uploads/awan_convert_text_documents_tfidf_matrix_tfidfvectorizer_3.png" width="590"/>
</div>


Image source: https://www.kdnuggets.com/2022/09/convert-text-documents-tfidf-matrix-tfidfvectorizer.html






##### Example on a small dataset

In [18]:


# Sample data
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

In [19]:
# Create a DataFrame for better visualization
df = pd.DataFrame({'Text': documents})

In [20]:
df

Unnamed: 0,Text
0,This is the first document.
1,This document is the second document.
2,And this is the third one.
3,Is this the first document?


In [21]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Text'].tolist())

In [22]:
# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [23]:
# Print the original data
print("Original Data:")
print(df)

Original Data:
                                    Text
0            This is the first document.
1  This document is the second document.
2             And this is the third one.
3            Is this the first document?


In [24]:
print(tfidf_matrix)

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


In [25]:
# Print the TF-IDF matrix
print("\nTF-IDF Matrix:")
print(tfidf_df)


TF-IDF Matrix:
        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


## Naive Bayes classifier trained on the TF-IDF features.

<div>
<img src="https://lh3.googleusercontent.com/pw/AP1GczOBxYJIFFWIiKJfFMdSp10-P1FNc6fZx7Wxa2NKpgw36p0Lady_mBMgxnKXS4cRbHwH2rnItp3cjvgIYdAN_LDcLlEzf1-8AJ62ybBOEGe8QKyH-CAojll13TKEDAo3yxwKvDh7wJRv2Rf1u7wWTjDI=w1911-h485-s-no-gm?authuser=0" width="800"/>
</div>


### Read data/Preparation

In [26]:
# df = pd.read_csv("Womens_Clothing_E_Commerce_Reviews.csv")
df = pd.read_csv("imdb_reviews.csv")

In [27]:
df.shape

(50000, 2)

In [28]:
df.head(3)

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [29]:
df['label'].unique()

array(['positive', 'negative'], dtype=object)

In [None]:
df['label'].unique()

In [30]:
df.isna().sum()

text     0
label    0
dtype: int64

### Split the dataset into training and testing sets

In [31]:
train_data, test_data, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42)

In [32]:
print(train_data)

38094    As much as I love trains, I couldn't stomach t...
40624    This was a very good PPV, but like Wrestlemani...
49425    Not finding the right words is everybody's pro...
35734    I'm really suprised this movie didn't get a hi...
41708    I'll start by confessing that I tend to really...
                               ...                        
11284    `Shadow Magic' recaptures the joy and amazemen...
44732    I found this movie to be quite enjoyable and f...
38158    Avoid this one! It is a terrible movie. So wha...
860      This production was quite a surprise for me. I...
15795    This is a decent movie. Although little bit sh...
Name: text, Length: 35000, dtype: object


In [33]:
print(train_labels)

38094    negative
40624    positive
49425    negative
35734    positive
41708    negative
           ...   
11284    positive
44732    positive
38158    negative
860      positive
15795    positive
Name: label, Length: 35000, dtype: object


### Create a pipeline

In [34]:
sentiment_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

### Train the model using the pipeline

In [35]:
sentiment_pipeline.fit(train_data, train_labels)


### Make predictions on the test set

In [36]:
predictions = sentiment_pipeline.predict(test_data)

In [43]:
# NOTE: test_data keeps the shuffled row labels of the original DataFrame, so an
# integer key here is a label lookup (the review whose original index is 1),
# not "the second test sample". predictions is positional, so the matching
# review for predictions[i] is test_data.iloc[i].
test_data[1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [41]:
# NOTE: label-based lookup — returns the label of the row whose original
# DataFrame index is 1, not the second test sample. The true label that lines
# up with predictions[i] is test_labels.iloc[i].
test_labels[1]

'positive'

In [37]:
predictions

array(['negative', 'positive', 'negative', ..., 'negative', 'positive',
       'positive'], dtype='<U8')

### Evaluate the model

In [44]:
test_data.shape

(15000,)

In [45]:
test_labels.shape

(15000,)

In [46]:

report = classification_report(test_labels, predictions)

print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.89      0.86      7411
    positive       0.89      0.83      0.86      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



In [47]:
# Rows are the true labels and columns are the predicted labels. With no
# explicit `labels=` argument scikit-learn uses the sorted label values, so
# the order here is ['negative', 'positive'] for both axes.
cm = confusion_matrix(test_labels, predictions)
cm

array([[6620,  791],
       [1296, 6293]])

## Huggingface: Pre-trained sentiment analysis model

https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english

In [48]:

from transformers import pipeline
sentiment_analyzer = pipeline('sentiment-analysis', model ="distilbert-base-uncased-finetuned-sst-2-english") #, revision ="af0f99b")
data = ["I love you", "I hate you"]
sentiment_analyzer(data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998656511306763},
 {'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [49]:
result = sentiment_analyzer("I love using this model!")
print(result)

[{'label': 'POSITIVE', 'score': 0.9996837377548218}]


## Huggingface: Thai

### model="poom-sci/WangchanBERTa-finetuned-sentiment"

https://huggingface.co/poom-sci/WangchanBERTa-finetuned-sentiment

In [50]:
from transformers import pipeline

sentiment_analyzer = pipeline('sentiment-analysis', model="poom-sci/WangchanBERTa-finetuned-sentiment")#, revision="b78d071")

data = ["อร่อยจัดๆ", "รอนานแท้"]
sentiment_analyzer(data)


config.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/421M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/305 [00:00<?, ?B/s]

[{'label': 'pos', 'score': 0.9793058037757874},
 {'label': 'neg', 'score': 0.7835226058959961}]

In [51]:
sentiment_analyzer("ข้าวบูด")

[{'label': 'neg', 'score': 0.7488928437232971}]

## Deploy on Streamlit Sharing

https://share.streamlit.io/  or https://huggingface.co/spaces

https://docs.streamlit.io/library/api-reference

https://github.com/



In [52]:

%%writefile app_senti_65130700309.py


import streamlit as st
from transformers import pipeline

# Load the sentiment analysis model
model_name = "poom-sci/WangchanBERTa-finetuned-sentiment"
sentiment_analyzer = pipeline('sentiment-analysis', model=model_name)

# Streamlit app
st.title("Thai Sentiment Analysis App")

# Input text
text_input = st.text_area("Enter Thai text for sentiment analysis", "ขอความเห็นหน่อย... ")

# Button to trigger analysis
if st.button("Analyze Sentiment"):
    # Analyze sentiment using the model
    results = sentiment_analyzer([text_input])

    # Extract sentiment and score
    sentiment = results[0]['label']
    score = results[0]['score']


    # Display result as progress bars
    st.subheader("Sentiment Analysis Result:")

    if sentiment == 'pos':
        st.success(f"Positive Sentiment (Score: {score:.2f})")
        st.progress(score)
    elif sentiment == 'neg':
        st.error(f"Negative Sentiment (Score: {score:.2f})")
        st.progress(score)
    else:
        st.warning(f"Neutral Sentiment (Score: {score:.2f})")
        st.progress(score)


Writing app_senti_65130700309.py


In [53]:
%%writefile requirements.txt
# Runtime dependencies of app_senti_65130700309.py
streamlit
transformers
torch


Writing requirements.txt
