Create Statistical NLP.py
pages/Statistical NLP.py
ADDED
@@ -0,0 +1,111 @@
import streamlit as st
import numpy as np
import nltk
from nltk import pos_tag, word_tokenize
from sklearn_crfsuite import CRF
from collections import Counter

# Download the NLTK resources used by word_tokenize and pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Page title with emoji
st.title('📊 Statistical NLP 📊')

# Language Models: n-grams and smoothing techniques
st.markdown('<h2 style="color:#4CAF50">1️⃣ Language Models</h2>', unsafe_allow_html=True)

st.markdown('### 📘 Definition:')
st.write("""
Language models are statistical models that assign probabilities to sequences of words.
They help us predict the next word in a sentence or assess how likely a sentence is.
- **n-grams**: An n-gram language model uses sequences of 'n' consecutive words to predict the next word.
- **Smoothing Techniques**: Smoothing techniques like Kneser-Ney handle n-grams that never appear in the training data, preventing zero probabilities (a simpler add-one smoothing sketch follows the n-gram example below).
""")

# Interactive example for n-grams
st.markdown('### 🎯 n-gram Model Example:')
ngram_input = st.text_area("✍️ Enter a sentence to see n-grams", "I love programming in Python")

n = st.slider("🔢 Choose n for n-grams:", 1, 4, 2)

if st.button('✨ Generate n-grams'):
    tokens = word_tokenize(ngram_input.lower())
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    st.success(f"{n}-grams in the input sentence: {ngrams}")
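
# Hedged sketch: the definition above mentions Kneser-Ney smoothing, but no smoothing
# code is shown. The block below adds a minimal add-one (Laplace) smoothed bigram
# estimate instead, purely as an illustration of giving unseen n-grams non-zero
# probability; the widget labels and variable names are assumptions, not part of the
# original app.
st.markdown('### 🧮 Add-One Smoothed Bigram (illustrative sketch):')
smooth_corpus = st.text_area("✍️ Tiny corpus for the smoothing example", "I love Python . I love NLP .")
bigram_query = st.text_input("✍️ Bigram to score (two words separated by a space)", "love NLP")

if st.button('🧮 Estimate smoothed probability'):
    corpus_tokens = word_tokenize(smooth_corpus.lower())
    unigram_counts = Counter(corpus_tokens)
    bigram_counts = Counter(zip(corpus_tokens, corpus_tokens[1:]))
    vocab_size = len(set(corpus_tokens))
    parts = bigram_query.lower().split()
    if len(parts) < 2:
        st.error("Please enter two words.")
    else:
        w1, w2 = parts[0], parts[1]
        # Add-one smoothing: P(w2 | w1) = (count(w1 w2) + 1) / (count(w1) + |V|)
        prob = (bigram_counts[(w1, w2)] + 1) / (unigram_counts[w1] + vocab_size)
        st.success(f"P({w2} | {w1}) ≈ {prob:.4f} with add-one smoothing")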

# Perplexity
st.markdown('<h2 style="color:#FF5733">2️⃣ Perplexity</h2>', unsafe_allow_html=True)

st.markdown('### 📘 Definition:')
st.write("""
Perplexity measures how well a probability model predicts a sample.
It is often used to evaluate language models: the lower the perplexity, the better the model predicts the next word.
It is calculated as the inverse probability of the test set, normalized by the number of words.
""")

# Interactive example for calculating perplexity
st.markdown('### 📉 Perplexity Calculation Example:')
sample_sentence = st.text_area("✍️ Enter a sentence to calculate perplexity", "This is an example sentence")

if st.button('📉 Calculate Perplexity'):
    # Rough demonstration: estimate unigram probabilities from the sentence itself,
    # then compute PP = exp(-(1/N) * sum_i log p(w_i))
    words = sample_sentence.split()
    word_count = len(words)
    word_freq = Counter(words)
    perplexity = np.exp(sum(-np.log(word_freq[word] / word_count) for word in words) / word_count)
    st.success(f"Perplexity of the sentence: {perplexity:.2f}")
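
# Hedged sketch: the button above estimates probabilities from the sentence itself, which
# is only a rough demonstration. The lines below show the same formula,
# PP(W) = P(w_1 ... w_N)^(-1/N), on hand-written unigram probabilities so the arithmetic
# is easy to follow; the numbers are made up for illustration only.
toy_probs = {"this": 0.2, "is": 0.2, "an": 0.1, "example": 0.05, "sentence": 0.05}
toy_sentence = ["this", "is", "an", "example", "sentence"]
toy_log_prob = sum(np.log(toy_probs[w]) for w in toy_sentence)   # log P(w_1 ... w_N)
toy_perplexity = np.exp(-toy_log_prob / len(toy_sentence))       # P(...) ** (-1/N)
st.caption(f"Worked example with a hand-written unigram model: PP ≈ {toy_perplexity:.2f}")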

# Hidden Markov Models (HMM): Part-of-Speech (POS) tagging
st.markdown('<h2 style="color:#3E7FCB">3️⃣ Hidden Markov Models (HMM)</h2>', unsafe_allow_html=True)

st.markdown('### 📘 Definition:')
st.write("""
Hidden Markov Models (HMMs) are statistical models that represent sequences of observations with underlying hidden states.
In NLP, they are commonly used for tasks like Part-of-Speech (POS) tagging, where each word is assigned a grammatical category (e.g., noun, verb).
""")

# Interactive POS tagging example
st.markdown('### 🔖 POS Tagging with HMM:')
sentence_to_tag = st.text_area("✍️ Enter a sentence for POS tagging", "I love programming in Python")

if st.button('🔖 Tag POS'):
    # Note: nltk.pos_tag uses an averaged perceptron tagger rather than an HMM;
    # it is used here only as a convenient way to show POS tags.
    tokens = word_tokenize(sentence_to_tag)
    tagged = pos_tag(tokens)
    st.success(f"POS Tagging result: {tagged}")
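
# Hedged sketch: since pos_tag above is not an HMM, the block below shows how an actual
# HMM tagger could be trained with NLTK's HiddenMarkovModelTrainer on a small slice of
# the Penn Treebank sample. It assumes the 'treebank' corpus has been downloaded and is
# meant only as an illustration, not a tuned tagger; names like train_toy_hmm_tagger are
# our additions, not part of the original app.
from nltk.corpus import treebank
from nltk.tag import hmm

nltk.download('treebank')

@st.cache_resource
def train_toy_hmm_tagger():
    # Train a supervised HMM on the first 500 tagged sentences to keep it fast
    trainer = hmm.HiddenMarkovModelTrainer()
    return trainer.train_supervised(treebank.tagged_sents()[:500])

if st.button('🔖 Tag POS with a trained HMM (sketch)'):
    hmm_tagger = train_toy_hmm_tagger()
    st.success(f"HMM tagging result: {hmm_tagger.tag(word_tokenize(sentence_to_tag))}")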

# Conditional Random Fields (CRF): sequence labeling tasks like NER
st.markdown('<h2 style="color:#E67E22">4️⃣ Conditional Random Fields (CRF)</h2>', unsafe_allow_html=True)

st.markdown('### 📘 Definition:')
st.write("""
Conditional Random Fields (CRFs) are used for sequence labeling tasks, where each element in the sequence is assigned a label.
They are particularly useful for tasks like Named Entity Recognition (NER), where we want to identify entities such as people, organizations, or locations in text.
""")

# Sample data for Named Entity Recognition (NER) using CRF
st.markdown('### 🏷️ NER Example using CRF:')
ner_sentence = st.text_area("✍️ Enter a sentence for NER (Named Entity Recognition)", "Barack Obama was born in Hawaii.")

# Toy training data: (tokens, BIO labels)
ner_examples = [
    (["Barack", "Obama", "was", "born", "in", "Hawaii"], ["B-PER", "I-PER", "O", "O", "O", "B-LOC"]),
    (["Apple", "is", "based", "in", "California"], ["B-ORG", "O", "O", "O", "B-LOC"])
]

def token_features(tokens):
    # sklearn-crfsuite expects one feature dict per token rather than raw strings
    return [{"word": tok, "word.lower": tok.lower(), "is_title": tok.istitle()} for tok in tokens]

# Train a simple CRF model on the toy data
X_train = [token_features(tokens) for tokens, _ in ner_examples]
y_train = [labels for _, labels in ner_examples]

# Initialize and fit the CRF
crf = CRF(algorithm='lbfgs')
crf.fit(X_train, y_train)

# Simple prediction for demonstration
if st.button('🔍 Perform NER'):
    tokens = word_tokenize(ner_sentence)
    predicted_tags = crf.predict([token_features(tokens)])[0]
    entities = list(zip(tokens, predicted_tags))
    st.success(f"NER result: {entities}")
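
# Hedged sketch: a CRF defines a probability distribution over whole label sequences, so
# besides the single best sequence we can inspect per-token marginal label probabilities.
# The block below uses sklearn-crfsuite's predict_marginals on the same toy model; with
# only two training sentences the numbers are illustrative rather than meaningful, and the
# button label is our addition.
if st.button('📈 Show CRF marginal label probabilities (sketch)'):
    tokens = word_tokenize(ner_sentence)
    marginals = crf.predict_marginals([token_features(tokens)])[0]
    for tok, label_probs in zip(tokens, marginals):
        best_label = max(label_probs, key=label_probs.get)
        st.write(f"{tok}: {best_label} ({label_probs[best_label]:.2f})")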