Gopi9177 committed on
Commit
71169a9
·
verified ·
1 Parent(s): 7dba2e2

Delete pages/Techniques of NLP.py

Browse files
Files changed (1) hide show
  1. pages/Techniques of NLP.py +0 -142
pages/Techniques of NLP.py DELETED
@@ -1,142 +0,0 @@
1
- import streamlit as st
2
- import string
3
- import numpy as np
4
- import pandas as pd
5
- import nltk
6
- from nltk.corpus import stopwords
7
- from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
8
- from sklearn.naive_bayes import MultinomialNB
9
- from sklearn.linear_model import LogisticRegression
10
- from sklearn.svm import SVC
11
- from sklearn.ensemble import RandomForestClassifier
12
- from sklearn.datasets import fetch_20newsgroups
13
- from sklearn.model_selection import train_test_split
14
- from sklearn.metrics import accuracy_score, classification_report
15
- from sklearn.decomposition import LatentDirichletAllocation, NMF
16
- from wordcloud import WordCloud
17
- import matplotlib.pyplot as plt
18
-
19
- # Download NLTK stopwords
20
- nltk.download('stopwords')
21
- stop_words = set(stopwords.words('english'))
22
-
23
- # Page title with emoji
24
- st.title('🌟 Traditional NLP Techniques 🌟')
25
-
26
- # Text Preprocessing
27
- st.header('🔹 1. Text Preprocessing')
28
-
29
- st.subheader('📖 Definition:')
30
- st.write("""
31
- Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
32
- This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
33
- and handling special characters like emojis.
34
- """)
35
-
36
- # Interactive example for preprocessing
37
- text_input = st.text_area("✍️ Enter text to preprocess", "I love NLP! 😍 This is amazing.")
38
-
39
- col1, col2, col3, col4 = st.columns(4)
40
-
41
- with col1:
42
- if st.button('✂️ Remove Punctuation'):
43
- processed_text = ''.join([char for char in text_input if char not in string.punctuation])
44
- st.success(f"Text without punctuation: {processed_text}")
45
-
46
- with col2:
47
- if st.button('🔡 Convert to Lowercase'):
48
- lowercase_text = text_input.lower()
49
- st.success(f"Text in lowercase: {lowercase_text}")
50
-
51
- with col3:
52
- if st.button('😊 Remove Emojis'):
53
- processed_text_no_emoji = ''.join(char for char in text_input if char.isalnum() or char.isspace())
54
- st.success(f"Text without emojis: {processed_text_no_emoji}")
55
-
56
- with col4:
57
- if st.button('🚫 Remove Stopwords'):
58
- words = text_input.split()
59
- filtered_text = ' '.join([word for word in words if word.lower() not in stop_words])
60
- st.success(f"Text without stopwords: {filtered_text}")
61
-
62
- # Text Vectorization
63
- st.header('📊 2. Text Vectorization')
64
-
65
- st.subheader('📖 Definition:')
66
- st.write("""
67
- Text vectorization converts text into numerical form so that machine learning models can process it.
68
- Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
69
- """)
70
-
71
- # Interactive example for vectorization
72
- vectorization_choice = st.selectbox('🛠 Choose vectorization technique:', ('Bag of Words', 'TF-IDF'))
73
-
74
- # Text for vectorization
75
- sample_text = ["I love programming.", "NLP is fun.", "Streamlit makes things easy!"]
76
-
77
- if st.button('🚀 Apply Vectorization'):
78
- vectorizer = CountVectorizer() if vectorization_choice == 'Bag of Words' else TfidfVectorizer()
79
- X = vectorizer.fit_transform(sample_text)
80
- st.write(f"**Vectorized Representation:**\n{X.toarray()}")
81
- st.write(f"**Feature names:** {vectorizer.get_feature_names_out()}")
82
-
83
- # Basic Machine Learning
84
- st.header('🤖 3. Basic Machine Learning')
85
-
86
- st.subheader('📖 Definition:')
87
- st.write("""
88
- Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM),
89
- are commonly used for text classification tasks.
90
- """)
91
-
92
- # Load dataset
93
- newsgroups = fetch_20newsgroups(subset='train')
94
- X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.3)
95
-
96
- model_choice = st.selectbox('🤔 Choose machine learning model for text classification:',
97
- ('Naive Bayes', 'Logistic Regression', 'SVM', 'Random Forest'))
98
-
99
- # Vectorization for classification
100
- vectorizer = TfidfVectorizer()
101
- X_train_vec = vectorizer.fit_transform(X_train)
102
- X_test_vec = vectorizer.transform(X_test)
103
-
104
- if st.button('🎯 Train Model'):
105
- model = {'Naive Bayes': MultinomialNB(), 'Logistic Regression': LogisticRegression(max_iter=1000),
106
- 'SVM': SVC(), 'Random Forest': RandomForestClassifier()}[model_choice]
107
-
108
- model.fit(X_train_vec, y_train)
109
- y_pred = model.predict(X_test_vec)
110
-
111
- accuracy = accuracy_score(y_test, y_pred)
112
- st.success(f"🏆 Model Accuracy: {accuracy * 100:.2f}%")
113
- st.text("📊 Classification Report:")
114
- st.text(classification_report(y_test, y_pred))
115
-
116
- # Topic Modeling
117
- st.header('📚 4. Topic Modeling')
118
-
119
- st.subheader('📖 Definition:')
120
- st.write("""
121
- Topic modeling is a technique used to identify the underlying topics in a collection of text data.
122
- Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
123
- """)
124
-
125
- topic_model_choice = st.selectbox('🔍 Choose topic modeling technique:', ('LDA', 'NMF'))
126
-
127
- if st.button('📌 Run Topic Modeling'):
128
- vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
129
- X = vectorizer.fit_transform(newsgroups.data)
130
-
131
- model = LatentDirichletAllocation(n_components=5, random_state=42) if topic_model_choice == 'LDA' else NMF(n_components=5, random_state=42)
132
- model.fit(X)
133
- feature_names = vectorizer.get_feature_names_out()
134
-
135
- for topic_idx, topic in enumerate(model.components_):
136
- st.write(f"📝 **Topic {topic_idx + 1}:**")
137
- top_words_idx = topic.argsort()[:-10 - 1:-1]
138
- top_words = [feature_names[i] for i in top_words_idx]
139
- st.success(", ".join(top_words))
140
-
141
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(top_words))
142
- st.image(wordcloud.to_array(), caption=f"🌥 Word Cloud for Topic {topic_idx + 1}")