Spaces:

Gopi9177
/

NLP

Running

App Files Files Community

Gopi9177 committed on Feb 10

Commit

7dba2e2

verified ·

1 Parent(s): 7a45015

Create Tech.py

Browse files

Files changed (1) hide show

pages/Tech.py +152 -0

pages/Tech.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import re
+import string
+
+import matplotlib.pyplot as plt
+import nltk
+import numpy as np
+import pandas as pd
+import streamlit as st
+from nltk.corpus import stopwords
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.decomposition import LatentDirichletAllocation, NMF
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.svm import SVC
+from wordcloud import WordCloud
+# Download NLTK stopwords
+nltk.download('stopwords')
+stop_words = set(stopwords.words('english'))
+# Page title with emoji
+st.markdown("""
+    <h1 style='text-align: center; color: #FF5733;'>🌟  Techniques of NLP 🌟</h1>
+""", unsafe_allow_html=True)
+# Text Preprocessing
+st.markdown("""
+    <h2 style='color: #2E86C1;'>🔹 1. Text Preprocessing</h2>
+""", unsafe_allow_html=True)
+st.subheader('📖 Definition:')
+st.write("""
+Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
+This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
+and handling special characters like emojis.
+""")
+# Interactive example for preprocessing
+text_input = st.text_area("✍️ Enter text to preprocess", "I love NLP! 😍 This is amazing.")
+col1, col2, col3, col4 = st.columns(4)
+with col1:
+    if st.button('✂️ Remove Punctuation'):
+        processed_text = ''.join([char for char in text_input if char not in string.punctuation])
+        st.success(f"Text without punctuation: {processed_text}")
+with col2:
+    if st.button('🔡 Convert to Lowercase'):
+        lowercase_text = text_input.lower()
+        st.success(f"Text in lowercase: {lowercase_text}")
+with col3:
+    if st.button('😊 Remove Emojis'):
+        processed_text_no_emoji = ''.join(char for char in text_input if char.isalnum() or char.isspace())
+        st.success(f"Text without emojis: {processed_text_no_emoji}")
+with col4:
+    if st.button('🚫 Remove Stopwords'):
+        words = text_input.split()
+        filtered_text = ' '.join([word for word in words if word.lower() not in stop_words])
+        st.success(f"Text without stopwords: {filtered_text}")
+# Text Vectorization
+st.markdown("""
+    <h2 style='color: #2E86C1;'>📊 2. Text Vectorization</h2>
+""", unsafe_allow_html=True)
+st.subheader('📖 Definition:')
+st.write("""
+Text vectorization converts text into numerical form so that machine learning models can process it.
+Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
+""")
+# Interactive example for vectorization
+vectorization_choice = st.selectbox('🛠 Choose vectorization technique:', ('Bag of Words', 'TF-IDF'))
+# New example for vectorization
+sample_text = ["Artificial intelligence is transforming the world.", "Natural Language Processing is a subset of AI.", "Machine learning algorithms improve over time!"]
+if st.button('🚀 Apply Vectorization'):
+    vectorizer = CountVectorizer() if vectorization_choice == 'Bag of Words' else TfidfVectorizer()
+    X = vectorizer.fit_transform(sample_text)
+    st.write(f"**Vectorized Representation:**\n{X.toarray()}")
+    st.write(f"**Feature names:** {vectorizer.get_feature_names_out()}")
+# Basic Machine Learning
+st.markdown("""
+    <h2 style='color: #2E86C1;'>🤖 3. Basic Machine Learning</h2>
+""", unsafe_allow_html=True)
+st.subheader('📖 Definition:')
+st.write("""
+Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM),
+are commonly used for text classification tasks.
+""")
+# Load dataset
+newsgroups = fetch_20newsgroups(subset='train')
+X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.3)
+model_choice = st.selectbox('🤔 Choose machine learning model for text classification:',
+                            ('Naive Bayes', 'Logistic Regression', 'SVM', 'Random Forest'))
+# Vectorization for classification
+vectorizer = TfidfVectorizer()
+X_train_vec = vectorizer.fit_transform(X_train)
+X_test_vec = vectorizer.transform(X_test)
+if st.button('🎯 Train Model'):
+    model = {'Naive Bayes': MultinomialNB(), 'Logistic Regression': LogisticRegression(max_iter=1000),
+             'SVM': SVC(), 'Random Forest': RandomForestClassifier()}[model_choice]
+    model.fit(X_train_vec, y_train)
+    y_pred = model.predict(X_test_vec)
+    accuracy = accuracy_score(y_test, y_pred)
+    st.success(f"🏆 Model Accuracy: {accuracy * 100:.2f}%")
+    st.text("📊 Classification Report:")
+    st.text(classification_report(y_test, y_pred))
+# Topic Modeling
+st.markdown("""
+    <h2 style='color: #2E86C1;'>📚 4. Topic Modeling</h2>
+""", unsafe_allow_html=True)
+st.subheader('📖 Definition:')
+st.write("""
+Topic modeling is a technique used to identify the underlying topics in a collection of text data.
+Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
+""")
+topic_model_choice = st.selectbox('🔍 Choose topic modeling technique:', ('LDA', 'NMF'))
+if st.button('📌 Run Topic Modeling'):
+    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
+    X = vectorizer.fit_transform(newsgroups.data)
+    model = LatentDirichletAllocation(n_components=5, random_state=42) if topic_model_choice == 'LDA' else NMF(n_components=5, random_state=42)
+    model.fit(X)
+    feature_names = vectorizer.get_feature_names_out()
+    for topic_idx, topic in enumerate(model.components_):
+        st.write(f"📝 **Topic {topic_idx + 1}:**")
+        top_words_idx = topic.argsort()[:-10 - 1:-1]
+        top_words = [feature_names[i] for i in top_words_idx]
+        st.success(", ".join(top_words))
+    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(top_words))
+    st.image(wordcloud.to_array(), caption=f"🌥 Word Cloud for Topic {topic_idx + 1}")