Create Tech.py
pages/Tech.py  ADDED  (+152 -0)

@@ -0,0 +1,152 @@
import streamlit as st
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation, NMF
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Page title
st.markdown("""
    <h1 style='text-align: center; color: #FF5733;'>Techniques of NLP</h1>
""", unsafe_allow_html=True)

# Text Preprocessing
st.markdown("""
    <h2 style='color: #2E86C1;'>1. Text Preprocessing</h2>
""", unsafe_allow_html=True)

st.subheader('Definition:')
st.write("""
Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
and handling special characters like emojis.
""")

# Interactive example for preprocessing
text_input = st.text_area("Enter text to preprocess", "I love NLP! This is amazing.")

col1, col2, col3, col4 = st.columns(4)

with col1:
    if st.button('Remove Punctuation'):
        processed_text = ''.join([char for char in text_input if char not in string.punctuation])
        st.success(f"Text without punctuation: {processed_text}")

with col2:
    if st.button('Convert to Lowercase'):
        lowercase_text = text_input.lower()
        st.success(f"Text in lowercase: {lowercase_text}")

with col3:
    if st.button('Remove Emojis'):
        processed_text_no_emoji = ''.join(char for char in text_input if char.isalnum() or char.isspace())
        st.success(f"Text without emojis: {processed_text_no_emoji}")

with col4:
    if st.button('Remove Stopwords'):
        words = text_input.split()
        filtered_text = ' '.join([word for word in words if word.lower() not in stop_words])
        st.success(f"Text without stopwords: {filtered_text}")

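# As an aside (our addition, not wired into the UI above): the individual cleaning steps can be
# combined into one helper. A minimal sketch; the name `preprocess_text` is illustrative only.
def preprocess_text(text: str) -> str:
    """Lowercase, drop punctuation and non-alphanumeric symbols, and remove stopwords."""
    text = text.lower()
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return ' '.join(word for word in text.split() if word not in stop_words)

# Example: preprocess_text("I love NLP! This is amazing.") returns "love nlp amazing"
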
# Text Vectorization
st.markdown("""
    <h2 style='color: #2E86C1;'>2. Text Vectorization</h2>
""", unsafe_allow_html=True)

st.subheader('Definition:')
st.write("""
Text vectorization converts text into numerical form so that machine learning models can process it.
Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
""")

# Interactive example for vectorization
vectorization_choice = st.selectbox('Choose vectorization technique:', ('Bag of Words', 'TF-IDF'))

# Example corpus for vectorization
sample_text = ["Artificial intelligence is transforming the world.",
               "Natural Language Processing is a subset of AI.",
               "Machine learning algorithms improve over time!"]

if st.button('Apply Vectorization'):
    vectorizer = CountVectorizer() if vectorization_choice == 'Bag of Words' else TfidfVectorizer()
    X = vectorizer.fit_transform(sample_text)
    st.write(f"**Vectorized Representation:**\n{X.toarray()}")
    st.write(f"**Feature names:** {vectorizer.get_feature_names_out()}")

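# Illustrative helper (our addition, not called by the page): return the vectorized corpus as a
# labelled DataFrame so BoW counts and TF-IDF weights can be compared side by side. A rough
# sketch assuming the `sample_text` list above; the name `vectorize_to_dataframe` is ours.
def vectorize_to_dataframe(texts, use_tfidf=False):
    vec = TfidfVectorizer() if use_tfidf else CountVectorizer()
    matrix = vec.fit_transform(texts)
    return pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())

# Example: vectorize_to_dataframe(sample_text, use_tfidf=True) gives one row per sentence
# and one column per vocabulary term.
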
            +
            # Basic Machine Learning
         | 
| 90 | 
            +
            st.markdown("""
         | 
| 91 | 
            +
                <h2 style='color: #2E86C1;'>π€ 3. Basic Machine Learning</h2>
         | 
| 92 | 
            +
            """, unsafe_allow_html=True)
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            st.subheader('π Definition:')
         | 
| 95 | 
            +
            st.write("""
         | 
| 96 | 
            +
            Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM), 
         | 
| 97 | 
            +
            are commonly used for text classification tasks.
         | 
| 98 | 
            +
            """)
         | 
| 99 | 
            +
             | 
| 100 | 
            +
            # Load dataset
         | 
| 101 | 
            +
            newsgroups = fetch_20newsgroups(subset='train')
         | 
| 102 | 
            +
            X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.3)
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            model_choice = st.selectbox('π€ Choose machine learning model for text classification:', 
         | 
| 105 | 
            +
                                        ('Naive Bayes', 'Logistic Regression', 'SVM', 'Random Forest'))
         | 
| 106 | 
            +
             | 
| 107 | 
            +
            # Vectorization for classification
         | 
| 108 | 
            +
            vectorizer = TfidfVectorizer()
         | 
| 109 | 
            +
            X_train_vec = vectorizer.fit_transform(X_train)
         | 
| 110 | 
            +
            X_test_vec = vectorizer.transform(X_test)
         | 
| 111 | 
            +
             | 
| 112 | 
            +
            if st.button('π― Train Model'):
         | 
| 113 | 
            +
                model = {'Naive Bayes': MultinomialNB(), 'Logistic Regression': LogisticRegression(max_iter=1000), 
         | 
| 114 | 
            +
                         'SVM': SVC(), 'Random Forest': RandomForestClassifier()}[model_choice]
         | 
| 115 | 
            +
                
         | 
| 116 | 
            +
                model.fit(X_train_vec, y_train)
         | 
| 117 | 
            +
                y_pred = model.predict(X_test_vec)
         | 
| 118 | 
            +
                
         | 
| 119 | 
            +
                accuracy = accuracy_score(y_test, y_pred)
         | 
| 120 | 
            +
                st.success(f"π Model Accuracy: {accuracy * 100:.2f}%")
         | 
| 121 | 
            +
                st.text("π Classification Report:")
         | 
| 122 | 
            +
                st.text(classification_report(y_test, y_pred))
         | 
| 123 | 
            +
             | 
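# Note (our suggestion): fetching and vectorizing 20 Newsgroups above re-runs on every Streamlit
# rerun. If that becomes slow, the work could be wrapped in a cached helper -- a rough sketch,
# assuming a Streamlit version that provides st.cache_data; the helper name is ours:
#
# @st.cache_data
# def load_vectorized_newsgroups():
#     data = fetch_20newsgroups(subset='train')
#     X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, test_size=0.3)
#     vec = TfidfVectorizer()
#     return vec.fit_transform(X_tr), vec.transform(X_te), y_tr, y_te
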
# Topic Modeling
st.markdown("""
    <h2 style='color: #2E86C1;'>4. Topic Modeling</h2>
""", unsafe_allow_html=True)

st.subheader('Definition:')
st.write("""
Topic modeling is a technique used to identify the underlying topics in a collection of text data.
Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
""")

topic_model_choice = st.selectbox('Choose topic modeling technique:', ('LDA', 'NMF'))

if st.button('Run Topic Modeling'):
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
    X = vectorizer.fit_transform(newsgroups.data)

    model = LatentDirichletAllocation(n_components=5, random_state=42) if topic_model_choice == 'LDA' else NMF(n_components=5, random_state=42)
    model.fit(X)
    feature_names = vectorizer.get_feature_names_out()

    for topic_idx, topic in enumerate(model.components_):
        st.write(f"**Topic {topic_idx + 1}:**")
        top_words_idx = topic.argsort()[:-10 - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        st.success(", ".join(top_words))

        # Generate a word cloud from this topic's top words
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(top_words))
        st.image(wordcloud.to_array(), caption=f"Word Cloud for Topic {topic_idx + 1}")
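
# Note: LDA is usually fit on raw term counts rather than TF-IDF weights (NMF pairs well with
# TF-IDF). If desired, the LDA branch could use a CountVectorizer instead -- a sketch of the
# alternative, our suggestion rather than part of the original page:
#
# count_vec = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
# X_counts = count_vec.fit_transform(newsgroups.data)
# lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(X_counts)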