import streamlit as st
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Download NLTK stopwords (no-op if already present)
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Strip non-letters, lowercase, and remove English stopwords."""
    if not isinstance(text, str):  # guard against NaN / non-string rows
        return ''
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.lower().split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


def perform_lda(text_data, num_topics=5):
    """Fit an LDA model on a bag-of-words document-term matrix."""
    vectorizer = CountVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(text_data)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(dtm)
    return lda, vectorizer, dtm


def plot_wordcloud(term_dict):
    """Render a word cloud from a {term: frequency} mapping."""
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate_from_frequencies(term_dict)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)


def plot_topic_proportions(proportions, num_topics):
    """Bar chart of the average weight of each topic across all documents."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(range(num_topics), proportions, color='skyblue')
    ax.set_title("Proportions of Different Topics in Text Data")
    ax.set_xlabel("Topic")
    ax.set_ylabel("Proportion")
    ax.set_xticks(range(num_topics))
    ax.set_xticklabels([f"Topic {i + 1}" for i in range(num_topics)])
    st.pyplot(fig)


def get_topics(lda, vectorizer, num_words=10):
    """Return one formatted string per topic listing its top terms."""
    terms = vectorizer.get_feature_names_out()
    topics = []
    for index, topic in enumerate(lda.components_):
        top_term_idx = topic.argsort()[-num_words:][::-1]
        top_terms = [terms[i] for i in top_term_idx]
        topics.append(f"Topic #{index + 1}: {', '.join(top_terms)}")
    return topics


def display_document_topic_distribution(topic_proportions):
    """Per-document topic weights plus the dominant topic of each document."""
    topic_df = pd.DataFrame(
        topic_proportions,
        columns=[f"Topic {i + 1}" for i in range(topic_proportions.shape[1])],
    )
    topic_df['Dominant Topic'] = topic_df.idxmax(axis=1)
    return topic_df


# Streamlit UI
st.title("Text Analysis and Topic Modeling")
st.write("Upload a CSV file containing a column with text data.")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
if uploaded_file:
    data = pd.read_csv(uploaded_file)
    text_column = st.selectbox("Select the text column", data.columns)
    data['text_clean'] = data[text_column].apply(preprocess_text)
    st.write("Sample Processed Text:")
    st.write(data[['text_clean']].head())

    # Extract the 50 most frequent terms for the word cloud
    vectorizer = CountVectorizer(max_features=50, stop_words='english')
    X = vectorizer.fit_transform(data['text_clean'])
    terms = vectorizer.get_feature_names_out()
    term_frequencies = X.sum(axis=0).A1  # flatten the 1 x n_terms count matrix
    term_dict = dict(zip(terms, term_frequencies))
    st.subheader("Word Cloud of Key Terms")
    plot_wordcloud(term_dict)

    # Fit LDA with a user-selected number of topics
    num_topics = st.slider("Select number of topics", min_value=2, max_value=10, value=5)
    lda, vectorizer_lda, dtm = perform_lda(data['text_clean'], num_topics)

    # Display the top terms of each topic
    st.subheader("Identified Topics")
    for topic in get_topics(lda, vectorizer_lda):
        st.write(topic)

    # Average topic proportions across all documents
    topic_proportions = lda.transform(dtm)
    avg_topic_proportions = topic_proportions.mean(axis=0)
    st.subheader("Topic Proportions")
    plot_topic_proportions(avg_topic_proportions, num_topics)

    # Per-document topic distribution
    st.subheader("Document Topic Distribution")
    document_topic_distribution = display_document_topic_distribution(topic_proportions)
    st.write(document_topic_distribution.head())
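
# A minimal sketch of how to run the app, assuming the script is saved as
# app.py (the file name is a placeholder) and the packages below are installed:
#
#   pip install streamlit pandas nltk scikit-learn matplotlib wordcloud
#   streamlit run app.py
#
# Streamlit re-executes the whole script on every widget interaction, so
# changing the topic slider re-fits the LDA model and redraws each chart.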