# Text Analysis and Topic Modeling — Streamlit app
# (uploads a CSV, cleans a text column, shows a word cloud and LDA topics)
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import streamlit as st
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
# Download the NLTK English stopword list (a no-op if already cached locally).
nltk.download('stopwords')
# Set lookup is O(1); used by preprocess_text below.
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Lowercase *text*, strip non-letter characters, and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (e.g. NaN from a CSV cell)
        are treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Robustness: pandas reads missing CSV cells as float NaN.
        return ''
    # BUG FIX: the original passed re.I as the 4th *positional* argument of
    # re.sub, which is `count` (max substitutions, re.I == 2) — so only the
    # first two non-letter characters were ever removed. The character class
    # [^a-zA-Z\s] already covers both cases, so no flag is needed at all.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.lower().split()
    return ' '.join(word for word in tokens if word not in stop_words)
def perform_lda(text_data, num_topics=5):
    """Fit an LDA topic model over an iterable of documents.

    Returns a (fitted LDA model, fitted CountVectorizer, document-term
    matrix) triple; random_state is pinned for reproducible topics.
    """
    count_vec = CountVectorizer(stop_words='english')
    doc_term_matrix = count_vec.fit_transform(text_data)
    model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    model.fit(doc_term_matrix)
    return model, count_vec, doc_term_matrix
def plot_wordcloud(term_dict):
    """Render a word cloud of term→frequency weights in the Streamlit page."""
    cloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
    ).generate_from_frequencies(term_dict)
    figure, axes = plt.subplots(figsize=(10, 5))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')
    st.pyplot(figure)
def plot_topic_proportions(proportions, num_topics):
    """Bar-chart the average share of each topic across the corpus."""
    figure, axes = plt.subplots(figsize=(10, 6))
    topic_ids = range(num_topics)
    axes.bar(topic_ids, proportions, color='skyblue')
    axes.set_title("Proportions of Different Topics in Text Data")
    axes.set_xlabel("Topic")
    axes.set_ylabel("Proportion")
    # Label each bar "Topic 1" … "Topic N" instead of the raw 0-based index.
    axes.set_xticks(topic_ids)
    axes.set_xticklabels([f"Topic {idx+1}" for idx in topic_ids])
    st.pyplot(figure)
def print_topics(lda, vectorizer, num_words=10):
    """Format the top *num_words* terms of each fitted LDA topic.

    Returns a list of strings, one per topic, e.g.
    "Topic #1: apples, pears, ...", strongest term first.
    """
    vocab = vectorizer.get_feature_names_out()
    formatted = []
    for topic_num, weights in enumerate(lda.components_, start=1):
        # argsort ascending, reversed → indices of the heaviest terms first.
        best = weights.argsort()[::-1][:num_words]
        words = ', '.join(vocab[i] for i in best)
        formatted.append(f"Topic #{topic_num}: {words}")
    return formatted
def display_document_topic_distribution(topic_proportions):
    """Build a per-document topic table with a 'Dominant Topic' column.

    *topic_proportions* is an (n_docs, n_topics) array of topic weights,
    one row per document (the output of ``lda.transform``).
    """
    n_topics = topic_proportions.shape[1]
    labels = [f"Topic {i+1}" for i in range(n_topics)]
    distribution = pd.DataFrame(topic_proportions, columns=labels)
    # idxmax picks, per row, the column label carrying the largest weight.
    distribution['Dominant Topic'] = distribution.idxmax(axis=1)
    return distribution
# ---------------------------------------------------------------- Streamlit UI
st.title("Text Analysis and Topic Modeling")
st.write("Upload a CSV file containing a column with text data.")
uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
if uploaded_file is not None:  # explicit None check; UploadedFile truthiness is incidental
    data = pd.read_csv(uploaded_file)
    text_column = st.selectbox("Select the text column", data.columns)
    # Robustness: missing CSV cells arrive as NaN — normalise to strings
    # before cleaning so preprocessing never sees a float.
    data['text_clean'] = data[text_column].fillna('').astype(str).apply(preprocess_text)
    st.write("Sample Processed Text:")
    st.write(data[['text_clean']].head())

    # Extract the 50 most frequent terms for the word cloud.
    vectorizer = CountVectorizer(max_features=50, stop_words='english')
    X = vectorizer.fit_transform(data['text_clean'])
    terms = vectorizer.get_feature_names_out()
    term_frequencies = X.sum(axis=0).A1  # collapse sparse counts to a flat array
    term_dict = dict(zip(terms, term_frequencies))
    st.subheader("Word Cloud of Key Terms")
    plot_wordcloud(term_dict)

    # Fit the LDA topic model with a user-chosen topic count.
    num_topics = st.slider("Select number of topics", min_value=2, max_value=10, value=5)
    lda, vectorizer_lda, dtm = perform_lda(data['text_clean'], num_topics)

    # List the top terms of each discovered topic.
    st.subheader("Identified Topics")
    for topic in print_topics(lda, vectorizer_lda):
        st.write(topic)

    # Average topic weight over all documents.
    topic_proportions = lda.transform(dtm)
    avg_topic_proportions = topic_proportions.mean(axis=0)
    st.subheader("Topic Proportions")
    plot_topic_proportions(avg_topic_proportions, lda.components_.shape[0])

    # Per-document topic weights plus the dominant topic of each document.
    st.subheader("Document Topic Distribution")
    document_topic_distribution = display_document_topic_distribution(topic_proportions)
    st.write(document_topic_distribution.head())