# GENAI / app.py
# Author: Manikanta3776
# Update app.py
# commit 6e8f746 (verified)
import streamlit as st
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
# Ensure the NLTK stopword corpus is available.  quiet=True suppresses the
# download progress output, which would otherwise be printed on every rerun
# (Streamlit re-executes the whole script on each user interaction).
nltk.download('stopwords', quiet=True)
# English stopwords as a set: O(1) membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Lowercase *text*, strip non-letter characters, and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw document text.  Non-string values (e.g. NaN cells produced by
        ``DataFrame.apply`` on a column with missing data) are treated as
        empty documents.

    Returns
    -------
    str
        Space-joined remaining tokens.
    """
    # Robustness: NaN/None cells from pandas arrive as floats/None here.
    if not isinstance(text, str):
        return ''
    # BUG FIX: the original called re.sub(pattern, repl, text, re.I), which
    # passes re.I (== 2) as the positional *count* argument, so at most two
    # characters were ever removed.  No flag is needed at all -- the character
    # class already covers both upper- and lower-case letters.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = text.split()
    # Drop English stopwords (module-level set built from the NLTK corpus).
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)
def perform_lda(text_data, num_topics=5):
    """Fit an LDA topic model on a collection of documents.

    Parameters
    ----------
    text_data : iterable of str
        Pre-processed documents.
    num_topics : int, optional
        Number of latent topics to extract (default 5).

    Returns
    -------
    tuple
        ``(model, vectorizer, doc_term_matrix)`` -- the fitted
        LatentDirichletAllocation, the fitted CountVectorizer, and the
        document-term count matrix.
    """
    vectorizer = CountVectorizer(stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(text_data)
    # random_state fixed so topic assignments are reproducible across reruns.
    model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    model.fit(doc_term_matrix)
    return model, vectorizer, doc_term_matrix
def plot_wordcloud(term_dict):
    """Render a word cloud from a term->frequency mapping in Streamlit.

    Parameters
    ----------
    term_dict : dict
        Mapping of term to its frequency/weight.
    """
    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud = cloud.generate_from_frequencies(term_dict)
    figure, axis = plt.subplots(figsize=(10, 5))
    axis.imshow(cloud, interpolation='bilinear')
    axis.axis('off')
    st.pyplot(figure)
def plot_topic_proportions(proportions, num_topics):
    """Show a bar chart of average topic proportions in Streamlit.

    Parameters
    ----------
    proportions : sequence of float
        One average proportion per topic.
    num_topics : int
        Number of topics (length of *proportions*).
    """
    figure, axis = plt.subplots(figsize=(10, 6))
    positions = list(range(num_topics))
    axis.bar(positions, proportions, color='skyblue')
    axis.set_title("Proportions of Different Topics in Text Data")
    axis.set_xlabel("Topic")
    axis.set_ylabel("Proportion")
    # Label ticks 1-based so they match the "Topic #N" listing.
    axis.set_xticks(positions)
    axis.set_xticklabels([f"Topic {i+1}" for i in range(num_topics)])
    st.pyplot(figure)
def print_topics(lda, vectorizer, num_words=10):
    """Format each LDA topic as a line listing its highest-weighted terms.

    Parameters
    ----------
    lda : fitted LatentDirichletAllocation
        Provides ``components_`` (topic-term weight matrix).
    vectorizer : fitted CountVectorizer
        Provides the vocabulary via ``get_feature_names_out``.
    num_words : int, optional
        How many top terms to show per topic (default 10).

    Returns
    -------
    list of str
        One ``"Topic #N: t1, t2, ..."`` line per topic.
    """
    vocabulary = vectorizer.get_feature_names_out()
    lines = []
    for topic_number, weights in enumerate(lda.components_, start=1):
        # Indices of the num_words largest weights, in descending order.
        ranked = weights.argsort()[::-1][:num_words]
        words = ', '.join(vocabulary[i] for i in ranked)
        lines.append(f"Topic #{topic_number}: {words}")
    return lines
def display_document_topic_distribution(topic_proportions):
    """Build a per-document topic table with each document's dominant topic.

    Parameters
    ----------
    topic_proportions : 2-D array, shape (n_documents, n_topics)
        Topic mixture per document (e.g. output of ``lda.transform``).

    Returns
    -------
    pandas.DataFrame
        One "Topic N" column per topic plus a "Dominant Topic" column naming
        the highest-proportion topic for each row.
    """
    n_topics = topic_proportions.shape[1]
    labels = [f"Topic {i+1}" for i in range(n_topics)]
    table = pd.DataFrame(topic_proportions, columns=labels)
    # idxmax over the topic columns yields the column label of the maximum.
    table['Dominant Topic'] = table.idxmax(axis=1)
    return table
# ---------------------------------------------------------------------------
# Streamlit UI: upload a CSV, clean a text column, show key terms and topics.
# ---------------------------------------------------------------------------
st.title("Text Analysis and Topic Modeling")
st.write("Upload a CSV file containing a column with text data.")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    selected_column = st.selectbox("Select the text column", df.columns)

    # Clean the chosen column and show a preview of the result.
    df['text_clean'] = df[selected_column].apply(preprocess_text)
    st.write("Sample Processed Text:")
    st.write(df[['text_clean']].head())

    # Key-term extraction: the 50 most frequent terms feed the word cloud.
    term_vectorizer = CountVectorizer(max_features=50, stop_words='english')
    counts = term_vectorizer.fit_transform(df['text_clean'])
    frequencies = counts.sum(axis=0).A1
    freq_by_term = dict(zip(term_vectorizer.get_feature_names_out(), frequencies))
    st.subheader("Word Cloud of Key Terms")
    plot_wordcloud(freq_by_term)

    # Topic modelling with a user-selected number of topics.
    num_topics = st.slider("Select number of topics", min_value=2, max_value=10, value=5)
    lda, vectorizer_lda, dtm = perform_lda(df['text_clean'], num_topics)

    # List the top terms of each discovered topic.
    st.subheader("Identified Topics")
    for topic_line in print_topics(lda, vectorizer_lda):
        st.write(topic_line)

    # Average topic weight across all documents, as a bar chart.
    topic_proportions = lda.transform(dtm)
    st.subheader("Topic Proportions")
    plot_topic_proportions(topic_proportions.mean(axis=0), lda.components_.shape[0])

    # Per-document topic mixture with the dominant topic flagged.
    st.subheader("Document Topic Distribution")
    st.write(display_document_topic_distribution(topic_proportions).head())