Spaces:
Runtime error
Runtime error
| from bertopic import BERTopic | |
| import streamlit as st | |
| import streamlit.components.v1 as components | |
| #from datasets import load_dataset | |
| import pandas as pd | |
| from datasets import load_dataset | |
| import json | |
# --- Data loading ------------------------------------------------------------
# The headlines can also be pulled from the Hugging Face Hub. That path is kept
# for reference only; the local parquet file is used because it is faster for
# the demo:
#   dataset = load_dataset("rshah/million-headlines")
#   news = pd.DataFrame.from_dict(dataset["train"])
news = pd.read_parquet("topic_10000.par")

# Parse the compact YYYYMMDD publish date into a datetime column.
news["date"] = pd.to_datetime(news["publish_date"], format="%Y%m%d")
timestamps = news["date"].to_list()
tweets = news["headline_text"].to_list()

# Load topics from the JSON file named "topics".
# NOTE(review): presumably one precomputed topic id per headline for the saved
# model -- confirm against how the file was produced.
# The encoding is pinned so decoding does not depend on the platform locale.
with open("topics", "r", encoding="utf-8") as fp:
    topics = json.load(fp)
# --- Page header and introduction --------------------------------------------
# No default is assigned to option_n here: the sidebar number input sets it
# unconditionally before it is first read.
st.set_page_config(page_title="News Topic Clustering")
st.title("News Topic Clustering")
st.caption("By Rajiv Shah")
st.caption("")

# Adjacent string literals are used instead of backslash continuations inside
# a single literal, so source indentation can never leak into the caption text.
st.caption(
    "This is a simple example of using identifying topics in the [one million ABC news headline dataset](https://huggingface.co/datasets/rshah/million-headlines). "
    "If you look at the code for this app, you will see how it uses just a few lines of [BERTopic](https://maartengr.github.io/BERTopic/index.html) to "
    "build the topics and create the visualizations"
)
st.caption(
    "The preloaded existing model provides the more interesting results. However, this app can be run live by building a new model, but "
    "is limited to a small number of rows. I also limited topics over time to the existing model."
)
# --- Sidebar settings --------------------------------------------------------
# One form collects both choices: which model to use, and which row of the
# topic table to list terms for.
form = st.sidebar.form("Main Settings")
form.header("Main Settings")

option = form.selectbox(
    label="What model would you like to run",
    options=("Load existing model", "Build new model"),
    index=0,
)

option_n = form.number_input(
    label="What topic would you like to get terms for?",
    min_value=0,
    max_value=10,
    value=5,
)

submitted = form.form_submit_button(label="Select Model")
# --- Model selection ---------------------------------------------------------
if option == "Load existing model":
    # Reuse the model saved on disk. `topics` keeps the value loaded earlier in
    # the script (alternative, disabled: topics, _ = topic_model.transform(tweets)).
    topic_model = BERTopic.load("topic_10000.model")
else:
    # Fit a fresh model on a small random sample of headlines dated strictly
    # between 2017-01-01 and 2019-01-01.
    in_window = (news["date"] > "2017-01-01") & (news["date"] < "2019-01-01")
    news_sample = news[in_window]

    if news_sample.empty:
        # Nothing to fit on; stop this run instead of failing inside BERTopic.
        st.error("No headlines found between 2017-01-01 and 2019-01-01.")
        st.stop()

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so clamp the sample size to the window.
    sample_size = min(200, len(news_sample))
    news_sample = news_sample.sample(sample_size, random_state=123)

    tweets = news_sample.headline_text.to_list()
    topic_model = BERTopic(min_topic_size=5, verbose=True)
    topics, _ = topic_model.fit_transform(tweets)
# --- Topic overview and top words --------------------------------------------
# Drop the -1 row by value rather than by position, so a real topic is never
# discarded when the -1 row is absent or is not the first row.
topic_info = topic_model.get_topic_info()
freq = topic_info[topic_info["Topic"] != -1]

st.header("The Main Topic Clusters")
st.write(freq)

st.caption("")
st.write('Top words in topic cluster: ', option_n)

# option_n is a row position in the table above, not a topic id. A model with
# few topics can have fewer rows than the selected position, so check the
# bound instead of letting iloc raise IndexError.
if 0 <= option_n < len(freq):
    topic_nr = freq.iloc[option_n]["Topic"]
    # Each entry's first element is the term, and only that is displayed.
    # `or []` covers a falsy result from get_topic.
    topic_terms = topic_model.get_topic(topic_nr) or []
    for entry in topic_terms:
        st.write(str(entry[0]))
else:
    st.warning(
        f"Only {len(freq)} topic clusters are available; "
        "choose a smaller topic number."
    )
# --- Cluster relationships ---------------------------------------------------
# Render the hierarchy figure produced by the topic model.
st.header("Relationships between clusters ")
hierarchy_fig = topic_model.visualize_hierarchy()
st.plotly_chart(hierarchy_fig)
# --- Topics over time --------------------------------------------------------
# Only shown when the preloaded model was selected.
if option == "Load existing model":
    st.header("Topics over time for Existing Model")

    topics_over_time = topic_model.topics_over_time(
        docs=tweets,
        topics=topics,
        timestamps=timestamps,
        global_tuning=True,
        evolution_tuning=True,
        nr_bins=20,
    )

    over_time_fig = topic_model.visualize_topics_over_time(
        topics_over_time,
        top_n_topics=20,
    )
    st.plotly_chart(over_time_fig)