Spaces:

rajistics
/

News_Topic_Clustering

Runtime error

App Files Files Community

News_Topic_Clustering / app.py

rajistics

fixed BERTopic name

b409097 over 3 years ago

raw

history blame

3.42 kB

	from bertopic import BERTopic
	import streamlit as st
	import streamlit.components.v1 as components
	#from datasets import load_dataset
	import pandas as pd
	from datasets import load_dataset
	import json

	# Headline data comes from a local Parquet extract, which is faster for the
	# demo than downloading from the Hugging Face Hub. The Hub route would be:
	#   dataset = load_dataset("rshah/million-headlines")
	#   news = pd.DataFrame.from_dict(dataset["train"])
	news = pd.read_parquet("topic_10000.par")

	# publish_date is parsed as compact YYYYMMDD values into a datetime column.
	news["date"] = pd.to_datetime(news["publish_date"], format="%Y%m%d")

	# Plain Python lists of dates and headline texts, in row order; both are
	# handed to the topic model further down.
	timestamps = news["date"].to_list()
	tweets = news["headline_text"].to_list()

	# Load the saved topics from the JSON file shipped next to the app
	# (presumably one topic id per headline — TODO confirm against the file).
	# The encoding is pinned so parsing does not depend on the platform locale.
	with open("topics", "r", encoding="utf-8") as fp:
	    topics = json.load(fp)

	# Initial topic index. The sidebar number input assigns option_n again
	# before the value is read, so this only serves as a placeholder.
	option_n = 5

	# Page title and introductory text. set_page_config is the first Streamlit
	# call in the script.
	st.set_page_config(page_title="News Topic Clustering")
	st.title("News Topic Clustering")
	st.caption("By Rajiv Shah")
	st.caption("")  # empty caption, presumably used as a vertical spacer

	# The long captions are built from adjacent string literals instead of
	# backslash continuations inside a single literal: with a continuation, the
	# leading whitespace of the following source line becomes part of the text.
	st.caption(
	    "This is a simple example of identifying topics in the "
	    "[one million ABC news headline dataset]"
	    "(https://huggingface.co/datasets/rshah/million-headlines). "
	    "If you look at the code for this app, you will see how it uses just a few lines of "
	    "[BERTopic](https://maartengr.github.io/BERTopic/index.html) to "
	    "build the topics and create the visualizations"
	)
	st.caption(
	    "The preloaded existing model provides the more interesting results. "
	    "However, this app can be run live by building a new model, but "
	    "is limited to a small number of rows. "
	    "I also limited topics over time to the existing model."
	)


	# Sidebar form holding the main settings.
	form = st.sidebar.form("Main Settings")
	form.header("Main Settings")

	# Model source; the first choice is preselected.
	model_choices = ("Load existing model", "Build new model")
	option = form.selectbox(
	    "What model would you like to run",
	    model_choices,
	    index=0,
	)

	# Row position of the topic whose terms are listed (0 to 10, default 5).
	option_n = form.number_input(
	    "What topic would you like to get terms for?",
	    min_value=0,
	    max_value=10,
	    value=5,
	)

	submitted = form.form_submit_button(label="Select Model")

	# Obtain the topic model: either the saved one or a freshly fitted one.
	if option == 'Load existing model':
	    # NOTE(review): depending on the BERTopic version, load() may unpickle
	    # the file — only point it at model files you trust.
	    topic_model = BERTopic.load("topic_10000.model")
	else:
	    # Fit a new model on a small, reproducible sample of headlines dated
	    # strictly between 2017-01-01 and 2019-01-01.
	    in_window = (news['date'] > '2017-01-01') & (news['date'] < '2019-01-01')
	    news_sample = news[in_window]
	    # sample() raises when asked for more rows than are available, so the
	    # requested size is capped at the number of rows in the window.
	    sample_size = min(200, len(news_sample))
	    news_sample = news_sample.sample(sample_size, random_state=123)
	    # From here on, tweets and topics describe the sampled headlines.
	    tweets = news_sample.headline_text.to_list()
	    topic_model = BERTopic(min_topic_size=5, verbose=True)
	    topics, _ = topic_model.fit_transform(tweets)


	# Topic overview table shown at the top of the page.
	freq = topic_model.get_topic_info()
	# Drop the -1 row by value rather than by position, so that a model without
	# a -1 row does not lose its first real topic.
	freq = freq[freq["Topic"] != -1]
	st.header("The Main Topic Clusters")
	st.write(freq)


	# List the top words of the topic at row position option_n of the table.
	st.caption("")
	st.write('Top words in topic cluster: ', option_n)
	if option_n < len(freq):
	    # Translate the row position into the model's own topic id.
	    topic_nr = freq.iloc[option_n]["Topic"]
	    # Each entry is indexable and its first element is the term to show.
	    # `or []` tolerates a falsy result for an unknown id
	    # (version-dependent — TODO confirm).
	    for term in topic_model.get_topic(topic_nr) or []:
	        st.write(str(term[0]))
	else:
	    # The selector allows values up to 10, but a model may have fewer
	    # topics; report that instead of failing on an out-of-range row.
	    st.warning(
	        f"Only {len(freq)} topic clusters are available; "
	        "pick a smaller topic number."
	    )

	# Hierarchy figure produced by the topic model, rendered as a Plotly chart.
	st.header("Relationships between clusters ")
	hierarchy_fig = topic_model.visualize_hierarchy()
	st.plotly_chart(hierarchy_fig)


	# Topics over time is shown only for the preloaded model. The chart call
	# stays inside the branch because topics_over_time is only bound here.
	if option == 'Load existing model':
	    st.header("Topics over time for Existing Model")
	    topics_over_time = topic_model.topics_over_time(
	        docs=tweets,
	        topics=topics,
	        timestamps=timestamps,
	        global_tuning=True,
	        evolution_tuning=True,
	        nr_bins=20,
	    )
	    st.plotly_chart(
	        topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)
	    )