Spaces:

ababio
/

Search-Engine

Sleeping

App Files Files Community

Search-Engine / app.py

iababio

modify names

0c04f42 about 1 month ago

raw

history blame

No virus

8.56 kB

	import streamlit as st
	from PIL import Image
	import ujson
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer

	# Download the NLTK data packages this module relies on: the English stop-word
	# list (read via stopwords.words below) and "punkt" (presumably the models that
	# word_tokenize needs -- confirm against the NLTK version in use).
	import nltk
	nltk.download('stopwords')
	nltk.download('punkt')


	# Set up the NLTK components
	stemmer = PorterStemmer()
	stop_words = stopwords.words('english')
	tfidf = TfidfVectorizer()


	def _load_json(path):
	    """Read and return the parsed contents of the JSON file at ``path``."""
	    # JSON is UTF-8 by specification; do not depend on the platform's default
	    # text encoding when reading it.
	    with open(path, 'r', encoding='utf-8') as f:
	        return ujson.load(f)


	# Load the data
	# Stemmed texts used as the TF-IDF corpus, indexed by document id.
	pub_list_first_stem = _load_json('publication_list_stemmed.json')
	author_list_first_stem = _load_json('author_list_stemmed.json')
	# Inverted indexes: looked up by stemmed word, yielding the matching document ids.
	pub_index = _load_json('publication_indexed_dictionary.json')
	author_index = _load_json('author_indexed_dictionary.json')
	# Display fields shown on the result cards, indexed by document id.
	author_name = _load_json('author_names.json')
	pub_name = _load_json('pub_name.json')
	pub_url = _load_json('pub_url.json')
	pub_cu_author = _load_json('pub_cu_author.json')
	pub_date = _load_json('pub_date.json')


	def _stem_query_word(word):
	    """Return the space-joined Porter stems of ``word``'s tokens, stop words removed.

	    The result is an empty string when nothing but stop words remains.
	    """
	    return " ".join(
	        stemmer.stem(piece)
	        for piece in word_tokenize(word)
	        if piece not in stop_words
	    )


	def _rank_documents(doc_ids, corpus, query):
	    """Score documents against ``query`` with TF-IDF cosine similarity.

	    Args:
	        doc_ids: Iterable of ids used to index ``corpus``.
	        corpus: Stemmed document texts, indexable by the ids in ``doc_ids``.
	        query: Stemmed query string.

	    Returns:
	        dict mapping each id to its row of the similarity matrix (one value
	        per row, because there is a single query).
	    """
	    doc_ids = list(doc_ids)
	    # The vectorizer is fitted on the candidate documents only, so the IDF
	    # weights are relative to the matches rather than to the whole corpus.
	    doc_matrix = tfidf.fit_transform([corpus[doc_id] for doc_id in doc_ids])
	    scores = cosine_similarity(doc_matrix, tfidf.transform([query]))
	    # enumerate() pairs each id with its row directly, avoiding an O(n)
	    # list.index() lookup per document.
	    return {doc_id: scores[row] for row, doc_id in enumerate(doc_ids)}


	def search_data(input_text, operator_val, search_type):
	    """Find and rank the documents that match a free-text query.

	    The query is lower-cased and split on whitespace; each word is stemmed and
	    looked up in the inverted index selected by ``search_type``. Words that
	    stem to nothing (stop words) are ignored.

	    Args:
	        input_text: Raw query string typed by the user.
	        operator_val: ``2`` selects OR matching (a document needs any one of
	            the query words); every other value selects AND matching (a
	            document needs all of the query words).
	        search_type: ``"publication"`` or ``"author"``; picks the index and
	            corpus to search. Any other value yields no results.

	    Returns:
	        dict mapping document id to its cosine-similarity score (a
	        one-element array). Empty when nothing matches, or when the query has
	        fewer than two words -- in that case a Streamlit warning is shown too.
	    """
	    if search_type == "publication":
	        index, corpus = pub_index, pub_list_first_stem
	    elif search_type == "author":
	        index, corpus = author_index, author_list_first_stem
	    else:
	        return {}

	    words = input_text.lower().split()
	    # The word count does not change while iterating, so validate it once up
	    # front; this also covers the empty query.
	    if len(words) < 2:
	        st.warning("Please enter at least 2 words to apply the operator.")
	        return {}

	    terms = [stem for stem in map(_stem_query_word, words) if stem]

	    if operator_val == 2:
	        # OR: union of the per-word matches, each scored against its own word.
	        # A document matched by several words keeps the score of the last one.
	        output_data = {}
	        for term in terms:
	            postings = index.get(term)
	            if not postings:
	                # Nothing to score for this word; scores already collected
	                # for earlier words are left untouched.
	                continue
	            output_data.update(_rank_documents(postings, corpus, term))
	        return output_data

	    # AND: keep only the documents listed under every query term.
	    matched = None
	    for term in terms:
	        postings = set(index.get(term) or ())
	        matched = postings if matched is None else matched & postings
	        if not matched:
	            break
	    if not matched:
	        # Either a term matched nothing, the intersection is empty, or the
	        # query consisted solely of stop words.
	        return {}
	    # Rank against the whole stemmed query rather than a single word of it.
	    return _rank_documents(matched, corpus, " ".join(terms))


	def app():
	    """Render the search page: logo, tagline, query controls and, on demand, results."""
	    # Logo banner followed by the centred tagline.
	    st.image(Image.open('Fordham-University-Logo.png'))
	    st.markdown("<p style='text-align: center;'> Uncover the brilliance: Explore profiles, groundbreaking work, and cutting-edge research by the exceptional minds of Fordham University.</p>", unsafe_allow_html=True)

	    # Query box plus the two radio groups that steer the search.
	    input_text = st.text_input("Search research:", key="query_input")
	    operator_val = st.radio(
	        "Search Filters",
	        ['Exact', 'Relevant'],
	        index=1,
	        key="operator_input",
	        horizontal=True,
	    )
	    search_type = st.radio(
	        "Search in:",
	        ['Publications', 'Authors'],
	        index=0,
	        key="search_type_input",
	        horizontal=True,
	    )

	    # Nothing else happens until the button is pressed on this run.
	    if not st.button("SEARCH"):
	        return

	    # search_data() expects a numeric operator code and a lower-case kind.
	    operator_code = 1 if operator_val == 'Exact' else 2
	    index_kind = {"Publications": "publication", "Authors": "author"}.get(search_type)
	    if index_kind is None:
	        output_data = {}
	    else:
	        output_data = search_data(input_text, operator_code, index_kind)

	    # Display the search results
	    show_results(output_data, search_type)

	    # st.markdown("<p style='text-align: center;'> Brought to you with by <a href='https://github.com/iababio'>Boakye I Ababio</a> \| Data © Fordham University </p>", unsafe_allow_html=True)


	def show_results(output_data, search_type):
	    """Draw the search hits as cards, best score first, three cards per row.

	    Args:
	        output_data: dict mapping document id to its score; the score is
	            indexed with ``[0]`` when displayed.
	        search_type: ``"Publications"`` or ``"Authors"``; decides whether the
	            card's second line comes from ``pub_cu_author`` or ``author_name``.
	            For any other value only the ranking line is drawn.
	    """
	    ranked_hits = sorted(output_data.items(), key=lambda pair: pair[1], reverse=True)

	    # Show the total number of research results
	    st.info(f"Showing results for: {len(ranked_hits)}")

	    # Show the cards
	    cards_per_row = 3
	    for position, (doc_id, score) in enumerate(ranked_hits):
	        slot = position % cards_per_row
	        if slot == 0:
	            # Start a new row: a divider, then a fresh set of columns.
	            st.write("---")
	            row_columns = st.columns(cards_per_row, gap="large")
	        # Draw the card
	        with row_columns[slot]:
	            if search_type in ("Publications", "Authors"):
	                # The two card layouts differ only in where the byline comes from.
	                byline = pub_cu_author if search_type == "Publications" else author_name
	                st.caption(f"{pub_date[doc_id].strip()}")
	                st.markdown(f"{byline[doc_id].strip()}")
	                st.markdown(f"{pub_name[doc_id].strip()}")
	                st.markdown(f"{pub_url[doc_id]}")
	            st.markdown(f"Ranking: {score[0]:.2f}")

	    # One card is drawn per hit, so the number shown equals the number of hits.
	    if not ranked_hits:
	        st.info("No results found. Please try again.")
	    else:
	        st.info(f"Results shown for: {len(ranked_hits)}")


	# Build the page only when this file is run as the entry script (not on import).
	if __name__ == '__main__':
	    app()