# Search-Engine / app.py
# iababio
# modify names
# 0c04f42
# raw
# history blame
# No virus
# 8.56 kB
import streamlit as st
from PIL import Image
import ujson
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
# Fetch the NLTK data this module relies on (stop-word corpus and the
# 'punkt' tokenizer resource) before anything below uses it.
nltk.download('stopwords')
nltk.download('punkt')

# Set up the NLTK components
stemmer = PorterStemmer()
# Stored as a set because it is only used for per-word membership tests.
stop_words = set(stopwords.words('english'))
tfidf = TfidfVectorizer()


def _load_json(path):
    """Parse the JSON file at *path* and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as f:
        return ujson.load(f)


# Load the data
# Stemmed texts, looked up by the ids stored in the matching *_index.
pub_list_first_stem = _load_json('publication_list_stemmed.json')
# Index from a stemmed word to the ids filed under it.
pub_index = _load_json('publication_indexed_dictionary.json')
author_list_first_stem = _load_json('author_list_stemmed.json')
author_index = _load_json('author_indexed_dictionary.json')
# Display fields shown on each result card, looked up by result id.
author_name = _load_json('author_names.json')
pub_name = _load_json('pub_name.json')
pub_url = _load_json('pub_url.json')
pub_cu_author = _load_json('pub_cu_author.json')
pub_date = _load_json('pub_date.json')
def _stem_query_word(word):
    """Return the Porter stems of *word*'s non-stop-word tokens, space-joined.

    The result is an empty string when every token is a stop word.
    """
    return " ".join(
        stemmer.stem(piece)
        for piece in word_tokenize(word)
        if piece not in stop_words
    )


def _lookup_postings(stem, search_type):
    """Return the ids indexed under *stem* for *search_type*, or [] if none."""
    if search_type == "publication":
        return pub_index.get(stem) or []
    if search_type == "author":
        return author_index.get(stem) or []
    return []


def _repeated_items(items):
    """Return the distinct values that occur more than once in *items*."""
    seen = set()
    repeated = set()
    for item in items:
        if item in seen:
            repeated.add(item)
        else:
            seen.add(item)
    return list(repeated)


def _rank(doc_ids, query, search_type):
    """Score each id in *doc_ids* against *query* by TF-IDF cosine similarity.

    The vectorizer is refitted on just the candidate texts, so scores are
    relative to this candidate set. Each value is one row of the
    (n_candidates, 1) similarity matrix, i.e. a one-element array.
    """
    if search_type == "publication":
        corpus = pub_list_first_stem
    else:
        corpus = author_list_first_stem
    doc_matrix = tfidf.fit_transform([corpus[doc_id] for doc_id in doc_ids])
    scores = cosine_similarity(doc_matrix, tfidf.transform([query]))
    # enumerate() gives the matrix row directly; no list.index() scan per id.
    return {doc_id: scores[row] for row, doc_id in enumerate(doc_ids)}


def search_data(input_text, operator_val, search_type):
    """Find and score the indexed entries that match a free-text query.

    The query is lower-cased and split on whitespace; each word is tokenized,
    stripped of stop words and stemmed before it is looked up in the index.

    Args:
        input_text: The raw query string.
        operator_val: 2 selects the "relevant" mode (match any word); every
            other value selects the "exact" mode. app() passes 1 for
            'Exact' and 2 for 'Relevant'.
        search_type: "publication" or "author"; picks which index and which
            stemmed texts are searched. Any other value matches nothing.

    Returns:
        A dict mapping each matching id to its score, a one-element array
        holding the cosine similarity. Empty when nothing matches or when
        the query has fewer than two words.

    Relevant mode scores every word's posting list against that word alone
    and merges the results; an id matched by several words keeps the score
    from the last of them. Words without an index entry (including pure
    stop words) are skipped, so they can no longer overwrite earlier
    scores with zeros.

    Exact mode keeps ids that occur in the posting lists of at least two
    query words; once some ids are kept, each further matched word narrows
    them to the ids it also indexes. The survivors are ranked against the
    whole stemmed query rather than only its last word, so a trailing stop
    word no longer produces all-zero rankings.
    """
    words = input_text.lower().split()
    if len(words) < 2:
        # An empty query stays silent; a single word gets the hint.
        if words:
            st.warning("Please enter at least 2 words to apply the operator.")
        return {}

    stems = [_stem_query_word(word) for word in words]

    if operator_val == 2:
        # Relevant (match any word).
        output_data = {}
        for stem in stems:
            postings = _lookup_postings(stem, search_type)
            if postings:
                output_data.update(_rank(postings, stem, search_type))
        return output_data

    # Exact (ids shared between query words).
    pointer = []          # every posting list found so far, concatenated
    latest = set()        # posting list of the most recent matched word
    match_word = []
    for stem in stems:
        postings = _lookup_postings(stem, search_type)
        if postings:
            latest = set(postings)
            pointer.extend(latest)
        if not match_word:
            # Nothing kept yet: take ids seen under two or more words.
            match_word = _repeated_items(pointer)
        else:
            # Narrow the kept ids to those the latest word also indexes.
            match_word = _repeated_items(match_word + list(latest))

    if not match_word:
        return {}
    query = " ".join(stem for stem in stems if stem)
    return _rank(match_word, query, search_type)
def app():
    """Render the search page and, when SEARCH is pressed, show the results.

    Draws the logo, a tagline, the query box and two radio groups (match
    mode and what to search in). On a button press it runs search_data()
    with the chosen options and hands the outcome to show_results().
    """
    # Load the image and display it
    image = Image.open('Fordham-University-Logo.png')
    st.image(image)

    # Add a text description
    st.markdown("<p style='text-align: center;'> Uncover the brilliance: Explore profiles, groundbreaking work, and cutting-edge research by the exceptional minds of Fordham University.</p>", unsafe_allow_html=True)

    input_text = st.text_input("Search research:", key="query_input")
    operator_val = st.radio(
        "Search Filters",
        ['Exact', 'Relevant'],
        index=1,
        key="operator_input",
        horizontal=True,
    )
    search_type = st.radio(
        "Search in:",
        ['Publications', 'Authors'],
        index=0,
        key="search_type_input",
        horizontal=True,
    )

    if st.button("SEARCH"):
        # search_data() takes a numeric mode: 1 for 'Exact', 2 otherwise.
        operator_code = 1 if operator_val == 'Exact' else 2
        if search_type == "Publications":
            output_data = search_data(input_text, operator_code, "publication")
        elif search_type == "Authors":
            output_data = search_data(input_text, operator_code, "author")
        else:
            output_data = {}

        # Display the search results
        show_results(output_data, search_type)

    # st.markdown("<p style='text-align: center;'> Brought to you with by <a href='https://github.com/iababio'>Boakye I Ababio</a> | Data © Fordham University </p>", unsafe_allow_html=True)
def show_results(output_data, search_type):
    """Display search hits as cards, three per row, best score first.

    Args:
        output_data: Dict mapping a result id to its score. Each score must
            be indexable; element 0 is the number that is sorted on and
            printed.
        search_type: "Publications" or "Authors" (the radio label); selects
            which name is shown in bold on each card.
    """
    # Sort on the scalar score instead of comparing one-element arrays.
    rank_sorting = sorted(
        output_data.items(), key=lambda item: item[1][0], reverse=True
    )
    total = len(rank_sorting)

    # Show the total number of research results
    st.info(f"Showing results for: {total}")

    # Show the cards
    N_cards_per_row = 3
    for n_row, (id_val, ranking) in enumerate(rank_sorting):
        column = n_row % N_cards_per_row
        if column == 0:
            # Start a new row of columns every N_cards_per_row cards.
            st.write("---")
            cols = st.columns(N_cards_per_row, gap="large")

        # Draw the card
        with cols[column]:
            if search_type == "Publications":
                st.caption(f"{pub_date[id_val].strip()}")
                st.markdown(f"**{pub_cu_author[id_val].strip()}**")
                st.markdown(f"*{pub_name[id_val].strip()}*")
                st.markdown(f"**{pub_url[id_val]}**")
            elif search_type == "Authors":
                st.caption(f"{pub_date[id_val].strip()}")
                st.markdown(f"**{author_name[id_val].strip()}**")
                st.markdown(f"*{pub_name[id_val].strip()}*")
                st.markdown(f"**{pub_url[id_val]}**")
            st.markdown(f"Ranking: {ranking[0]:.2f}")

    if total == 0:
        st.info("No results found. Please try again.")
    else:
        st.info(f"Results shown for: {total}")
# Build the page only when this file is run as the main script.
if __name__ == '__main__':
    app()