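# NOTE(review): the next three lines look like web-page residue captured with
# the file, not program text; kept here as comments so nothing is lost.
# Spaces:
# Sleeping
# Sleeping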
import streamlit as st | |
from PIL import Image | |
import ujson | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
from nltk.stem import PorterStemmer | |
import nltk | |
# Fetch the NLTK resources needed for query pre-processing: the stop-word
# corpus read below and the "punkt" models used by word_tokenize.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt' --
# confirm against the NLTK version this app is deployed with.
nltk.download('stopwords')
nltk.download('punkt')

# Set up the NLTK components
stemmer = PorterStemmer()
stop_words = stopwords.words('english')
# Shared vectorizer; it is re-fitted on the candidate documents of each query.
tfidf = TfidfVectorizer()


def _load_json(path):
    """Parse the JSON file at *path* and return the decoded object."""
    # JSON text is UTF-8; naming the encoding keeps the load from depending on
    # the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return ujson.load(f)


# Load the data
# Stemmed document text and the stem -> document-id lookup, one pair per
# search target (publications, authors).
pub_list_first_stem = _load_json('publication_list_stemmed.json')
pub_index = _load_json('publication_indexed_dictionary.json')
author_list_first_stem = _load_json('author_list_stemmed.json')
author_index = _load_json('author_indexed_dictionary.json')
# Display fields looked up by document id when result cards are drawn
# (presumably all aligned on the same ids -- verify against the data files).
author_name = _load_json('author_names.json')
pub_name = _load_json('pub_name.json')
pub_url = _load_json('pub_url.json')
pub_cu_author = _load_json('pub_cu_author.json')
pub_date = _load_json('pub_date.json')
def _stem_query_token(token):
    """Return *token* as Porter stems, each followed by a space, minus stop words.

    A token made up only of stop words yields the empty string.
    """
    return "".join(
        stemmer.stem(piece) + " "
        for piece in word_tokenize(token)
        if piece not in stop_words
    )


def _score_documents(doc_ids, corpus, query):
    """Score the documents *doc_ids* of *corpus* against the stemmed *query*.

    The shared ``tfidf`` vectorizer is re-fitted on just these documents, so
    the weights are relative to the candidate set rather than the whole corpus.

    Returns:
        dict mapping each document id to its row of the cosine-similarity
        matrix (a one-element array, because there is a single query).
    """
    doc_ids = list(doc_ids)
    doc_matrix = tfidf.fit_transform([corpus[doc_id] for doc_id in doc_ids])
    similarity = cosine_similarity(doc_matrix, tfidf.transform([query]))
    # enumerate() pairs each id with its row directly instead of searching the
    # id list again for every document.
    return {doc_id: similarity[row] for row, doc_id in enumerate(doc_ids)}


def search_data(input_text, operator_val, search_type):
    """Find and score the documents matching *input_text*.

    The query is lower-cased and split on whitespace; every word is tokenized,
    stripped of stop words and stemmed before it is looked up in the index.

    NOTE(review): the original indentation of this function was lost, so the
    nesting was reconstructed from the data flow -- confirm against the
    deployed version.

    Args:
        input_text: Raw query string.
        operator_val: ``2`` runs the per-word search (any word may match; this
            is what ``app`` sends for 'Relevant'). Any other value runs the
            all-words search (what ``app`` sends for 'Exact').
        search_type: ``"publication"`` or ``"author"``; selects the index and
            stemmed corpus to search. Any other value produces no results.

    Returns:
        dict mapping document id to a one-element array holding its cosine
        similarity score. Empty when the query has fewer than two words or
        nothing matches.
    """
    tokens = input_text.lower().split()
    if len(tokens) < 2:
        # A single word triggers the warning; an empty query stays silent.
        if tokens:
            st.warning("Please enter at least 2 words to apply the operator.")
        return {}

    if search_type == "publication":
        index, corpus = pub_index, pub_list_first_stem
    elif search_type == "author":
        index, corpus = author_index, author_list_first_stem
    else:
        return {}

    stems = [_stem_query_token(token) for token in tokens]

    if operator_val == 2:
        # Per-word search: score each word's own posting list. A document hit
        # by several words keeps the score of the last such word.
        output_data = {}
        for stem in stems:
            postings = index.get(stem.strip())
            if not postings:
                # Skip words that are not indexed. Re-scoring the previous
                # word's documents against them would overwrite real scores.
                continue
            output_data.update(_score_documents(postings, corpus, stem))
        return output_data

    # All-words search: keep only documents present in the posting list of
    # every indexed query word. Words missing from the index (stop words
    # included) are ignored rather than treated as a failed match.
    posting_sets = [
        set(postings)
        for postings in (index.get(stem.strip()) for stem in stems)
        if postings
    ]
    if len(posting_sets) < 2:
        # The operator needs at least two indexed words to combine.
        return {}
    common_ids = set.intersection(*posting_sets)
    if not common_ids:
        return {}
    # Rank against the whole stemmed query, not only its last word.
    return _score_documents(common_ids, corpus, "".join(stems))
def app():
    """Draw the search page and, when SEARCH is pressed, run and show the query.

    NOTE(review): indentation was lost in the source; the footer is placed at
    function level (drawn on every run) -- confirm that is the intended layout.
    """
    # Header: logo followed by the centred tagline.
    st.image(Image.open('Fordham-University-Logo.png'))
    st.markdown("<p style='text-align: center;'> Uncover the brilliance: Explore profiles, groundbreaking work, and cutting-edge research by the exceptional minds of Fordham University.</p>", unsafe_allow_html=True)

    # Query box plus the two horizontal radio groups that configure the search.
    input_text = st.text_input("Search research:", key="query_input")
    operator_val = st.radio(
        "Search Filters",
        ['Exact', 'Relevant'],
        index=1,
        key="operator_input",
        horizontal=True,
    )
    search_type = st.radio(
        "Search in:",
        ['Publications', 'Authors'],
        index=0,
        key="search_type_input",
        horizontal=True,
    )

    if st.button("SEARCH"):
        # search_data expects 1 for 'Exact' and 2 for anything else.
        operator_code = 1 if operator_val == 'Exact' else 2
        # Translate the radio label into the search_data target name.
        target = {"Publications": "publication", "Authors": "author"}.get(search_type)
        if target is None:
            output_data = {}
        else:
            output_data = search_data(input_text, operator_code, target)
        # Display the search results
        show_results(output_data, search_type)

    st.markdown("<p style='text-align: center;'> Brought to you with ❤ by <a href='https://github.com/maladeep'>Mala Deep</a> | Data © Coventry University </p>", unsafe_allow_html=True)
def show_results(output_data, search_type):
    """Render the scored hits as a grid of cards, three per row, best first.

    Args:
        output_data: dict mapping document id to its score; the first element
            of each score is what gets printed on the card.
        search_type: ``"Publications"`` or ``"Authors"``; decides which name
            list supplies the bold byline. Any other value leaves the card
            with only its ranking line.
    """
    cards_per_row = 3
    ranked_hits = sorted(output_data.items(), key=lambda hit: hit[1], reverse=True)

    # Show the total number of research results
    st.info(f"Showing results for: {len(ranked_hits)}")

    # The two card layouts differ only in where the byline comes from.
    if search_type == "Publications":
        byline_source = pub_cu_author
    elif search_type == "Authors":
        byline_source = author_name
    else:
        byline_source = None

    shown = 0
    row_columns = None
    for position, (doc_id, score) in enumerate(ranked_hits):
        slot = position % cards_per_row
        if slot == 0:
            # First card of a row: draw a divider and open a fresh set of columns.
            st.write("---")
            row_columns = st.columns(cards_per_row, gap="large")
        # Draw the card
        with row_columns[slot]:
            if byline_source is not None:
                st.caption(f"{pub_date[doc_id].strip()}")
                st.markdown(f"**{byline_source[doc_id].strip()}**")
                st.markdown(f"*{pub_name[doc_id].strip()}*")
                st.markdown(f"**{pub_url[doc_id]}**")
            st.markdown(f"Ranking: {score[0]:.2f}")
        shown += 1

    if shown == 0:
        st.info("No results found. Please try again.")
    else:
        st.info(f"Results shown for: {shown}")
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    app()