Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import math
|
| 3 |
+
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
|
| 9 |
+
def boolean_search(paragraph, query):
    """Evaluate a simple left-to-right boolean word query against a paragraph.

    The query is a whitespace-separated sequence of the form
    ``term (OP term)*`` where OP is ``and``, ``or`` or ``not``
    (``a not b`` is treated as "a and not b").  Matching is
    case-insensitive and on whole whitespace-separated words only —
    no phrase support, no operator precedence, strict left-to-right
    evaluation.

    Parameters
    ----------
    paragraph : str
        Text to search (here: an article abstract).
    query : str
        Boolean expression over single words.

    Returns
    -------
    bool
        True if the paragraph satisfies the expression.  A blank query
        imposes no constraint and returns True (previously this raised
        IndexError).
    """
    # Set of the paragraph's words for O(1) case-insensitive membership.
    words = set(paragraph.lower().split())

    query_words = query.lower().split()
    if not query_words:
        # No terms -> no constraint.
        return True

    result = query_words[0] in words

    # Walk (operator, operand) pairs left to right.  Stopping at
    # len - 1 guards against a trailing operator with no operand,
    # which previously raised IndexError.
    for i in range(1, len(query_words) - 1, 2):
        operator = query_words[i]
        operand = query_words[i + 1] in words

        if operator == 'and':
            result = result and operand
        elif operator == 'or':
            result = result or operand
        elif operator == 'not':
            # "x not y" means "x and not y".
            result = result and not operand

    return result
| 31 |
+
|
| 32 |
+
def parse_retrieved(retrieved_examples, scores, filters, k, fields=None):
    """Filter FAISS-retrieved arXiv examples and build display dicts.

    Parameters
    ----------
    retrieved_examples : dict
        Column-oriented batch from ``Dataset.get_nearest_examples``:
        each key maps to a list aligned with ``scores``.  Must include
        'id', 'versions' and every name in ``fields``.
    scores : sequence of float
        Relevance scores, one per retrieved example.
    filters : dict
        Keys: 'limit2_pwc' (bool, require a repo link), 'sy'/'ey'
        (int, inclusive year range), 'boolean_terms' (str, passed to
        ``boolean_search`` when non-empty).
    k : int
        Maximum number of results to return.
    fields : list of str, optional
        Dataset columns copied verbatim into each result.  Defaults to
        the module-level ``keys`` list (backward compatible).

    Returns
    -------
    list
        ``[results, repo_avail, in_date, boolmet]`` — the (at most k)
        surviving result dicts plus, for each filter, how many of the
        retrieved examples would pass it.
    """
    if fields is None:
        # Preserve the original behaviour of reading the module global.
        fields = keys

    results = []
    # Each counter starts at the retrieved total and is decremented
    # whenever an example fails the corresponding filter.
    repo_avail = in_date = boolmet = len(scores)

    for i in range(len(scores)):
        resdict = {key: retrieved_examples[key][i] for key in fields}

        paper_id = retrieved_examples['id'][i]
        resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(paper_id)
        resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(paper_id)

        # Bug fix: index the i-th example's version list ([i]), not the
        # first retrieved example's ([0]) — previously every result
        # reported the same publication date, which also corrupted the
        # year filter below.  [0] inside picks the earliest (v1) version.
        resdict['published'] = retrieved_examples['versions'][i][0]['created']
        resdict['year'] = datetime.datetime.strptime(
            resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
        # Truncate the rounded score to at most 5 characters for display.
        resdict['score'] = str(round(scores[i], 3))[:5]

        relevant = True

        if resdict['repo_url'] is None:
            repo_avail -= 1
            resdict['repo_url'] = ""
            if filters['limit2_pwc']:
                relevant = False

        if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
            relevant = False
            in_date -= 1

        if filters['boolean_terms'] != "":
            if not boolean_search(resdict['abstract'], filters['boolean_terms']):
                relevant = False
                boolmet -= 1

        if relevant:
            results.append(resdict)

    return [results[:k], repo_avail, in_date, boolmet]
|
| 69 |
+
|
| 70 |
+
def create_metadata_html(metadata_dict):
    """Render one search result as an HTML card.

    Parameters
    ----------
    metadata_dict : dict
        Must provide every placeholder used in the template: title,
        score, published, authors, categories, year, arxiv_url,
        pdf_url, abstract, repo_url.  Extra keys are ignored by
        ``str.format``.

    Returns
    -------
    str
        A self-contained ``<div>`` snippet for the result.
    """
    # NOTE: field values are interpolated unescaped; abstracts/titles
    # containing '<' will be rendered as markup.
    html = '''
<div style="border: 1px solid #ccc; padding: 10px; background-color: #f9f9f9;">
<h2>{title}</h2>
<pre><p><strong>Relevance_score:</strong> {score} <strong>Published:</strong> {published}</p></pre>
<p><strong>Authors:</strong> {authors}</p>
<pre><p><strong>Categories:</strong> {categories} <strong>Year:</strong> {year}</p></pre>
<pre><p><a href="{arxiv_url}"><strong>ArXiv URL</strong></a> <a href="{pdf_url}"><strong>PDF URL</strong></a></p></pre>
<p><strong>Abstract:</strong> {abstract}</p>
<p><strong>Repo URL:</strong> <a href="{repo_url}">{repo_url}</a></p>
</div>
'''
    # Bug fix: the repo line previously ended with an unclosed "<p>"
    # instead of "</p>", leaving an unbalanced paragraph tag in the page.
    return html.format(**metadata_dict)
|
| 83 |
+
|
| 84 |
+
def search(query, boolean_terms, sy, ey, limit2_pwc):
    """Run a semantic arXiv search and render the results as HTML.

    Encodes the free-text ``query`` with the module-level sentence
    encoder, pulls the 100 nearest abstracts from the FAISS-indexed
    dataset, applies the user's filters via ``parse_retrieved``, and
    returns a single HTML string: a summary header followed by the
    top-10 result cards.
    """
    top_k = 10

    # Embed the query into the same vector space as the dataset rows.
    query_vec = model.encode(query)
    scores, examples = ds['train'].get_nearest_examples('embeddings', query_vec, k=100)

    filter_opts = {
        'limit2_pwc': limit2_pwc,
        'sy': sy,
        'ey': ey,
        'boolean_terms': boolean_terms,
    }

    hits, n_repo, n_in_date, n_bool = parse_retrieved(examples, scores, filter_opts, top_k)

    # Render a card per surviving hit, in reverse retrieval order.
    cards = [create_metadata_html(hit) for hit in reversed(hits)]

    header = "<br><br><pre><strong>Articles with Repo:</strong> {} <strong>Articles in date range:</strong> {} <strong>Articles meeting boolean terms:</strong> {}</pre><br><strong>Top 10 results returned<strong><br>".format(str(n_repo), str(n_in_date), str(n_bool))
    return header + "<br>".join(cards)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# Dataset columns copied verbatim into each search result.  Read as a
# module-level global by parse_retrieved.  (The original file had a
# module-level "global keys" statement, which is a no-op at module
# scope and has been dropped.)
keys = ['title', 'authors', 'categories', 'abstract', 'repo_url',
        'is_official', 'mentioned_in_paper']

# Pre-embedded arXiv dataset; build an in-memory FAISS index over the
# 'embeddings' column for nearest-neighbour search.
ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')

# Same encoder family used to produce the dataset embeddings, so query
# vectors live in the same space — TODO confirm against the dataset card.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# --- Gradio UI: query inputs, a search button, and an HTML results pane ---
search_interface = gr.Blocks()

with search_interface:
    # Input components, in the positional order expected by search().
    inputs = [
        gr.Textbox(label="Query", value="", info="Search Query"),
        gr.Textbox(label="Boolean Terms", value="",
                   info="Simple boolean conditions on words contained in the abstract (AND OR and NOT accepted for individual words, exact phrase isn't supported)"),
        gr.Slider(2013, 2023, step=1, value=2013, label="Start Year",
                  info="Choose the earliest date for papers retrieved"),
        gr.Slider(2013, 2023, step=1, value=2023, label="End Year",
                  info="Choose the latest date for papers retrieved"),
        gr.Checkbox(value=False,
                    label="Limit results to those with a link to a github repo via pwc"),
    ]

    # Button text is the positional ``value`` argument; ``label`` is not
    # a gr.Button parameter.  (The original also had a dead
    # ``fn = search,`` tuple assignment here, now removed.)
    run = gr.Button("Search")

    # NOTE(review): these example rows are defined but never attached to
    # the interface (e.g. via gr.Examples); kept for reference.
    examples = [
        ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles",
         "chatgpt AND NOT gpt3", 2013, 2023, True],
    ]

    # gr.HTML replaces the deprecated gr.outputs.HTML (removed in Gradio 4).
    output = gr.HTML()

    run.click(fn=search, inputs=inputs, outputs=output,
              api_name="Arxiv Semantic Search")

search_interface.launch()
|