import datetime
import math

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import gradio as gr


def boolean_search(paragraph, query):
    # Split the paragraph into lowercase words and index them for O(1) membership tests
    words = paragraph.lower().split()
    words_dict = dict.fromkeys(words, True)
    # Split the query into words; expected form is "term (and|or|not) term ..."
    query_words = query.lower().split()

    # Evaluate strictly left to right, e.g. "chatgpt and not gpt3" means
    # (contains "chatgpt") and not (contains "gpt3")
    result = words_dict.get(query_words[0], False)
    for i in range(1, len(query_words), 2):
        operator = query_words[i]
        operand = words_dict.get(query_words[i + 1], False)
        if operator == 'and':
            result = result and operand
        elif operator == 'or':
            result = result or operand
        elif operator == 'not':
            result = result and not operand
    return result


def parse_retrieved(retrieved_examples, scores, filters, k):
    results = []
    # Each counter starts at the number of retrieved examples and is decremented
    # whenever an example fails the corresponding filter
    repo_avail, in_date, boolmet = len(scores), len(scores), len(scores)
    for i in range(len(scores)):
        resdict = {}
        for key in keys:
            resdict[key] = retrieved_examples[key][i]
        resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(retrieved_examples['id'][i])
        resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(retrieved_examples['id'][i])
        # Use the first version of the i-th retrieved paper as its publication date
        resdict['published'] = retrieved_examples['versions'][i][0]['created']
        resdict['year'] = datetime.datetime.strptime(resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
        resdict['score'] = str(round(scores[i], 3))[:5]

        relevant = True
        if resdict['repo_url'] is None:
            repo_avail -= 1
            resdict['repo_url'] = ""
            if filters['limit2_pwc']:
                relevant = False
        if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
            in_date -= 1
            relevant = False
        if filters['boolean_terms'] != "" and not boolean_search(resdict['abstract'], filters['boolean_terms']):
            boolmet -= 1
            relevant = False
        if relevant:
            results.append(resdict)
    # Return the top-k relevant results along with the three filter counters
    return results[:k], repo_avail, in_date, boolmet


def create_metadata_html(metadata_dict):
    html = '''
<div style="border:1px solid #ccc; padding:10px; margin:10px 0;">
<h3>{title}</h3>
<p>Relevance score: {score} &nbsp;&nbsp; Published: {published}</p>
<p>Authors: {authors}</p>
<p>Categories: {categories} &nbsp;&nbsp; Year: {year}</p>
<p><a href="{arxiv_url}">ArXiv URL</a> &nbsp;&nbsp; <a href="{pdf_url}">PDF URL</a></p>
<p>Abstract: {abstract}</p>
<p>Repo URL: {repo_url}</p>
</div>
'''
    return html.format(**metadata_dict)


def search(query, boolean_terms, sy, ey, limit2_pwc):
    k = 30
    question_embedding = model.encode(query)
    # Retrieve more candidates (k=100) than will be shown so that filtering
    # can still leave up to k=30 relevant results
    scores, retrieved_examples = ds['train'].get_nearest_examples('embeddings', question_embedding, k=100)
    filters = {'limit2_pwc': limit2_pwc, 'sy': sy, 'ey': ey, 'boolean_terms': boolean_terms}
    results = parse_retrieved(retrieved_examples, scores, filters, k)
    divs = [create_metadata_html(r) for r in results[0]]
    divs.reverse()
    html = ("<div>Articles with Repo: {} &nbsp;&nbsp; Articles in date range: {} &nbsp;&nbsp; "
            "Articles meeting boolean terms: {}<br>Top 30 results returned</div>"
            ).format(str(results[1]), str(results[2]), str(results[3])) + "<br>".join(divs)
    return html


keys = ['title', 'authors', 'categories', 'abstract', 'repo_url', 'is_official', 'mentioned_in_paper']

ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

search_interface = gr.Blocks()
with search_interface:
    inputs = [
        gr.Textbox(label="Query", value="", info="Search Query"),
        gr.Textbox(label="Boolean Terms", value="", info="Simple boolean conditions on words contained in the abstract (AND, OR and NOT accepted for individual words, exact phrase isn't supported)"),
        gr.Slider(2013, 2023, step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"),
        gr.Slider(2013, 2023, step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"),
        gr.Checkbox(value=False, label="Limit results to those with a link to a github repo via pwc"),
    ]
    run = gr.Button("Search")
    examples = [
        ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True],
    ]
    output = gr.HTML()
    run.click(fn=search, inputs=inputs, outputs=output, api_name="Arxiv Semantic Search")

search_interface.launch()
".join(divs) return html global keys keys = ['title','authors','categories','abstract','repo_url','is_official','mentioned_in_paper'] ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6") ds['train'].add_faiss_index(column='embeddings') model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') search_interface = gr.Blocks() with search_interface: fn = search, inputs=[ gr.Textbox(label="Query",value="",info="Search Query"), gr.Textbox(label="Boolean Terms",value="",info="Simple boolean conditions on words contained in the abstract (AND OR and NOT accepted for individual words, exact phrase isn't supported)"), gr.Slider(2013, 2023,step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"), gr.Slider(2013, 2023,step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"), gr.Checkbox(value=False,label="Limit results to those with a link to a github repo via pwc") ] run = gr.Button(label="Search") examples=[ ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True], ] output=gr.outputs.HTML() run.click(fn=search, inputs=inputs, outputs=output, api_name="Arxiv Semantic Search") search_interface.launch()