import os
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
from transformers import pipeline
import streamlit as st
# Load the Natural Questions query/answer pairs
dataset_id = "sentence-transformers/natural-questions"
dataset = load_dataset(dataset_id, split="train")
# Use the allenai-specter model with SentenceTransformers
model = SentenceTransformer('allenai-specter')
# Prepare paper texts by combining query and answer fields
paper_texts = [
    record['query'] + '[SEP]' + record['answer'] for record in dataset.select(range(32))
]
# Compute embeddings for all paper texts
corpus_embeddings = model.encode(paper_texts, convert_to_tensor=True, show_progress_bar=True)
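# In Streamlit the whole script reruns on every widget interaction, so the
# model load and the encode() call above repeat on each rerun. A minimal
# sketch of the usual remedy, assuming Streamlit's st.cache_resource decorator
# is available (recent releases); the helper name is hypothetical and it is
# deliberately left unwired so the original flow above stays unchanged:
@st.cache_resource
def cached_corpus_embeddings():
    # Build the same query+answer texts and embed them once per process
    texts = [
        record['query'] + '[SEP]' + record['answer']
        for record in dataset.select(range(32))
    ]
    return model.encode(texts, convert_to_tensor=True)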
# Function to search for answers given a query
def search_papers(query):
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Perform semantic search against the precomputed corpus embeddings
    search_hits = util.semantic_search(query_embedding, corpus_embeddings)
    search_hits = search_hits[0]  # Get the hits for the first (only) query
    print("\n\nQuery:", query)
    print("Most similar answers:")
    for hit in search_hits[:5]:  # Limit to top 5 results for clarity
        related_text = dataset[int(hit['corpus_id'])]  # Access the related record
        print("{:.2f}\tAnswer: {}".format(hit['score'], related_text['answer']))
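# Note: search_papers is a console-only debug helper; it prints rather than
# returns, and the Streamlit UI below never calls it. For example,
# search_papers("who wrote the declaration of independence") would print the
# five nearest answers with their cosine-similarity scores.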
# Summarization pipeline
summarizer = pipeline("summarization")
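# pipeline("summarization") silently resolves to a library default checkpoint
# (sshleifer/distilbart-cnn-12-6 at the time of writing); pinning the model
# explicitly keeps the app reproducible across transformers releases, e.g.:
#   summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")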
# Search for the most relevant answers and summarize them into one response
def search_papers_and_summarize(query, max_summary_length=45):
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Perform semantic search
    search_hits = util.semantic_search(query_embedding, corpus_embeddings)
    search_hits = search_hits[0]  # Get the hits for the first (only) query
    # Collect answers from the top hits
    answers = []
    for hit in search_hits[:5]:  # Limit to top 5 results
        related_text = dataset[int(hit['corpus_id'])]
        answers.append(related_text['answer'])
    # Combine answers into a single text for summarization
    combined_text = " ".join(answers)
    # Summarize the combined text; truncation=True guards against the combined
    # answers exceeding the summarization model's maximum input length
    summary = summarizer(combined_text, max_length=max_summary_length,
                         truncation=True, clean_up_tokenization_spaces=True)
    # Return the summary text so the Streamlit UI can display it (the original
    # only printed it, leaving the app to render None)
    return summary[0]['summary_text']
question = st.text_input("Ask a question", "What is Wimpy Kid")
answer = search_papers_and_summarize(question)
st.write("The answer is:", answer)
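# A small UX sketch (an assumption, not in the original app): wrapping the call
# in st.spinner shows progress while the model runs, e.g.
#   with st.spinner("Searching and summarizing..."):
#       answer = search_papers_and_summarize(question)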