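# Streamlit app: question answering over a set of web pages.
# Paste comma-separated links, type a question, and the app fetches each page,
# picks the most relevant passage with a cross-encoder, and answers with either
# GPT-3 (text-davinci-003) or a Hugging Face question-answering model.
#
# To run locally (assuming this file is saved as app.py):
#   pip install streamlit openai requests beautifulsoup4 sentence-transformers transformers
#   streamlit run app.py
# The GPT-3 option expects an `openai_key` entry in .streamlit/secrets.toml.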
import re

import openai
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sentence_transformers import CrossEncoder
from transformers import pipeline

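# Global index mapping each extracted passage to the URL it came from.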
all_documents = {}


def qa_gpt3(question, context):
    """Answer a question given a context passage, via the legacy OpenAI Completions API (openai<1.0)."""
    print(question, context)  # debug logging
    openai.api_key = st.secrets["openai_key"]

    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Answer given the following context: {context}\n\nQuestion: {question}",
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    print(response)  # debug logging
    # Return the same shape as a transformers question-answering pipeline: a dict with an 'answer' key.
    return {'answer': response['choices'][0]['text'].strip()}


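# Streamlit UI: page title, link and query inputs, and the Q/A model / tokenizing selectors.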
st.title('Document Question Answering System')

qa_model = None

crawl_urls = st.checkbox('Crawl?', value=False)

document_text = st.text_area(
    label="Links (Comma separated)", height=100,
    value='https://www.databricks.com/blog/2022/11/15/values-define-databricks-culture.html, https://databricks.com/product/databricks-runtime-for-machine-learning/faq'
)
query = st.text_input("Query")

qa_option = st.selectbox('Q/A Answerer', ('gpt3', 'a-ware/bart-squadv2'))
tokenizing = st.selectbox('How to Tokenize',
                          ("Don't (use entire body as document)", 'Newline (split by newline character)', 'Combo'))

if qa_option == 'gpt3':
    qa_model = qa_gpt3
else:
    qa_model = pipeline("question-answering", qa_option)
st.write(f'Using {qa_option} as the Q/A model')

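# Cross-encoder re-ranker that scores (question, passage) pairs; used to pick the reference passage.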
encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


def get_relevent_passage(question, documents):
    """Score every candidate passage against the question with the cross-encoder and return the best one."""
    query_paragraph_list = [(question, para) for para in list(documents.keys()) if len(para.strip()) > 0]

    scores = encoder.predict(query_paragraph_list)
    # argsort is ascending, so the last five indices are the highest-scoring pairs; reversing puts the best first.
    top_5_indices = scores.argsort()[-5:]
    top_5_query_paragraph_list = [query_paragraph_list[i] for i in top_5_indices]
    top_5_query_paragraph_list.reverse()
    return top_5_query_paragraph_list[0][1]


def answer_question(query, context):
    """Run the selected Q/A model on a single (question, context) pair."""
    answer = qa_model(question=query, context=context)['answer']
    return answer


def get_documents(document_text, crawl=crawl_urls):
    """Fetch each comma-separated URL, optionally follow same-site links, and index its text by passage."""
    urls = [u.strip() for u in document_text.split(',')]
    for url in urls:
        st.write(f'Crawling {url}')
        # Skip URLs that have already been fetched and indexed.
        if url in set(all_documents.values()):
            continue
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html.parser')

        if crawl:
            st.write('Give me a sec, crawling..')
            # Pull every absolute http(s) URL out of the raw HTML.
            more_urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                                   html)
            # Keep only same-site links that do not look like file downloads, then fetch each one
            # without crawling further (crawl=False keeps the recursion one level deep).
            more_urls = list(
                set([m for m in more_urls if m[-4] != '.' and m[-3] != '.' and m.split('/')[:3] == url.split('/')[:3]]))
            for more_url in more_urls:
                all_documents.update(get_documents(more_url, crawl=False))

        # Keep only lines of the page body longer than 10 characters to drop navigation cruft.
        body = "\n".join([x for x in soup.body.get_text().split('\n') if len(x) > 10])
        print(body)  # debug logging

        # Split the page into candidate passages according to the selected tokenizing strategy.
        if tokenizing == "Don't (use entire body as document)":
            document_paragraphs = [body]
        elif tokenizing == 'Newline (split by newline character)':
            document_paragraphs = [n for n in body.split('\n') if len(n) > 250]
        elif tokenizing == 'Combo':
            document_paragraphs = [body] + [n for n in body.split('\n') if len(n) > 250]

        # Remember which URL each passage came from so the answer can cite its source.
        for document_paragraph in document_paragraphs:
            all_documents[document_paragraph] = url

    return all_documents


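# Main flow: once links and a query are provided, fetch the documents, retrieve the best passage, and answer.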
if len(document_text.strip()) > 0 and len(query.strip()) > 0 and qa_model and encoder:
    st.write('Hmmm let me think about that..')
    document_text = document_text.strip()
    documents = get_documents(document_text)
    st.write(f'I am looking through {len(set(documents.values()))} sites')

    query = query.strip()
    context = get_relevent_passage(query, documents)
    answer = answer_question(query, context)

    relevant_url = documents[context]

    st.write('Check the answer below...with reference text')
    st.header("ANSWER: " + answer)
    st.subheader("REFERENCE: " + context)
    st.subheader("REFERENCE URL: " + relevant_url)