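"""Streamlit app that answers questions over a set of web pages.

The pages listed in the text box are fetched with requests/BeautifulSoup,
a sentence-transformers cross-encoder picks the passage most relevant to the
query, and the answer is produced either by GPT-3 (text-davinci-002) or by an
extractive question-answering pipeline. Requires an OpenAI API key stored in
Streamlit secrets under "openai_key" when the GPT-3 answerer is selected.
"""
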
import re

import openai
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sentence_transformers import CrossEncoder
from transformers import pipeline

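# Maps each extracted passage (its text) to the URL it was scraped from.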
all_documents = {}

def qa_gpt3(question, context):
    """Answer `question` from `context` using the OpenAI completion API."""
    openai.api_key = st.secrets["openai_key"]

    response = openai.Completion.create(
        model="text-davinci-002",
        prompt=f"Given this context, answer a question. If you cannot find an answer say \"Unknown\".\n\nContext: {context}\n\nQuestion: {question}",
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    print(response)
    return {'answer': response['choices'][0]['text'].strip()}

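# Streamlit UI: URL list, free-text query, and choice of answering model.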
st.title('Document Question Answering System')

qa_model = None

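# When checked, same-domain links found on each page are also fetched (one level deep).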
crawl_urls = st.checkbox('Crawl?', value=False)

document_text = st.text_area(
    label="Links (Comma separated)", height=100,
    value='https://www.databricks.com/blog/2022/11/15/values-define-databricks-culture.html, https://databricks.com/product/databricks-runtime-for-machine-learning/faq'
)
query = st.text_input("Query")

qa_option = st.selectbox('Q/A Answerer', ('gpt3', 'a-ware/bart-squadv2'))

if qa_option == 'gpt3':
    qa_model = qa_gpt3
else:
    qa_model = pipeline("question-answering", qa_option)
st.write(f'Using {qa_option} as the Q/A model')

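# Cross-encoder that scores (query, passage) pairs; used to rank passages by relevance.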
encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def get_relevant_passage(question, documents):
    """Return the stored passage most relevant to `question` according to the cross-encoder."""
    query_paragraph_list = [(question, para) for para in list(documents.keys()) if len(para.strip()) > 0]

    scores = encoder.predict(query_paragraph_list)
    # argsort is ascending, so the last five indices are the five highest-scoring passages.
    top_5_indices = scores.argsort()[-5:]
    top_5_query_paragraph_list = [query_paragraph_list[i] for i in top_5_indices]
    top_5_query_paragraph_list.reverse()
    # Only the single best passage is used as context for the answerer.
    return top_5_query_paragraph_list[0][1]


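# Dispatches to whichever answerer was selected above (the GPT-3 helper or an HF pipeline).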
def answer_question(query, context):
    answer = qa_model(question=query, context=context)['answer']
    return answer


def get_documents(document_text, crawl=crawl_urls):
    """Fetch every URL in the comma-separated `document_text` and index its text in `all_documents`."""
    urls = document_text.split(',')
    for url in urls:
        url = url.strip()
        st.write(f'Crawling {url}')
        if url in set(all_documents.values()):
            continue
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html.parser')

        if crawl:
            st.write('Give me a sec, crawling..')
            more_urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html)
            # Keep only links on the same host that do not end in a short file extension,
            # and fetch each of them one level deep (crawl=False stops the recursion).
            more_urls = list(set([m for m in more_urls if m[-4] != '.' and m[-3] != '.' and m.split('/')[:3] == url.split('/')[:3]]))
            for more_url in more_urls:
                all_documents.update(get_documents(more_url, crawl=False))

        body = soup.get_text()

        document_paragraphs = [body]  # TODO change this to tokenize docs differently
        for document_paragraph in document_paragraphs:
            all_documents[document_paragraph] = url

    return all_documents


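# Main flow: fetch the documents, pick the most relevant passage, answer the query, and show the source.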
if len(document_text.strip()) > 0 and len(query.strip()) > 0 and qa_model and encoder:
    st.write('Hmmm let me think about that..')
    document_text = document_text.strip()
    documents = get_documents(document_text)
    st.write(f'I am looking through {len(set(documents.values()))} sites')

    query = query.strip()
    context = get_relevant_passage(query, documents)
    answer = answer_question(query, context)

    relevant_url = documents[context]

    st.write('Check the answer below...with reference text')
    st.header("ANSWER: "+answer)
    st.subheader("REFERENCE: "+context)
    st.subheader("REFERENCE URL: "+relevant_url)