Annelisseishere committed on
Commit
48c38d2
·
1 Parent(s): 3090c68

first commit

Browse files
Files changed (2) hide show
  1. app.py.py +142 -0
  2. requirements.txt +14 -0
app.py.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os
3
+ import streamlit as st
4
+ from PyPDF2 import PdfFileReader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain.embeddings.openai import OpenAIEmbeddings
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.chains.question_answering import load_qa_chain
9
+ from langchain.llms import OpenAI as LLMSOpenAI
10
+ from langchain.llms import AzureOpenAI
11
+ from langchain.callbacks import get_openai_callback
12
+ from langchain.chat_models import ChatOpenAI
13
+ from docx import Document
14
+ from openpyxl import load_workbook
15
+ import pdfplumber
16
+
17
+
18
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_file: Path or binary file-like object handed to
            ``pdfplumber.open``.

    Returns:
        The extracted text of all pages in page order, joined with a
        newline so the last word of one page is not fused with the first
        word of the next. Pages without extractable text contribute an
        empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() can return None for pages with no text layer
        # (e.g. scanned images); treat those as empty instead of crashing
        # on ``str + None``.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)
24
+
25
+
26
def extract_text_from_docx(docx_file):
    """Return the paragraph texts of a Word document joined by newlines.

    Args:
        docx_file: Value passed straight to ``Document``.

    Returns:
        A single string with one document paragraph per line.
    """
    body = Document(docx_file).paragraphs
    return "\n".join(para.text for para in body)
30
+
31
+
32
def extract_text_from_excel(excel_file):
    """Return every non-empty cell of a workbook as newline-terminated text.

    Args:
        excel_file: Value passed straight to ``openpyxl.load_workbook``.

    Returns:
        A string containing ``str(cell.value)`` followed by a newline for
        each non-empty cell, visiting worksheets in workbook order and
        cells row by row.
    """
    workbook = load_workbook(excel_file)
    lines = []
    # ``worksheets`` skips chartsheets, which have no ``iter_rows``.
    for worksheet in workbook.worksheets:
        for row in worksheet.iter_rows():
            for cell in row:
                value = cell.value
                # Skip only truly empty cells; a plain truthiness test
                # would also drop legitimate 0 / False values.
                if value is None or value == "":
                    continue
                lines.append(f"{value}\n")
    return "".join(lines)
42
+
43
+
44
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split text on newlines into overlapping, size-bounded chunks.

    Args:
        text: The full document text.
        chunk_size: Target maximum chunk length, measured with ``len``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 200, the value previously hard-coded here.

    Returns:
        The list of chunk strings produced by
        ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
52
+
53
+
54
def create_knowledge_base(chunks, api_key=None):
    """Embed text chunks with OpenAI and index them in a FAISS store.

    Args:
        chunks: Text chunks to embed and index.
        api_key: Passed to ``OpenAIEmbeddings`` as ``openai_api_key``.

    Returns:
        The vector store built by ``FAISS.from_texts``.
    """
    embedder = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_texts(chunks, embedder)
58
+
59
+
60
def answer_question(question, knowledge_base, model):
    """Answer a question from the documents most similar to it.

    Args:
        question: The user's question.
        knowledge_base: Object exposing ``similarity_search(question)``.
        model: Callable invoked as ``model(model_name="gpt-3.5-turbo")``
            to build the language model.

    Returns:
        The result of running a "stuff" question-answering chain over the
        retrieved documents.
    """
    relevant_docs = knowledge_base.similarity_search(question)
    qa_chain = load_qa_chain(
        model(model_name="gpt-3.5-turbo"),
        chain_type="stuff",
    )
    # The chain runs inside the OpenAI callback context, as before; the
    # callback object itself is not inspected.
    with get_openai_callback():
        return qa_chain.run(input_documents=relevant_docs, question=question)
67
+
68
+
69
def save_api_key(api_key):
    """Store the given API key in Streamlit's session state as ``api_key``."""
    st.session_state["api_key"] = api_key
71
+
72
+
73
def main():
    """Render the Streamlit document question-answering page.

    The sidebar collects the API key, the model class, the chunking
    parameters and two display toggles. Once a key has been saved and a
    document uploaded, the document text is extracted, chunked, embedded
    into a knowledge base, and the user's question is answered from it.
    Question/answer pairs are kept in ``st.session_state.answers``.
    """
    # Local import keeps this function self-contained.
    from functools import partial

    load_dotenv()
    st.set_page_config(page_title="Ask Your PDF", layout="wide")

    # Sidebar
    st.sidebar.title("Settings")

    # API Key input
    st.sidebar.subheader("API Key")
    api_key = st.sidebar.text_input("Insert your API Key", type="password")
    st.sidebar.button("Save API Key", on_click=save_api_key, args=(api_key,))

    model_type = st.sidebar.selectbox("Select Language Model", ["OpenAI", "AzureOpenAI"])
    model = AzureOpenAI if model_type == "AzureOpenAI" else ChatOpenAI

    chunk_size = st.sidebar.slider("Chunk Size", min_value=500, max_value=2000, value=1000, step=100)
    chunk_overlap = st.sidebar.slider("Chunk Overlap", min_value=100, max_value=500, value=200, step=50)
    show_content = st.sidebar.checkbox("Show Document Content")
    show_answers = st.sidebar.checkbox("Show Previous Answers")

    # Main content
    st.title("Ask Your Document 💭")
    file_format = st.selectbox("Select File Format", ["PDF", "docx", "xlsx"])
    document = st.file_uploader("Upload Document", type=[file_format.lower()])

    saved_key = st.session_state.get("api_key")
    if not saved_key:
        st.warning("You need to insert your API Key first.")
        return
    if document is None:
        return

    extractors = {
        "PDF": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "xlsx": extract_text_from_excel,
    }
    extractor = extractors.get(file_format)
    text = extractor(document) if extractor is not None else ""

    if show_content:
        st.subheader("Document Text:")
        st.text_area("Content", value=text, height=300)

    # Build the splitter here so the sidebar sliders actually take effect.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = splitter.split_text(text)
    if not chunks:
        # Nothing to embed; building a vector store from no texts would fail.
        st.warning("No text could be extracted from the document.")
        return
    knowledge_base = create_knowledge_base(chunks, api_key=saved_key)

    user_question = st.text_input("Ask a question based on the document content:")

    if user_question:
        # answer_question instantiates the model with only ``model_name``,
        # so bind the saved key here; otherwise the key typed in the
        # sidebar reaches the embeddings but never the language model.
        keyed_model = partial(model, openai_api_key=saved_key)
        response = answer_question(user_question, knowledge_base, keyed_model)
        st.subheader("Answer:")
        st.write(response)

        # Store previous answers
        if "answers" not in st.session_state:
            st.session_state.answers = []
        st.session_state.answers.append((user_question, response))

    if show_answers:
        st.subheader("Previous Answers:")
        # ``get`` avoids an error when the box is ticked before any
        # question has been asked in this session.
        for question, answer in st.session_state.get("answers", []):
            st.write(f"Question: {question}")
            st.write(f"Answer: {answer}")
            st.write("------")
138
+
139
+
140
# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()
142
+
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# PyPI distribution names for the packages imported by the app.
# (`os` is part of the standard library and must not be listed.)
python-dotenv
streamlit
PyPDF2
# Provides langchain.text_splitter, .embeddings, .vectorstores, .chains,
# .llms, .callbacks and .chat_models
langchain
# Runtime dependencies of the LangChain OpenAI and FAISS integrations
openai
tiktoken
faiss-cpu
# Imported as `docx`
python-docx
openpyxl
pdfplumber