# Streamlit_GPT / app.py
# Annelisseishere's picture
# first commit
# 9ac353e
from dotenv import load_dotenv
import os
import streamlit as st
from PyPDF2 import PdfFileReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI as LLMSOpenAI
from langchain.llms import AzureOpenAI
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from docx import Document
from openpyxl import load_workbook
import pdfplumber
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The page texts joined with newlines. Pages for which no text could
        be extracted contribute an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may yield None (or "") for pages without a text
        # layer, e.g. scanned images; ``or ""`` keeps the join from raising.
        # Joining with "\n" stops the last word of one page from being glued
        # to the first word of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
def extract_text_from_docx(docx_file):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        docx_file: A path or file-like object accepted by ``docx.Document``.

    Returns:
        The text of each paragraph in ``doc.paragraphs`` joined with "\\n".
    """
    parsed = Document(docx_file)
    return "\n".join(para.text for para in parsed.paragraphs)
def extract_text_from_excel(excel_file):
    """Return every non-empty cell value of a workbook, one value per line.

    Sheets are visited in ``workbook.sheetnames`` order and cells in the
    order produced by ``worksheet.iter_rows()``.

    Args:
        excel_file: A path or file-like object accepted by
            ``openpyxl.load_workbook``.

    Returns:
        A string in which each kept cell value is followed by "\\n".
    """
    workbook = load_workbook(excel_file)
    lines = []
    for sheet_name in workbook.sheetnames:
        for row in workbook[sheet_name].iter_rows():
            for cell in row:
                value = cell.value
                # Skip only truly empty cells. A plain truthiness test would
                # also discard legitimate values such as 0, 0.0 and False.
                if value is None or value == "":
                    continue
                lines.append(f"{value}\n")
    return "".join(lines)
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split text into overlapping chunks using a newline separator.

    Args:
        text: The text to split.
        chunk_size: Value passed to ``CharacterTextSplitter`` as
            ``chunk_size``; lengths are measured with ``len``. Defaults to
            1000, the value that used to be hard-coded here.
        chunk_overlap: Value passed to ``CharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 200, the former hard-coded value.

    Returns:
        The list of chunks returned by ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
def create_knowledge_base(chunks, api_key=None):
    """Build a FAISS vector store from text chunks using OpenAI embeddings.

    Args:
        chunks: The texts handed to ``FAISS.from_texts``.
        api_key: Forwarded to ``OpenAIEmbeddings`` as ``openai_api_key``.

    Returns:
        The vector store created by ``FAISS.from_texts``.
    """
    embedder = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_texts(chunks, embedder)
def answer_question(question, knowledge_base, model):
    """Answer a question from the documents most similar to it.

    Args:
        question: The user's question.
        knowledge_base: Object exposing ``similarity_search(question)``.
        model: LLM class; it is instantiated here with
            ``model_name="gpt-3.5-turbo"`` and the API key stored in
            ``st.session_state.api_key``.

    Returns:
        The result of running a "stuff" question-answering chain over the
        retrieved documents.
    """
    relevant_docs = knowledge_base.similarity_search(question)
    qa_chain = load_qa_chain(
        model(model_name="gpt-3.5-turbo", openai_api_key=st.session_state.api_key),
        chain_type="stuff",
    )
    # The chain still runs inside the OpenAI callback context; the callback
    # object itself is not used.
    with get_openai_callback():
        return qa_chain.run(input_documents=relevant_docs, question=question)
def save_api_key(api_key):
    """Store the given API key in the Streamlit session state as ``api_key``."""
    st.session_state["api_key"] = api_key
def main():
    """Render the Streamlit page: settings sidebar, document upload and Q&A.

    The sidebar collects the API key, the language model class, the chunking
    parameters and two display toggles. Once an API key has been saved and a
    document uploaded, the document text is extracted, split into chunks,
    indexed, and used to answer the question typed by the user.
    """
    load_dotenv()
    st.set_page_config(page_title="Ask Your PDF", layout="wide")

    # Sidebar
    st.sidebar.title("Settings")

    # API Key input
    st.sidebar.subheader("API Key")
    api_key = st.sidebar.text_input("Insert your API Key", type="password")
    st.sidebar.button("Save API Key", on_click=save_api_key, args=(api_key,))

    model_type = st.sidebar.selectbox("Select Language Model", ["OpenAI", "AzureOpenAI"])
    model = AzureOpenAI if model_type == "AzureOpenAI" else ChatOpenAI

    chunk_size = st.sidebar.slider("Chunk Size", min_value=500, max_value=2000, value=1000, step=100)
    chunk_overlap = st.sidebar.slider("Chunk Overlap", min_value=100, max_value=500, value=200, step=50)
    show_content = st.sidebar.checkbox("Show Document Content")
    show_answers = st.sidebar.checkbox("Show Previous Answers")

    # Main content
    st.title("Ask Your Document 💭")
    file_format = st.selectbox("Select File Format", ["PDF", "docx", "xlsx"])
    document = st.file_uploader("Upload Document", type=[file_format.lower()])

    if "api_key" not in st.session_state or not st.session_state.api_key:
        st.warning("You need to insert your API Key first.")
        return
    if document is None:
        return

    extractors = {
        "PDF": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "xlsx": extract_text_from_excel,
    }
    extractor = extractors.get(file_format)
    text = extractor(document) if extractor is not None else ""

    if show_content:
        st.subheader("Document Text:")
        st.text_area("Content", value=text, height=300)

    # Use the slider values: previously they were read but never applied, so
    # chunking always ran with the hard-coded 1000/200 defaults.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = splitter.split_text(text)
    if not chunks:
        # Nothing to embed (e.g. a scanned PDF or an empty sheet); building
        # an index from zero texts would fail, so stop with a clear message.
        st.warning("No text could be extracted from this document.")
        return

    # NOTE: the index is rebuilt on every script run.
    knowledge_base = create_knowledge_base(chunks, api_key=st.session_state.api_key)

    user_question = st.text_input("Ask a question based on the document content:")
    if not user_question:
        return

    response = answer_question(user_question, knowledge_base, model)
    st.subheader("Answer:")
    st.write(response)

    # Store and display previous answers
    if "answers" not in st.session_state:
        st.session_state.answers = []
    history = st.session_state.answers
    # Streamlit re-executes this script on every widget interaction, so the
    # same question would otherwise be appended again on each rerun (for
    # example when toggling a sidebar checkbox).
    if not history or history[-1][0] != user_question:
        history.append((user_question, response))

    if show_answers:
        st.subheader("Previous Answers:")
        for question, answer in history:
            st.write(f"Question: {question}")
            st.write(f"Answer: {answer}")
            st.write("------")
# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()