from dotenv import load_dotenv
import os
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import AzureOpenAI
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from docx import Document
from openpyxl import load_workbook
import pdfplumber
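# Streamlit app: upload a PDF, Word (.docx), or Excel (.xlsx) document, split its
# text into chunks, index the chunks in a FAISS vector store with OpenAI
# embeddings, and answer questions about the content via a LangChain QA chain.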


def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            # extract_text() returns None for pages with no extractable text
            text += page.extract_text() or ""
    return text


def extract_text_from_docx(docx_file):
    doc = Document(docx_file)
    paragraphs = [paragraph.text for paragraph in doc.paragraphs]
    return "\n".join(paragraphs)


def extract_text_from_excel(excel_file):
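    # Collect every non-empty cell value across all worksheets, one value per line.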
    workbook = load_workbook(excel_file)
    text = ""
    for sheet in workbook.sheetnames:
        worksheet = workbook[sheet]
        for row in worksheet.iter_rows():
            for cell in row:
                if cell.value:
                    text += str(cell.value) + "\n"
    return text


def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return text_splitter.split_text(text)


def create_knowledge_base(chunks, api_key=None):
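    # Embed each chunk with OpenAI embeddings and index the vectors in an
    # in-memory FAISS store for similarity search.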
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    knowledge_base = FAISS.from_texts(chunks, embeddings)
    return knowledge_base


def answer_question(question, knowledge_base, model):
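    # Retrieve the chunks most similar to the question from the FAISS index,
    # then run a "stuff" QA chain over them with the selected LLM.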
    docs = knowledge_base.similarity_search(question)
    llm = model(model_name="gpt-3.5-turbo", openai_api_key=st.session_state.api_key)
    chain = load_qa_chain(llm, chain_type="stuff")
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=question)
    return response


def save_api_key(api_key):
    st.session_state.api_key = api_key


def main():
    load_dotenv()
    st.set_page_config(page_title="Ask Your Document", layout="wide")

    # Sidebar
    st.sidebar.title("Settings")

    # API Key input
    st.sidebar.subheader("API Key")
    api_key = st.sidebar.text_input("Insert your API Key", type="password")
    st.sidebar.button("Save API Key", on_click=save_api_key, args=(api_key,))

    model_type = st.sidebar.selectbox("Select Language Model", ["OpenAI", "AzureOpenAI"])
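    # Note: the AzureOpenAI backend also expects Azure endpoint/deployment settings
    # (typically supplied via environment variables) in addition to the API key above.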
    if model_type == "AzureOpenAI":
        model = AzureOpenAI
    else:
        model = ChatOpenAI

    chunk_size = st.sidebar.slider("Chunk Size", min_value=500, max_value=2000, value=1000, step=100)
    chunk_overlap = st.sidebar.slider("Chunk Overlap", min_value=100, max_value=500, value=200, step=50)
    show_content = st.sidebar.checkbox("Show Document Content")
    show_answers = st.sidebar.checkbox("Show Previous Answers")

    # Main content
    st.title("Ask Your Document 💭")
    file_format = st.selectbox("Select File Format", ["PDF", "docx", "xlsx"])
    document = st.file_uploader("Upload Document", type=[file_format.lower()])

    if not st.session_state.get("api_key"):
        st.warning("You need to insert your API Key first.")
    elif document is not None:
        if file_format == "PDF":
            text = extract_text_from_pdf(document)
        elif file_format == "docx":
            text = extract_text_from_docx(document)
        elif file_format == "xlsx":
            text = extract_text_from_excel(document)
        else:
            text = ""

        if show_content:
            st.subheader("Document Text:")
            st.text_area("Content", value=text, height=300)

        chunks = split_text_into_chunks(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        knowledge_base = create_knowledge_base(chunks, api_key=st.session_state.api_key)

        user_question = st.text_input("Ask a question based on the document content:")

        if user_question:
            response = answer_question(user_question, knowledge_base, model)
            st.subheader("Answer:")
            st.write(response)

            # Store and display previous answers
            if "answers" not in st.session_state:
                st.session_state.answers = []
            st.session_state.answers.append((user_question, response))

        if show_answers and "answers" in st.session_state:
            st.subheader("Previous Answers:")
            for question, answer in st.session_state.answers:
                st.write(f"Question: {question}")
                st.write(f"Answer: {answer}")
                st.write("------")


if __name__ == '__main__':
    main()