# RAG-PDF-QnA-ChatBot-Perf / PDF_Reader.py
# thigobr's picture
# Add caching
# a0378f2
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
import streamlit as st
@st.cache_data
def read_pdf(uploaded_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        uploaded_file: The PDF source handed to ``PyPDF2.PdfReader``
            (presumably a Streamlit upload object -- confirm against caller).

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # Build the result with a single join instead of growing a string with
    # ``+=`` per page; ``or ""`` keeps the join safe if a page yields no
    # text object.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
@st.cache_data
def Chunks(docs, *, chunk_size=1000, chunk_overlap=100):
    """Split a text into overlapping chunks.

    Args:
        docs: The text to split. It is passed straight to ``split_text``,
            so it is a single string despite the plural name.
        chunk_size: Value forwarded to the splitter as ``chunk_size``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(docs)
@st.cache_resource
def PDF_4_QA(file):
    """Build a FAISS vector store over the text of a PDF.

    The PDF is read with ``read_pdf``, split with ``Chunks``, embedded on the
    CPU with the ``sentence-transformers/all-MiniLM-L6-v2`` model, and indexed
    with ``FAISS.from_texts``.

    Args:
        file: The PDF source, passed unchanged to ``read_pdf``.

    Returns:
        The FAISS vector store built from the PDF's text chunks.

    Raises:
        ValueError: If the PDF yields no text chunks (for example, when it
            contains no extractable text).
    """
    content = read_pdf(file)
    pdf_chunks = Chunks(docs=content)
    # Fail early with a clear message: an index cannot be built from zero
    # texts, and the error raised further down would not explain why.
    if not pdf_chunks:
        raise ValueError(
            "No extractable text found in the PDF; cannot build a vector store."
        )
    # NOTE(review): a BGE-specific embeddings wrapper is used with a non-BGE
    # model; confirm that any BGE query instruction it applies is intended.
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )
    vectorstore = FAISS.from_texts(pdf_chunks, embeddings)
    return vectorstore