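# Streamlit app: upload an invoice PDF, index its text with Haystack, and
# answer a fixed set of invoice-field questions (invoice number, tax IDs,
# seller address, IBAN) with a small RAG pipeline backed by the Hugging Face
# Inference API.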
import os

import streamlit as st
import PyPDF2

from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever  # updated with 2.0
from haystack.components.builders import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import Secret

# Hugging Face API token, read from the environment
api_key = os.getenv("hf_key")
#client = OpenAI(api_key=api_key)
st.write("# Welcome to Invoice Processing! πŸ‘‹")
pdf_path = st.file_uploader("Upload an invoice (PDF)", type='pdf')
def extract_text_from_pdf(uploaded_pdf):
    """Extract text from an uploaded PDF, page by page."""
    # st.file_uploader returns a file-like object, so it can be handed
    # directly to PyPDF2 without opening a path on disk.
    text = ''
    pdf_reader = PyPDF2.PdfReader(uploaded_pdf)
    for page in pdf_reader.pages:
        text += page.extract_text() or ''
    return text
if pdf_path:
    pdf_text = extract_text_from_pdf(pdf_path)
    # Wrap the extracted text in a Haystack Document, keeping the uploaded
    # file name as metadata. Homework: look into the Document data structure (shape).
    doc = Document(content=pdf_text, meta={"pdf_path": pdf_path.name})
    docs = [doc]
    print(doc)
    # Clean and split the document into word-based chunks before indexing.
    cleaner = DocumentCleaner(
        remove_empty_lines=True,
        remove_extra_whitespaces=True)
    preprocessor = DocumentSplitter(
        split_by="word",
        split_length=500,
        split_overlap=0)
    cleaned_docs = cleaner.run(documents=docs)["documents"]
    split_docs = preprocessor.run(documents=cleaned_docs)["documents"]

    # Embed the chunks and write them into an in-memory document store.
    document_store = InMemoryDocumentStore()
    embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
    embedder.warm_up()
    output = embedder.run(documents=split_docs)
    document_store.write_documents(output["documents"])
template = """
Given the following information, answer the question.
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{question}}
Answer:
"""
    # RAG components: prompt builder, embedding retriever over the document
    # store, a Hugging Face Inference API generator, and a query embedder.
    prompt_builder = PromptBuilder(template=template)
    retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)
    generator1 = HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
        token=Secret.from_token(api_key))
    query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
    # Assemble the pipeline: query embedder -> retriever -> prompt builder -> LLM.
    basic_rag_pipeline = Pipeline()
    basic_rag_pipeline.add_component("text_embedder", query_embedder)
    basic_rag_pipeline.add_component("retriever", retriever)
    basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
    basic_rag_pipeline.add_component("llm1", generator1)
    basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
    basic_rag_pipeline.connect("prompt_builder", "llm1")
    # Ask a fixed set of invoice-field questions; for each query the text is
    # embedded, matching chunks are retrieved, and the LLM answers from them.
    question_list = ['invoice number in the document', 'seller tax id in the document',
                     'seller address in the document', 'client tax id in the document',
                     'IBAN no in the document']
    for query in question_list:
        output = basic_rag_pipeline.run({"text_embedder": {"text": query}, "prompt_builder": {"question": query}})
        st.write(query, output['llm1']['replies'])
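# To run locally (assuming this file is saved as app.py): set the Hugging Face
# token in the environment, e.g. `export hf_key=...`, then start the app with
# `streamlit run app.py`.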