import os

import streamlit as st
import PyPDF2

from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import Secret

api_key = os.getenv("hf_key")
# client = OpenAI(api_key=api_key)
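# NOTE (assumption): the "hf_key" environment variable is expected to hold a
# Hugging Face access token (e.g. a Space secret). It is consumed further down
# by HuggingFaceAPIGenerator via Secret.from_token(api_key); if it is unset,
# api_key is None and the generator cannot authenticate.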
st.write("# Welcome to Invoice Processing! π") | |
pdf_path = st.file_uploader("Upload invoice in pdf", type='pdf') | |


def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page of a PDF."""
    text = ""
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page in pdf_reader.pages:
        text += page.extract_text() or ""
    return text

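# PyPDF2.PdfReader accepts any binary file-like object, so the UploadedFile
# returned by st.file_uploader can be passed in directly without saving it to
# disk first. Note that extract_text() yields an empty string for scanned
# (image-only) pages, which would leave nothing to index.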

if pdf_path:
    pdf_text = extract_text_from_pdf(pdf_path)
    doc = Document(content=pdf_text, meta={"pdf_path": pdf_path.name})  # homework: look into the Document data structure (shape)
    docs = [doc]
    print(doc)
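    # A Haystack 2.x Document is a dataclass with (roughly) these fields:
    #   Document(id=..., content="<extracted invoice text>",
    #            meta={"pdf_path": "<file name>"}, embedding=None, score=None)
    # The embedding field stays None until a document embedder fills it in.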

    cleaner = DocumentCleaner(
        remove_empty_lines=True,
        remove_extra_whitespaces=True)
    preprocessor = DocumentSplitter(
        split_by="word",
        split_length=500,  # 10
        split_overlap=0)
    # Clean and split the raw PDF text into smaller chunks before embedding.
    docs = cleaner.run(documents=docs)["documents"]
    docs = preprocessor.run(documents=docs)["documents"]
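    # Both preprocessors share the same contract: run(documents=[...]) returns
    # {"documents": [...]}, so the cleaner's output feeds straight into the
    # splitter. split_by="word" with split_length=500 yields chunks of up to
    # roughly 500 words each.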

    document_store = InMemoryDocumentStore()
    embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
    embedder.warm_up()
    output = embedder.run(documents=docs)
    document_store.write_documents(output.get("documents"))
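    # The document embedder also returns {"documents": [...]}; each Document now
    # carries a 384-dimensional embedding (all-MiniLM-L6-v2), which is what the
    # InMemoryEmbeddingRetriever compares query embeddings against.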
template = """ | |
Given the following information, answer the question. | |
Context: | |
{% for document in documents %} | |
{{ document.content }} | |
{% endfor %} | |
Question: {{question}} | |
Answer: | |
""" | |

    prompt_builder = PromptBuilder(template=template)
    retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)
    generator1 = HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
        token=Secret.from_token(api_key))
    query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
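    # The generator calls the hosted zephyr-7b-beta model through the Hugging Face
    # serverless Inference API using the token read above. The query embedder must
    # use the same model as the document embedder so query and document vectors
    # live in the same embedding space.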

    basic_rag_pipeline = Pipeline()
    basic_rag_pipeline.add_component("text_embedder", query_embedder)
    basic_rag_pipeline.add_component("retriever", retriever)
    basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
    basic_rag_pipeline.add_component("llm1", generator1)
    basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
    basic_rag_pipeline.connect("prompt_builder", "llm1")

    question_list = ['invoice number in the document', 'seller tax id in the document', 'seller address in the document',
                     'client tax id in the document', 'IBAN no in the document']
    for query in question_list:
        output = basic_rag_pipeline.run({"text_embedder": {"text": query}, "prompt_builder": {"question": query}})
        # Render each extracted field on the Streamlit page.
        st.write(f"**{query}**: {output['llm1']['replies'][0]}")
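    # Each run() result is a dict keyed by component name; the generator exposes
    # its completions as output["llm1"]["replies"] (a list of generated strings),
    # with generation metadata under output["llm1"]["meta"].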