Spaces:
Sleeping
Sleeping
cosmetics
Browse files- document_qa/document_qa_engine.py +22 -8
- streamlit_app.py +1 -0
document_qa/document_qa_engine.py
CHANGED
|
@@ -23,7 +23,13 @@ class DocumentQAEngine:
|
|
| 23 |
embeddings_map_from_md5 = {}
|
| 24 |
embeddings_map_to_md5 = {}
|
| 25 |
|
| 26 |
-
def __init__(self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
self.embedding_function = embedding_function
|
| 28 |
self.llm = llm
|
| 29 |
self.chain = load_qa_chain(llm, chain_type=qa_chain_type)
|
|
@@ -81,14 +87,14 @@ class DocumentQAEngine:
|
|
| 81 |
return self.embeddings_map_from_md5[md5]
|
| 82 |
|
| 83 |
def query_document(self, query: str, doc_id, output_parser=None, context_size=4, extraction_schema=None,
|
| 84 |
-
verbose=False) -> (
|
| 85 |
Any, str):
|
| 86 |
# self.load_embeddings(self.embeddings_root_path)
|
| 87 |
|
| 88 |
if verbose:
|
| 89 |
print(query)
|
| 90 |
|
| 91 |
-
response = self._run_query(doc_id, query, context_size=context_size)
|
| 92 |
response = response['output_text'] if 'output_text' in response else response
|
| 93 |
|
| 94 |
if verbose:
|
|
@@ -138,9 +144,15 @@ class DocumentQAEngine:
|
|
| 138 |
|
| 139 |
return parsed_output
|
| 140 |
|
| 141 |
-
def _run_query(self, doc_id, query, context_size=4):
|
| 142 |
relevant_documents = self._get_context(doc_id, query, context_size)
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
# return self.chain({"input_documents": relevant_documents, "question": prompt_chat_template}, return_only_outputs=True)
|
| 145 |
|
| 146 |
def _get_context(self, doc_id, query, context_size=4):
|
|
@@ -150,6 +162,7 @@ class DocumentQAEngine:
|
|
| 150 |
return relevant_documents
|
| 151 |
|
| 152 |
def get_all_context_by_document(self, doc_id):
    """Return every stored text chunk for the document identified by `doc_id`.

    Looks up the per-document vector store in `self.embeddings_dict` and
    returns its 'documents' payload unchanged.
    """
    collection = self.embeddings_dict[doc_id]
    return collection.get()['documents']
|
|
@@ -161,6 +174,7 @@ class DocumentQAEngine:
|
|
| 161 |
return relevant_documents
|
| 162 |
|
| 163 |
def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
|
|
|
|
| 164 |
if verbose:
|
| 165 |
print("File", pdf_file_path)
|
| 166 |
filename = Path(pdf_file_path).stem
|
|
@@ -215,12 +229,11 @@ class DocumentQAEngine:
|
|
| 215 |
self.embeddings_dict[hash] = Chroma.from_texts(texts, embedding=self.embedding_function, metadatas=metadata,
|
| 216 |
collection_name=hash)
|
| 217 |
|
| 218 |
-
|
| 219 |
self.embeddings_root_path = None
|
| 220 |
|
| 221 |
return hash
|
| 222 |
|
| 223 |
-
def create_embeddings(self, pdfs_dir_path: Path):
|
| 224 |
input_files = []
|
| 225 |
for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
|
| 226 |
for file_ in files:
|
|
@@ -238,7 +251,8 @@ class DocumentQAEngine:
|
|
| 238 |
print(data_path, "exists. Skipping it ")
|
| 239 |
continue
|
| 240 |
|
| 241 |
-
texts, metadata, ids = self.get_text_from_document(input_file, chunk_size=
|
|
|
|
| 242 |
filename = metadata[0]['filename']
|
| 243 |
|
| 244 |
vector_db_document = Chroma.from_texts(texts,
|
|
|
|
| 23 |
embeddings_map_from_md5 = {}
|
| 24 |
embeddings_map_to_md5 = {}
|
| 25 |
|
| 26 |
+
def __init__(self,
|
| 27 |
+
llm,
|
| 28 |
+
embedding_function,
|
| 29 |
+
qa_chain_type="stuff",
|
| 30 |
+
embeddings_root_path=None,
|
| 31 |
+
grobid_url=None,
|
| 32 |
+
):
|
| 33 |
self.embedding_function = embedding_function
|
| 34 |
self.llm = llm
|
| 35 |
self.chain = load_qa_chain(llm, chain_type=qa_chain_type)
|
|
|
|
| 87 |
return self.embeddings_map_from_md5[md5]
|
| 88 |
|
| 89 |
def query_document(self, query: str, doc_id, output_parser=None, context_size=4, extraction_schema=None,
|
| 90 |
+
verbose=False, memory=None) -> (
|
| 91 |
Any, str):
|
| 92 |
# self.load_embeddings(self.embeddings_root_path)
|
| 93 |
|
| 94 |
if verbose:
|
| 95 |
print(query)
|
| 96 |
|
| 97 |
+
response = self._run_query(doc_id, query, context_size=context_size, memory=memory)
|
| 98 |
response = response['output_text'] if 'output_text' in response else response
|
| 99 |
|
| 100 |
if verbose:
|
|
|
|
| 144 |
|
| 145 |
return parsed_output
|
| 146 |
|
| 147 |
+
def _run_query(self, doc_id, query, memory=None, context_size=4):
|
| 148 |
relevant_documents = self._get_context(doc_id, query, context_size)
|
| 149 |
+
if memory:
|
| 150 |
+
return self.chain.run(input_documents=relevant_documents,
|
| 151 |
+
question=query)
|
| 152 |
+
else:
|
| 153 |
+
return self.chain.run(input_documents=relevant_documents,
|
| 154 |
+
question=query,
|
| 155 |
+
memory=memory)
|
| 156 |
# return self.chain({"input_documents": relevant_documents, "question": prompt_chat_template}, return_only_outputs=True)
|
| 157 |
|
| 158 |
def _get_context(self, doc_id, query, context_size=4):
|
|
|
|
| 162 |
return relevant_documents
|
| 163 |
|
| 164 |
def get_all_context_by_document(self, doc_id):
    """Return the full context from the document.

    Fetches the vector store registered under `doc_id` and hands back its
    stored 'documents' list as-is.
    """
    stored = self.embeddings_dict[doc_id].get()
    return stored['documents']
|
|
|
|
| 174 |
return relevant_documents
|
| 175 |
|
| 176 |
def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
|
| 177 |
+
"""Extract text from documents using Grobid, if chunk_size is < 0 it keep each paragraph separately"""
|
| 178 |
if verbose:
|
| 179 |
print("File", pdf_file_path)
|
| 180 |
filename = Path(pdf_file_path).stem
|
|
|
|
| 229 |
self.embeddings_dict[hash] = Chroma.from_texts(texts, embedding=self.embedding_function, metadatas=metadata,
|
| 230 |
collection_name=hash)
|
| 231 |
|
|
|
|
| 232 |
self.embeddings_root_path = None
|
| 233 |
|
| 234 |
return hash
|
| 235 |
|
| 236 |
+
def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
|
| 237 |
input_files = []
|
| 238 |
for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
|
| 239 |
for file_ in files:
|
|
|
|
| 251 |
print(data_path, "exists. Skipping it ")
|
| 252 |
continue
|
| 253 |
|
| 254 |
+
texts, metadata, ids = self.get_text_from_document(input_file, chunk_size=chunk_size,
|
| 255 |
+
perc_overlap=perc_overlap)
|
| 256 |
filename = metadata[0]['filename']
|
| 257 |
|
| 258 |
vector_db_document = Chroma.from_texts(texts,
|
streamlit_app.py
CHANGED
|
@@ -97,6 +97,7 @@ def init_qa(model, api_key=None):
|
|
| 97 |
else:
|
| 98 |
st.error("The model was not loaded properly. Try reloading. ")
|
| 99 |
st.stop()
|
|
|
|
| 100 |
|
| 101 |
return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
|
| 102 |
|
|
|
|
| 97 |
else:
|
| 98 |
st.error("The model was not loaded properly. Try reloading. ")
|
| 99 |
st.stop()
|
| 100 |
+
return
|
| 101 |
|
| 102 |
return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
|
| 103 |
|