Updated for test storage module, plus a preliminary generalized approach to multiple data sources
Files changed:
- app/main.py  +11 -2
- app/retriever.py  +27 -62
- app/vectorstore_interface.py  +89 -0
- params.cfg  +9 -5
- requirements.txt  +3 -1
app/main.py
CHANGED

@@ -1,5 +1,14 @@
 import gradio as gr
-from .retriever import retrieve_context
+from .retriever import retrieve_context, get_vectorstore
+
+# Initialize vector store at startup
+print("Initializing vector store connection...")
+try:
+    vectorstore = get_vectorstore()
+    print("Vector store connection initialized successfully")
+except Exception as e:
+    print(f"Failed to initialize vector store: {e}")
+    raise
 
 # ---------------------------------------------------------------------
 # Gradio Interface with MCP support
@@ -78,7 +87,7 @@ ui = gr.Interface(
 if __name__ == "__main__":
     ui.launch(
         server_name="0.0.0.0",
-        server_port=7860,
+        server_port=7860,  # Different port from reranker
         mcp_server=True,
         show_error=True
     )
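For a quick sanity check of the relaunched app, something like the sketch below should work once it is running; the /predict endpoint name is an assumption based on gr.Interface defaults (the commit does not show the Interface definition), and the local URL mirrors the launch settings above.

# Hypothetical smoke test, not part of this commit. Assumes the app is
# running locally with the launch settings above and that gr.Interface
# exposes its default /predict endpoint.
from gradio_client import Client

client = Client("http://localhost:7860")
result = client.predict("example audit query", api_name="/predict")
print(result)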
app/retriever.py
CHANGED

@@ -2,6 +2,7 @@ from typing import List, Dict, Any, Optional
 from qdrant_client.http import models as rest
 from langchain.schema import Document
 from .utils import getconfig
+from .vectorstore_interface import create_vectorstore, VectorStoreInterface
 import logging
 
 # Load configuration
@@ -11,6 +12,20 @@ config = getconfig("params.cfg")
 RETRIEVER_TOP_K = int(config.get("retriever", "TOP_K"))
 SCORE_THRESHOLD = float(config.get("retriever", "SCORE_THRESHOLD"))
 
+# Initialize vector store connection at module import time
+logging.info("Initializing vector store connection...")
+vectorstore = create_vectorstore(config)
+logging.info("Vector store connection initialized successfully")
+
+def get_vectorstore() -> VectorStoreInterface:
+    """
+    Return the pre-initialized vector store connection.
+
+    Returns:
+        VectorStoreInterface instance
+    """
+    return vectorstore
+
 def create_filter(
     reports: List[str] = None,
     sources: str = None,
@@ -74,37 +89,9 @@ def create_filter(
         return rest.Filter(must=conditions)
     return None
 
-def get_vectorstore():
-    """
-    Initialize and return the vectorstore connection.
-    This function should be implemented based on your specific vectorstore setup.
-
-    Returns:
-        Vectorstore instance (e.g., Qdrant, Pinecone, etc.)
-    """
-    # TODO: Implement based on your external vector database
-    # Example for Qdrant:
-    # from langchain_community.vectorstores import Qdrant
-    # from qdrant_client import QdrantClient
-    #
-    # client = QdrantClient(
-    #     host=config.get("vectorstore", "HOST"),
-    #     port=config.get("vectorstore", "PORT"),
-    #     api_key=config.get("vectorstore", "API_KEY", fallback=None)
-    # )
-    #
-    # vectorstore = Qdrant(
-    #     client=client,
-    #     collection_name=config.get("vectorstore", "COLLECTION_NAME"),
-    #     embeddings=your_embedding_model  # You'll need to configure this
-    # )
-    #
-    # return vectorstore
-
-    raise NotImplementedError("Please implement vectorstore connection based on your setup")
-
 def retrieve_context(
     query: str,
+    vectorstore,
     reports: List[str] = None,
     sources: str = None,
     subtype: str = None,
@@ -116,6 +103,7 @@ def retrieve_context(
 
     Args:
         query: The search query
+        vectorstore: Pre-initialized vector store instance
        reports: List of specific report filenames to search within
        sources: Source type to filter by
        subtype: Document subtype to filter by
@@ -126,48 +114,25 @@ def retrieve_context(
         List of dictionaries with 'page_content' and 'metadata' keys
     """
     try:
-        #
-        vectorstore = get_vectorstore()
-
-        # Create metadata filter
-        filter_obj = create_filter(
-            reports=reports or [],
-            sources=sources,
-            subtype=subtype,
-            year=year or []
-        )
-
-        # Set up search parameters
+        # Use the passed vector store instead of calling get_vectorstore()
         k = top_k or RETRIEVER_TOP_K
+
+        # For Hugging Face Spaces, we pass the model name from config
         search_kwargs = {
-            "score_threshold": SCORE_THRESHOLD,
-            "k": k
+            "model_name": config.get("embeddings", "MODEL_NAME")
         }
 
-
-
-
-
-        retriever = vectorstore.as_retriever(
-            search_type="similarity_score_threshold",
-            search_kwargs=search_kwargs
-        )
+        # Note: Filtering is currently limited for Hugging Face Spaces
+        # as the API doesn't expose filtering capabilities
+        if any([reports, sources, subtype, year]):
+            logging.warning("Filtering not supported for Hugging Face Spaces API")
 
         # Perform retrieval
-        retrieved_docs = retriever.get_relevant_documents(query)
+        retrieved_docs = vectorstore.search(query, k, **search_kwargs)
 
         logging.info(f"Retrieved {len(retrieved_docs)} documents for query: {query[:50]}...")
 
-
-        results = [
-            {
-                "page_content": doc.page_content,
-                "metadata": doc.metadata
-            }
-            for doc in retrieved_docs
-        ]
-
-        return results
+        return retrieved_docs
 
     except Exception as e:
         logging.error(f"Error during retrieval: {str(e)}")
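Since retrieve_context now takes the vector store as its second argument rather than creating one per call, the call pattern changes accordingly. A minimal sketch (the query string is illustrative, not from the commit):

# Illustrative call pattern for the new signature.
from app.retriever import get_vectorstore, retrieve_context

vs = get_vectorstore()  # returns the module-level connection created at import time
docs = retrieve_context("energy audit findings", vs, top_k=5)
print(docs)  # shape depends on what the Space's /search_text endpoint returns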
app/vectorstore_interface.py
ADDED

@@ -0,0 +1,89 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Optional
+from gradio_client import Client
+import logging
+import os
+import time
+
+class VectorStoreInterface(ABC):
+    """Abstract interface for different vector store implementations."""
+
+    @abstractmethod
+    def search(self, query: str, top_k: int, **kwargs) -> List[Dict[str, Any]]:
+        """Search for similar documents."""
+        pass
+
+class HuggingFaceSpacesVectorStore(VectorStoreInterface):
+    """Vector store implementation for Hugging Face Spaces with MCP endpoints."""
+
+    def __init__(self, space_url: str, collection_name: str, hf_token: Optional[str] = None):
+        token = os.getenv("HF_TOKEN")
+        repo_id = space_url
+
+        logging.info(f"Connecting to Hugging Face Space: {repo_id}")
+
+        if token:
+            self.client = Client(repo_id, hf_token=token)
+        else:
+            self.client = Client(repo_id)
+
+        self.collection_name = collection_name
+
+    def search(self, query: str, top_k: int, **kwargs) -> List[Dict[str, Any]]:
+        """Search using Hugging Face Spaces MCP API."""
+        try:
+            # Use the /search_text endpoint as documented in the API
+            result = self.client.predict(
+                query=query,
+                collection_name=self.collection_name,
+                model_name=kwargs.get('model_name'),
+                top_k=top_k,
+                api_name="/search_text"
+            )
+
+            logging.info(f"Successfully retrieved {len(result) if result else 0} documents")
+            return result
+
+        except Exception as e:
+            logging.error(f"Error searching Hugging Face Spaces: {str(e)}")
+            raise e
+
+# class QdrantVectorStore(VectorStoreInterface):
+#     """Vector store implementation for direct Qdrant connection."""
+#     # needs to be generalized for other vector stores (or add a new class for each vector store)
+#     def __init__(self, host: str, port: int, collection_name: str, api_key: Optional[str] = None):
+#         from qdrant_client import QdrantClient
+#         from langchain_community.vectorstores import Qdrant
+
+#         self.client = QdrantClient(
+#             host=host,
+#             port=port,
+#             api_key=api_key
+#         )
+#         self.collection_name = collection_name
+#         # Embedding model not implemented
+
+#     def search(self, query: str, top_k: int, **kwargs) -> List[Dict[str, Any]]:
+#         """Search using direct Qdrant connection."""
+#         # Embedding model not implemented
+#         raise NotImplementedError("Direct Qdrant search needs embedding model configuration")
+
+def create_vectorstore(config: Any) -> VectorStoreInterface:
+    """Factory function to create appropriate vector store based on configuration."""
+    vectorstore_type = config.get("vectorstore", "TYPE")
+
+    if vectorstore_type.lower() == "huggingface_spaces":
+        space_url = config.get("vectorstore", "SPACE_URL")
+        collection_name = config.get("vectorstore", "COLLECTION_NAME")
+        hf_token = config.get("vectorstore", "HF_TOKEN", fallback=None)
+        return HuggingFaceSpacesVectorStore(space_url, collection_name, hf_token)
+
+    elif vectorstore_type.lower() == "qdrant":
+        host = config.get("vectorstore", "HOST")
+        port = int(config.get("vectorstore", "PORT"))
+        collection_name = config.get("vectorstore", "COLLECTION_NAME")
+        api_key = config.get("vectorstore", "API_KEY", fallback=None)
+        return QdrantVectorStore(host, port, collection_name, api_key)
+
+    else:
+        raise ValueError(f"Unsupported vector store type: {vectorstore_type}")
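Two things worth flagging: the qdrant branch of create_vectorstore references QdrantVectorStore, which is still commented out, so TYPE = qdrant currently raises NameError rather than the intended NotImplementedError; and HuggingFaceSpacesVectorStore accepts an hf_token argument but actually reads the token from the HF_TOKEN environment variable. The ABC also makes it easy to drop in a test double for the "test storage module" angle of this commit; the stub below is a hypothetical sketch of mine, not part of the commit.

# Hypothetical in-memory stub implementing the new interface, useful for
# exercising retrieve_context without a live Space. Substring matching
# stands in for vector similarity; this is a test double, not a real store.
from typing import Any, Dict, List
from app.vectorstore_interface import VectorStoreInterface

class InMemoryVectorStore(VectorStoreInterface):
    """Returns canned documents whose content contains the query string."""

    def __init__(self, docs: List[Dict[str, Any]]):
        self.docs = docs

    def search(self, query: str, top_k: int, **kwargs) -> List[Dict[str, Any]]:
        hits = [d for d in self.docs if query.lower() in d["page_content"].lower()]
        return hits[:top_k]

store = InMemoryVectorStore([{"page_content": "solar audit 2021", "metadata": {"year": 2021}}])
print(store.search("solar", top_k=3))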
params.cfg
CHANGED

@@ -3,11 +3,15 @@ TOP_K = 10
 SCORE_THRESHOLD = 0.6
 
 [vectorstore]
-TYPE =
-
-
-
-#
+TYPE = huggingface_spaces
+SPACE_URL = GIZ/audit_data
+COLLECTION_NAME = docling
+# For future direct Qdrant usage:
+# TYPE = qdrant
+# HOST = ip address
+# PORT = 6333
+# COLLECTION_NAME = "collection name"
+# API_KEY = api key for source
 
 [embeddings]
 MODEL_NAME = BAAI/bge-m3
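Assuming getconfig wraps a standard ConfigParser (the config.get(section, option, fallback=...) calls in this commit suggest it does), the new section can be sanity-checked directly:

# Quick parse check for the new [vectorstore] section (illustrative;
# assumes params.cfg is in the working directory).
import configparser

cfg = configparser.ConfigParser()
cfg.read("params.cfg")
assert cfg.get("vectorstore", "TYPE") == "huggingface_spaces"
print(cfg.get("vectorstore", "SPACE_URL"))   # GIZ/audit_data
print(cfg.get("embeddings", "MODEL_NAME"))   # BAAI/bge-m3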
requirements.txt
CHANGED

@@ -2,4 +2,6 @@ gradio[mcp]
 langchain
 langchain-community
 qdrant-client
-sentence-transformers
+sentence-transformers
+gradio_client>=0.10.0
+huggingface_hub>=0.20.0