devjas1
committed on
Commit
·
593e022
1
Parent(s):
b3a4795
(FEAT)[Implement document search functionality]: enhance the search_documents function to load FAISS index and metadata, enabling semantic document retrieval.
Browse files
- src/retriever.py +62 -5
src/retriever.py
CHANGED
@@ -1,16 +1,73 @@
|
|
1 |
"""
|
2 |
Retriever module for semantic document search using FAISS.
|
3 |
-
|
4 |
Provides functions to perform similarity-based lookups over embedded document vectors.
|
5 |
Integrates with FAISS for efficient vector search and returns relevant document matches.
|
6 |
"""
|
7 |
|
|
|
|
|
8 |
import faiss
|
9 |
-
|
10 |
|
11 |
|
12 |
def search_documents(query: str, config: dict):
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""
|
2 |
Retriever module for semantic document search using FAISS.
|
|
|
3 |
Provides functions to perform similarity-based lookups over embedded document vectors.
|
4 |
Integrates with FAISS for efficient vector search and returns relevant document matches.
|
5 |
"""
|
6 |
|
7 |
+
import os
|
8 |
+
import pickle
|
9 |
import faiss
|
10 |
+
from sentence_transformers import SentenceTransformer
|
11 |
|
12 |
|
13 |
def search_documents(query: str, config: dict):
    """
    Search for semantically similar documents using a FAISS index.

    Loads the FAISS index and pickled metadata from ``vector_cache/``,
    embeds the query with the configured SentenceTransformer model, and
    returns the top matches above the similarity threshold.

    Args:
        query (str): Search query.
        config (dict): Configuration dictionary; reads
            ``embedding.model_path``, and optionally ``retrieval.top_k``
            (default 5) and ``retrieval.similarity_threshold``
            (default 0.75).

    Returns:
        list: Formatted match strings with similarity scores; a single
        "no matches" message when nothing clears the threshold; a single
        "Search failed" message on error; ``[]`` when no index exists.
    """
    index_path = "vector_cache/faiss_index.bin"
    metadata_path = "vector_cache/metadata.pkl"

    # Bail out early if the index was never built.
    if not os.path.exists(index_path):
        print("No FAISS index found. Please run 'init' command first.")
        return []

    try:
        # Load FAISS index and metadata
        index = faiss.read_index(index_path)

        with open(metadata_path, "rb") as f:
            metadata = pickle.load(f)

        texts = metadata["texts"]
        filenames = metadata["filenames"]

        # Embed the query. L2-normalizing makes an inner-product index
        # behave as cosine similarity — assumes the stored vectors were
        # normalized the same way by 'init' (TODO confirm).
        model = SentenceTransformer(config["embedding"]["model_path"])
        query_embedding = model.encode([query]).astype("float32")
        faiss.normalize_L2(query_embedding)

        # Retrieval knobs, hoisted so the config is only walked once.
        retrieval_cfg = config.get("retrieval", {})
        top_k = retrieval_cfg.get("top_k", 5)
        similarity_threshold = retrieval_cfg.get("similarity_threshold", 0.75)

        scores, indices = index.search(query_embedding, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads `indices` with -1 when the index holds fewer
            # than top_k vectors; texts[-1]/filenames[-1] would silently
            # return the wrong document, so skip the padding entries.
            if idx < 0:
                continue
            if score >= similarity_threshold:
                results.append(
                    f"[{filenames[idx]}] (score: {score:.3f}): {texts[idx][:200]}..."
                )
            else:
                # Scores come back best-first (assumes a similarity
                # metric, e.g. IndexFlatIP — TODO confirm), so the first
                # sub-threshold score ends the scan.
                break

        if not results:
            results.append(f"No matches found above threshold {similarity_threshold}")

        return results

    except (
        FileNotFoundError,
        pickle.UnpicklingError,
        KeyError,
        ValueError,
    ) as e:
        # Degrade gracefully: report the failure as a result message
        # rather than raising into the caller.
        print(f"Error during search: {e}")
        return [f"Search failed: {e}"]
|