devjas1 committed
Commit 593e022 · 1 Parent(s): b3a4795

(FEAT)[Implement document search functionality]: enhance the search_documents function to load the FAISS index and metadata, enabling semantic document retrieval.
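The function is driven entirely by the config dict read in the diff below. A minimal calling sketch, assuming a repo-root import path and a placeholder embedding model (both illustrative, not taken from this commit):

    from src.retriever import search_documents

    # Hypothetical config; the keys mirror those read by search_documents.
    config = {
        "embedding": {"model_path": "sentence-transformers/all-MiniLM-L6-v2"},  # assumed model
        "retrieval": {"top_k": 5, "similarity_threshold": 0.75},  # defaults used in the code
    }

    for match in search_documents("how are documents embedded?", config):
        print(match)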

Files changed (1)
  1. src/retriever.py +62 -5
src/retriever.py CHANGED
@@ -1,16 +1,73 @@
 """
 Retriever module for semantic document search using FAISS.
-
 Provides functions to perform similarity-based lookups over embedded document vectors.
 Integrates with FAISS for efficient vector search and returns relevant document matches.
 """
 
+import os
+import pickle
 import faiss
-import numpy as np
+from sentence_transformers import SentenceTransformer
 
 
 def search_documents(query: str, config: dict):
-    # TODO: Load FAISS index and metadata
-    # For now simulate with dummy results
+    """
+    Search for semantically similar documents using FAISS index.
+
+    Args:
+        query (str): Search query
+        config (dict): Configuration dictionary
+
+    Returns:
+        list: List of relevant text chunks with similarity scores
+    """
+    # Check if FAISS index exists
+    if not os.path.exists("vector_cache/faiss_index.bin"):
+        print("No FAISS index found. Please run 'init' command first.")
+        return []
+
+    try:
+        # Load FAISS index and metadata
+        index = faiss.read_index("vector_cache/faiss_index.bin")
+
+        with open("vector_cache/metadata.pkl", "rb") as f:
+            metadata = pickle.load(f)
+
+        texts = metadata["texts"]
+        filenames = metadata["filenames"]
+
+        # Embed the query
+        model = SentenceTransformer(config["embedding"]["model_path"])
+        query_embedding = model.encode([query]).astype("float32")
+        faiss.normalize_L2(query_embedding)
+
+        # Search similar documents
+        top_k = config.get("retrieval", {}).get("top_k", 5)
+        similarity_threshold = config.get("retrieval", {}).get(
+            "similarity_threshold", 0.75
+        )
+
+        scores, indices = index.search(query_embedding, top_k)
+
+        results = []
+        for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
+            if score >= similarity_threshold:
+                results.append(
+                    f"[{filenames[idx]}] (score: {score:.3f}): {texts[idx][:200]}..."
+                )
+            else:
+                break
+
+        if not results:
+            results.append(f"No matches found above threshold {similarity_threshold}")
+
+        return results
 
-    return [f"Match for '{query}' in file1.py", f"Match in utils.py"]
+    except (
+        FileNotFoundError,
+        pickle.UnpicklingError,
+        KeyError,
+        ValueError,
+    ) as e:
+        print(f"Error during search: {e}")
+        return [f"Search failed: {e}"]