- Write a Python notebook that performs semantic search on the vector database and returns the top-k results (use LangChain). Comment on what you observe.

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import os
from langchain.vectorstores import Chroma

In [None]:
# Adapter exposing the embed_documents / embed_query method pair
class SentenceTransformerWrapper:
    """Thin adapter around a ``SentenceTransformer`` model.

    Provides ``embed_documents`` (many texts) and ``embed_query`` (one text);
    both hand back whatever ``.tolist()`` yields on the encoder's output.
    """

    def __init__(self, model_name):
        """Load the sentence-transformers model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode ``texts`` in a single call, with the progress bar enabled.

        Returns the encoder output converted with ``.tolist()``.
        """
        vectors = self.model.encode(texts, show_progress_bar=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode the single query ``text``.

        Returns the encoder output converted with ``.tolist()``.
        """
        vector = self.model.encode(text)
        return vector.tolist()

# Build the embedder used by the later cells; the model id is passed to
# SentenceTransformerWrapper, which loads it via SentenceTransformer.
embedding_model = SentenceTransformerWrapper(
    model_name='bkai-foundation-models/vietnamese-bi-encoder',
)

In [None]:
# Open the persisted Chroma collection stored under "chroma_db_new".
# The constructor's keyword for the embedder is `embedding_function`;
# `embedding` is the parameter name of the `from_documents` / `from_texts`
# factory helpers and is not a constructor parameter.
vector_db = Chroma(
    persist_directory="chroma_db_new",
    embedding_function=embedding_model,  # SentenceTransformerWrapper instance
)

# Number of nearest chunks to retrieve. Kept in one place so the search call
# and the report header cannot drift apart.
top_k = 5

# Run a similarity search for a query typed by the user
query = input("Enter your query: ")
results = vector_db.similarity_search(query, k=top_k)

# Display the results
print(f"\nTop {top_k} results for query: '{query}'\n")
if not results:
    # Make an empty result set visible instead of printing only the header.
    print("No results returned - check that 'chroma_db_new' holds an indexed collection.")
for rank, doc in enumerate(results, start=1):
    print(f"Result {rank}:")
    print(f"Metadata: {doc.metadata}")
    print(f"Content: {doc.page_content[:50]}...")  # Display a preview of the chunk
    print("-" * 50)
