Spaces:
Running
Running
File size: 2,677 Bytes
7d6c1f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Write a Python notebook that performs semantic search on the vector database and returns the top-k results (using LangChain). Comment on what you observe."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import os\n",
"from langchain.vectorstores import Chroma"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Embedding adapter: wraps a SentenceTransformer behind embed_documents / embed_query\n",
"class SentenceTransformerWrapper:\n",
"    \"\"\"Expose a SentenceTransformer through ``embed_documents`` and ``embed_query``.\n",
"\n",
"    Both methods call ``encode`` on the wrapped model and return the\n",
"    ``.tolist()`` form of whatever ``encode`` produced.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, model_name):\n",
"        \"\"\"Load the SentenceTransformer identified by ``model_name``.\"\"\"\n",
"        self.model = SentenceTransformer(model_name)\n",
"\n",
"    def embed_documents(self, texts):\n",
"        \"\"\"Encode ``texts`` with the progress bar enabled and return the result as a list.\"\"\"\n",
"        document_vectors = self.model.encode(texts, show_progress_bar=True)\n",
"        return document_vectors.tolist()\n",
"\n",
"    def embed_query(self, text):\n",
"        \"\"\"Encode the single query ``text`` and return the result as a list.\"\"\"\n",
"        query_vector = self.model.encode(text)\n",
"        return query_vector.tolist()\n",
"\n",
"# Shared embedder; the Chroma cell further down refers to it as `embedding_model`\n",
"embedding_model = SentenceTransformerWrapper(\n",
"    model_name='bkai-foundation-models/vietnamese-bi-encoder',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Directory the Chroma store is loaded from\n",
"PERSIST_DIRECTORY = \"chroma_db_new\"\n",
"# Number of chunks to retrieve; drives both the search and the printed header\n",
"TOP_K = 5\n",
"# Number of leading characters of each chunk shown as a preview\n",
"PREVIEW_CHARS = 50\n",
"\n",
"# Chroma database\n",
"# The Chroma constructor takes the embedder as `embedding_function`.\n",
"# `embedding` is the keyword of the `from_documents` / `from_texts` factories,\n",
"# so passing it to the constructor raises a TypeError.\n",
"vector_db = Chroma(\n",
"    persist_directory=PERSIST_DIRECTORY,\n",
"    embedding_function=embedding_model,  # SentenceTransformerWrapper instance\n",
")\n",
"\n",
"# Ask for a query and fetch its TOP_K most similar chunks\n",
"query = input(\"Enter your query: \")\n",
"results = vector_db.similarity_search(query, k=TOP_K)\n",
"\n",
"# Display the results\n",
"print(f\"\\nTop {TOP_K} results for query: '{query}'\\n\")\n",
"if not results:\n",
"    # Without this hint an empty result list would leave only the header on screen\n",
"    print(f\"No results returned - check that '{PERSIST_DIRECTORY}' holds an indexed collection.\")\n",
"for rank, doc in enumerate(results, start=1):\n",
"    print(f\"Result {rank}:\")\n",
"    print(f\"Metadata: {doc.metadata}\")\n",
"    print(f\"Content: {doc.page_content[:PREVIEW_CHARS]}...\")  # Display a preview of the chunk\n",
"    print(\"-\" * 50)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "phapdienvv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|