Spaces:
Runtime error
Runtime error
| # vectordb_relank_law.py | |
| import faiss | |
| import numpy as np | |
| import os | |
| from chromadb import PersistentClient | |
| from chromadb.utils import embedding_functions | |
| from sentence_transformers import SentenceTransformer | |
| from retriever.reranker import rerank_documents | |
| from constants.embedding_models import embedding_models | |
# --- Chroma vector store configuration (law_db, v2) ---
# On-disk location of the persisted Chroma index; resolved against the
# current working directory at import time.
CHROMA_PATH = os.path.abspath("data/index/law_db")
COLLECTION_NAME = "law_all"
# Pick which of the configured embedding models to use.
EMBEDDING_MODEL_NAME = embedding_models[1]

# 1. Load the embedding model (v2).
# Previously hard-coded to "sentence-transformers/all-MiniLM-L6-v2".
# NOTE(review): `embedding_model` is not referenced anywhere in the visible
# code, and the embedding function below is built from the same model name,
# so the model is presumably loaded twice -- confirm whether other modules
# import this name before removing it.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# 2. Embedding function Chroma uses to embed query texts.
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME,
)

# 3. Open the persistent Chroma client and fetch the existing collection.
client = PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_fn,
)
| # 4. Search function | |
def search_documents(query: str, top_k: int = 5) -> list:
    """Search the Chroma collection for *query* and return reranked hits.

    The collection is queried for the ``top_k`` nearest documents, the
    document texts are passed through ``rerank_documents``, and each
    reranked document is paired back up with the metadata and distance it
    was retrieved with.

    Args:
        query: Free-text search query.
        top_k: Number of results requested from the collection and passed
            on to the reranker.

    Returns:
        A list of ``(document, metadata, distance)`` tuples in the order
        produced by the reranker. Empty when the collection returns no
        documents.

    Raises:
        ValueError: If the reranker returns a document that was not among
            the retrieved documents.
    """
    print(f"\nπ κ²μμ΄: '{query}'")

    results = collection.query(
        query_texts=[query],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    # Every field holds one list per query text; a single query was sent,
    # so only the first entry is relevant.
    docs = results['documents'][0]
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    # Nothing retrieved: skip the reranker call, there is nothing to rank.
    if not docs:
        return []

    # presumably returns a reordered subset of `docs` -- verify against
    # retriever.reranker.
    reranked_docs = rerank_documents(query, docs, top_k=top_k)

    # Record every position at which each document text was retrieved.
    # Looking positions up with `docs.index(doc)` would always yield the
    # first occurrence, so duplicate texts would all be paired with the
    # first copy's metadata and distance.
    positions = {}
    for idx, doc in enumerate(docs):
        positions.setdefault(doc, []).append(idx)

    reranked_data = []
    for doc in reranked_docs:
        candidates = positions.get(doc)
        if not candidates:
            # Same exception type `list.index` raised for an unknown item.
            raise ValueError(f"{doc!r} is not in list")
        # Hand out duplicate positions in retrieval order; once only one
        # position is left, keep reusing it so a reranker that repeats a
        # document still resolves (lists hold at most `top_k` entries, so
        # pop(0) is cheap here).
        idx = candidates.pop(0) if len(candidates) > 1 else candidates[0]
        reranked_data.append((doc, metadatas[idx], distances[idx]))

    return reranked_data