Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -167,7 +167,7 @@ def select_embeddings_model(LLM_service="HuggingFace"):
     return embeddings
 
 #embeddings_OpenAI = select_embeddings_model(LLM_service="OpenAI")
-
+embeddings_google = select_embeddings_model(LLM_service="Google")
 embeddings_HuggingFace = select_embeddings_model(LLM_service="HuggingFace")
 
 
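The added line depends on select_embeddings_model, which is defined earlier in app.py and is not part of this diff. For orientation, a minimal sketch of what such a dispatcher typically looks like in LangChain follows; the _sketch suffix and the model names are illustrative assumptions, not the Space's actual code:

    # Hypothetical sketch of the embeddings dispatcher this diff calls into.
    # Model names are common defaults; API keys are read from the environment.
    from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
    from langchain_google_genai import GoogleGenerativeAIEmbeddings

    def select_embeddings_model_sketch(LLM_service="HuggingFace"):
        if LLM_service == "OpenAI":
            return OpenAIEmbeddings()  # requires OPENAI_API_KEY
        if LLM_service == "Google":
            return GoogleGenerativeAIEmbeddings(model="models/embedding-001")  # requires GOOGLE_API_KEY
        return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")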
@@ -219,7 +219,7 @@ vector_store_OpenAI = Chroma(
 print("vector_store_OpenAI:",vector_store_OpenAI._collection.count(),"chunks.")
 """
 vector_store_google = Chroma(
-    persist_directory =
+    persist_directory = current_dir + "/Vit_All_Google_Embeddings",
     embedding_function=embeddings_google)
 print("vector_store_google:",vector_store_google._collection.count(),"chunks.")
 
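Together with the embeddings_google line added in the first hunk, this completes the Google vector store setup: the old line 222 left persist_directory without a value, which is a syntax error inside the Chroma(...) call and would crash the Space on startup. Note that loading a persisted Chroma index takes both arguments, since Chroma stores the vectors on disk but still needs the embedding function at query time. A minimal load sketch, using the directory name from the hunk above:

    # Loading an existing Chroma index; the embeddings must match those
    # used when the index was built.
    from langchain.vectorstores import Chroma

    vector_store_google = Chroma(
        persist_directory=current_dir + "/Vit_All_Google_Embeddings",
        embedding_function=embeddings_google,
    )
    print("vector_store_google:", vector_store_google._collection.count(), "chunks.")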
@@ -327,6 +327,117 @@ def CohereRerank_retriever(
 
 
 
+
+def retrieval_blocks(
+    create_new_vectorstore=True,  # if True a Chroma vectorstore is created, else an existing one is loaded
+    LLM_service="HuggingFace",
+    vectorstore_name="Vit_All_HF_Embeddings",
+    chunk_size=1600, chunk_overlap=200,  # parameters of the RecursiveCharacterTextSplitter
+    retriever_type="Vectorstore_backed_retriever",
+    base_retriever_search_type="similarity", base_retriever_k=10, base_retriever_score_threshold=None,
+    compression_retriever_k=16,
+    cohere_api_key="***", cohere_model="rerank-multilingual-v2.0", cohere_top_n=8,
+):
+    """
+    Retrieval includes: document loaders, text splitter, vectorstore and retriever.
+
+    Parameters:
+        create_new_vectorstore (boolean): If True, a new Chroma vectorstore is created.
+            Otherwise, an existing vectorstore is loaded. (Named so the flag does not
+            shadow the create_vectorstore() helper called below.)
+        LLM_service: OpenAI, Google or HuggingFace.
+        vectorstore_name (str): the name of the vectorstore.
+        chunk_size and chunk_overlap: parameters of the RecursiveCharacterTextSplitter, default = (1600, 200).
+
+        retriever_type (str): one of [Vectorstore_backed_retriever, Contextual_compression, Cohere_reranker].
+
+        base_retriever_search_type: search_type in ["similarity", "mmr", "similarity_score_threshold"], default = similarity.
+        base_retriever_k: the number of most similar vectors to retrieve, default k = 10.
+        base_retriever_score_threshold: score_threshold used by the base retriever, default = None.
+
+        compression_retriever_k: top k documents returned by the compression retriever, default = 16.
+
+        cohere_api_key: Cohere API key.
+        cohere_model (str): either 'rerank-english-v2.0' or 'rerank-multilingual-v2.0' (the default).
+        cohere_top_n: top n results returned by Cohere rerank, default = 8.
+
+    Output:
+        retriever.
+    """
+    try:
+        # Create a new vectorstore (Chroma index)
+        if create_new_vectorstore:
+            # 1. Load documents
+            documents = langchain_document_loader(current_dir)
+
+            # 2. Text splitter: split documents into chunks
+            text_splitter = RecursiveCharacterTextSplitter(
+                separators=["\n\n", "\n", " ", ""],
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap
+            )
+            chunks = text_splitter.split_documents(documents=documents)
+
+            # 3. Embeddings
+            embeddings = select_embeddings_model(LLM_service=LLM_service)
+
+            # 4. Vectorstore: create Chroma index
+            vector_store = create_vectorstore(
+                embeddings=embeddings,
+                documents=chunks,
+                vectorstore_name=vectorstore_name,
+            )
+
+        # 5. Load an existing vectorstore (Chroma index)
+        else:
+            embeddings = select_embeddings_model(LLM_service=LLM_service)
+            vector_store = Chroma(
+                persist_directory=current_dir + "/" + vectorstore_name,
+                embedding_function=embeddings
+            )
+
+        # 6. Base retriever: vector store-backed retriever
+        base_retriever = Vectorstore_backed_retriever(
+            vector_store,
+            search_type=base_retriever_search_type,
+            k=base_retriever_k,
+            score_threshold=base_retriever_score_threshold
+        )
+        retriever = None
+        if retriever_type == "Vectorstore_backed_retriever":
+            retriever = base_retriever
+
+        # 7. Contextual compression retriever
+        if retriever_type == "Contextual_compression":
+            retriever = create_compression_retriever(
+                embeddings=embeddings,
+                base_retriever=base_retriever,
+                k=compression_retriever_k,
+            )
+
+        # 8. CohereRerank retriever
+        if retriever_type == "Cohere_reranker":
+            retriever = CohereRerank_retriever(
+                base_retriever=base_retriever,
+                cohere_api_key=cohere_api_key,
+                cohere_model=cohere_model,
+                top_n=cohere_top_n
+            )
+
+        print(f"\n{retriever_type} is created successfully!")
+        print(f"Relevant documents will be retrieved from vectorstore ({vectorstore_name}), "
+              f"which uses {LLM_service} embeddings and has {vector_store._collection.count()} chunks.")
+
+        return retriever
+    except Exception as e:
+        print(e)
+
+
 def instantiate_LLM(LLM_provider,api_key,temperature=0.5,top_p=0.95,model_name=None):
     """Instantiate LLM in Langchain.
     Parameters:
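Two helpers that retrieval_blocks relies on, Vectorstore_backed_retriever and create_vectorstore, are defined elsewhere in app.py and do not appear in this diff. In LangChain, a vector store-backed retriever is conventionally a thin wrapper over as_retriever; a plausible sketch matching the call signature above (the _sketch suffix marks it as an assumption, not the Space's actual code):

    # Hypothetical wrapper consistent with how retrieval_blocks calls it.
    def Vectorstore_backed_retriever_sketch(vectorstore, search_type="similarity", k=10, score_threshold=None):
        search_kwargs = {"k": k}
        if score_threshold is not None:
            search_kwargs["score_threshold"] = score_threshold
        return vectorstore.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

And a hedged usage example for the new function, assuming the persisted Google-embeddings index from the second hunk exists and COHERE_API_KEY is set in the environment:

    import os

    # Load the existing index and rerank the base retriever's hits with Cohere.
    retriever = retrieval_blocks(
        create_new_vectorstore=False,
        LLM_service="Google",
        vectorstore_name="Vit_All_Google_Embeddings",
        retriever_type="Cohere_reranker",
        cohere_api_key=os.environ["COHERE_API_KEY"],
    )
    docs = retriever.get_relevant_documents("What is a Vision Transformer?")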
|