Spaces:
Sleeping
Sleeping
"""Utilities for creating and managing the vector store for similar question lookup.""" | |
import os | |
import json | |
import pandas as pd | |
from typing import List, Dict, Any, Optional | |
from dotenv import load_dotenv | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import SupabaseVectorStore | |
from supabase.client import Client, create_client | |
# Load environment variables from a .env file (if one is present) so that the
# os.environ lookups in create_supabase_client() can see them.
load_dotenv()
def create_embeddings_model():
    """Build the HuggingFace embeddings model used by this module.

    Returns:
        A HuggingFaceEmbeddings instance configured with the
        "sentence-transformers/all-mpnet-base-v2" model name.
    """
    model_name = "sentence-transformers/all-mpnet-base-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
def create_supabase_client():
    """Build a Supabase client from credentials held in the environment.

    Reads SUPABASE_URL and SUPABASE_SERVICE_KEY from os.environ.

    Returns:
        The client returned by create_client(url, key).

    Raises:
        ValueError: If either variable is missing or empty.
    """
    url = os.environ.get("SUPABASE_URL")
    key = os.environ.get("SUPABASE_SERVICE_KEY")
    if url and key:
        return create_client(url, key)
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in the environment")
def create_vector_store(client: Client, embeddings):
    """Wrap a Supabase client and an embeddings model in a vector store.

    Args:
        client: The Supabase client
        embeddings: The embeddings model

    Returns:
        A SupabaseVectorStore bound to the "documents" table and the
        "match_documents_langchain" query name.
    """
    store_options = {
        "client": client,
        "embedding": embeddings,
        "table_name": "documents",
        "query_name": "match_documents_langchain",
    }
    return SupabaseVectorStore(**store_options)
def load_jsonl_qa_data(file_path: str) -> List[Dict[str, Any]]:
    """Read a JSONL file into a list of parsed records.

    Lines that are empty or contain only whitespace are skipped. Any error
    while opening, reading, or parsing is printed and results in an empty
    list, discarding records parsed before the failure.

    Args:
        file_path: Path to the JSONL file

    Returns:
        The parsed records, or an empty list if loading failed
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            records = [json.loads(raw) for raw in handle if raw.strip()]
    except Exception as exc:
        print(f"Error loading JSONL file: {exc}")
        records = []
    return records
def prepare_documents_for_vector_store(
    qa_data: List[Dict[str, Any]],
    embeddings: Optional[Any] = None,
):
    """Prepare documents for insertion into the vector store.

    Each item becomes a dict with a "content" string combining the question
    and final answer, a "metadata" dict ("source" from "task_id", "level"
    from "Level"), and an "embedding" vector for the content. Missing keys
    fall back to empty strings.

    Args:
        qa_data: List of question-answer data dictionaries
        embeddings: Optional embeddings model exposing
            embed_documents(texts). When omitted, one is created with
            create_embeddings_model(). Passing an existing model lets
            callers avoid loading it a second time.

    Returns:
        List of documents ready for insertion (empty if qa_data is empty)
    """
    items = list(qa_data)
    if not items:
        # Nothing to embed, so skip loading the model entirely.
        return []

    if embeddings is None:
        embeddings = create_embeddings_model()

    # Format the content with question and answer
    contents = [
        f"Question: {item.get('Question', '')}\n\nFinal answer: {item.get('Final answer', '')}"
        for item in items
    ]

    # Embed all texts in one batched call using the document-side API rather
    # than issuing one query-side call per item.
    vectors = embeddings.embed_documents(contents)

    return [
        {
            "content": content,
            "metadata": {
                "source": item.get("task_id", ""),
                "level": item.get("Level", ""),
            },
            "embedding": vector,
        }
        for item, content, vector in zip(items, contents, vectors)
    ]
def insert_documents_to_vector_store(client: Client, documents: List[Dict[str, Any]]):
    """Insert prepared documents into the "documents" table.

    Failures are reported on stdout rather than raised.

    Args:
        client: The Supabase client
        documents: List of documents to insert

    Returns:
        The response from the insert, or None if any exception occurred
    """
    try:
        table = client.table("documents")
        result = table.insert(documents).execute()
        print(f"Successfully inserted {len(documents)} documents into vector store")
    except Exception as exc:
        print(f"Error inserting documents into vector store: {exc}")
        result = None
    return result
def export_documents_to_csv(documents: List[Dict[str, Any]], output_path: str):
    """Write documents to a CSV file for manual upload.

    The documents are loaded into a pandas DataFrame and saved without the
    index column. Any exception is printed instead of being raised.

    Args:
        documents: List of documents to export
        output_path: Path to the output CSV file
    """
    try:
        pd.DataFrame(documents).to_csv(output_path, index=False)
    except Exception as exc:
        print(f"Error exporting documents to CSV: {exc}")
    else:
        print(f"Successfully exported documents to {output_path}")
def setup_vector_store(
    jsonl_path: str,
    export_csv: bool = False,
    csv_path: str = "supabase_docs.csv",
):
    """Set up the vector store with data from a JSONL file.

    Args:
        jsonl_path: Path to the JSONL file
        export_csv: Whether to export documents to a CSV file for manual
            upload instead of inserting them into Supabase
        csv_path: Destination of the CSV export; only used when export_csv
            is True

    Returns:
        The vector store, or None when no data was loaded, when export_csv
        is True, or when setting up the Supabase side failed
    """
    # Load question-answer data
    qa_data = load_jsonl_qa_data(jsonl_path)
    if not qa_data:
        print("No question-answer data loaded")
        return None
    print(f"Loaded {len(qa_data)} question-answer pairs from {jsonl_path}")

    if export_csv:
        # Export-only mode: no Supabase credentials are needed.
        documents = prepare_documents_for_vector_store(qa_data)
        export_documents_to_csv(documents, csv_path)
        return None

    # Create the client first so missing credentials are reported before the
    # embeddings for every document are computed.
    try:
        client = create_supabase_client()
    except Exception as e:
        print(f"Error setting up vector store: {e}")
        return None

    # Prepare documents
    documents = prepare_documents_for_vector_store(qa_data)

    try:
        vector_store = create_vector_store(client, create_embeddings_model())
        # Insert documents. insert_documents_to_vector_store reports its own
        # failures and returns None, so a failed insert does not prevent the
        # store from being returned.
        insert_documents_to_vector_store(client, documents)
        return vector_store
    except Exception as e:
        print(f"Error setting up vector store: {e}")
        return None
if __name__ == "__main__":
    # Example usage: export the prepared documents to CSV when the metadata
    # file is present in the working directory.
    jsonl_path = "metadata.jsonl"
    if not os.path.exists(jsonl_path):
        print(f"JSONL file not found: {jsonl_path}")
    else:
        # Set up the vector store
        setup_vector_store(jsonl_path, export_csv=True)