# GAIA-Assessment-Agent / vector_store_util.py
# Uploaded by schoemantian: "Add supporting files for enhanced agent functionality"
# (commit c2b220b, verified)
"""Utilities for creating and managing the vector store for similar question lookup."""
import os
import json
import pandas as pd
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client
# Load environment variables
load_dotenv()
def create_embeddings_model():
    """Build the sentence-transformer embeddings model used for similarity lookup.

    Returns:
        The embeddings model
    """
    model_name = "sentence-transformers/all-mpnet-base-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
def create_supabase_client():
    """Build a Supabase client from environment configuration.

    Returns:
        The Supabase client

    Raises:
        ValueError: If SUPABASE_URL or SUPABASE_SERVICE_KEY is not set.
    """
    url = os.environ.get("SUPABASE_URL")
    key = os.environ.get("SUPABASE_SERVICE_KEY")
    if url and key:
        return create_client(url, key)
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in the environment")
def create_vector_store(client: Client, embeddings):
    """Wrap the Supabase ``documents`` table in a LangChain vector store.

    Args:
        client: The Supabase client
        embeddings: The embeddings model

    Returns:
        The vector store
    """
    store_kwargs = {
        "client": client,
        "embedding": embeddings,
        "table_name": "documents",
        "query_name": "match_documents_langchain",
    }
    return SupabaseVectorStore(**store_kwargs)
def load_jsonl_qa_data(file_path: str) -> List[Dict[str, Any]]:
    """Load question-answer data from a JSONL file (one JSON object per line).

    Blank lines are skipped. Loading is best-effort: on a missing/unreadable
    file or malformed JSON the error is reported and an empty list returned,
    so callers can treat "no data" uniformly.

    Args:
        file_path: Path to the JSONL file

    Returns:
        List of question-answer data dictionaries (empty on any load error)
    """
    qa_data = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    qa_data.append(json.loads(line))
    # Narrowed from bare `except Exception`: only I/O and parse failures are
    # expected here; programming errors should propagate instead of being hidden.
    except (OSError, json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Error loading JSONL file: {e}")
        return []
    return qa_data
def prepare_documents_for_vector_store(qa_data: List[Dict[str, Any]], embeddings=None):
    """Prepare documents for insertion into the vector store.

    Args:
        qa_data: List of question-answer data dictionaries
        embeddings: Optional pre-built embeddings model. When None (the
            backward-compatible default), a new model is created — avoid
            that in loops, since model construction is expensive.

    Returns:
        List of documents ready for insertion
    """
    if embeddings is None:
        embeddings = create_embeddings_model()
    # Format each item's content with question and answer.
    contents = [
        f"Question: {item.get('Question', '')}\n\nFinal answer: {item.get('Final answer', '')}"
        for item in qa_data
    ]
    # Embed all contents in one batched call instead of one model call per item
    # (embed_query is the single-text wrapper around embed_documents).
    vectors = embeddings.embed_documents(contents)
    documents = []
    for item, content, vector in zip(qa_data, contents, vectors):
        documents.append(
            {
                "content": content,
                "metadata": {
                    "source": item.get("task_id", ""),
                    "level": item.get("Level", ""),
                },
                "embedding": vector,
            }
        )
    return documents
def insert_documents_to_vector_store(client: Client, documents: List[Dict[str, Any]]):
    """Insert documents into the Supabase ``documents`` table.

    Args:
        client: The Supabase client
        documents: List of documents to insert

    Returns:
        The response from the insertion operation, or None on failure
    """
    try:
        response = client.table("documents").insert(documents).execute()
    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print(f"Error inserting documents into vector store: {e}")
        return None
    else:
        print(f"Successfully inserted {len(documents)} documents into vector store")
        return response
def export_documents_to_csv(documents: List[Dict[str, Any]], output_path: str):
    """Export documents to a CSV file for manual upload.

    Args:
        documents: List of documents to export
        output_path: Path to the output CSV file
    """
    try:
        frame = pd.DataFrame(documents)
        frame.to_csv(output_path, index=False)
    except Exception as e:
        # Best-effort: report and continue rather than crashing the caller.
        print(f"Error exporting documents to CSV: {e}")
    else:
        print(f"Successfully exported documents to {output_path}")
def setup_vector_store(jsonl_path: str, export_csv: bool = False):
    """Set up the vector store with data from a JSONL file.

    Args:
        jsonl_path: Path to the JSONL file
        export_csv: Whether to export documents to a CSV file for manual upload

    Returns:
        The vector store, or None when there is no data, when CSV export
        mode is selected, or when Supabase setup fails.
    """
    qa_data = load_jsonl_qa_data(jsonl_path)
    if not qa_data:
        print("No question-answer data loaded")
        return None
    print(f"Loaded {len(qa_data)} question-answer pairs from {jsonl_path}")

    documents = prepare_documents_for_vector_store(qa_data)

    if export_csv:
        # CSV-export mode writes the file for manual upload and stops here.
        export_documents_to_csv(documents, "supabase_docs.csv")
        return None

    try:
        supabase_client = create_supabase_client()
        store = create_vector_store(supabase_client, create_embeddings_model())
        insert_documents_to_vector_store(supabase_client, documents)
    except Exception as e:
        print(f"Error setting up vector store: {e}")
        return None
    return store
if __name__ == "__main__":
    # Example usage: build the store from the default metadata file,
    # exporting a CSV for manual upload rather than inserting directly.
    metadata_path = "metadata.jsonl"
    if not os.path.exists(metadata_path):
        print(f"JSONL file not found: {metadata_path}")
    else:
        setup_vector_store(metadata_path, export_csv=True)