RobotPaiSec / seed_supabase.py
rojikada's picture
Create seed_supabase.py
8d1ea69 verified
import os
from datasets import load_dataset
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from supabase import create_client
# Step 1: fetch the GAIA benchmark, requesting its "train" split.
# NOTE(review): no dataset config name is passed, and the "train" split plus the
# "question"/"answer" column names are assumed to exist — confirm against the
# dataset card before relying on this script.
dataset = load_dataset("gaia-benchmark/GAIA", split="train")


def _as_document(example):
    """Wrap one dataset example as a Document with "Q: ...\\nA: ..." text.

    The example's ``task_id`` is read with ``.get`` and stored in the metadata
    next to the split label.
    """
    q = example["question"]
    a = example["answer"]
    return Document(
        page_content=f"Q: {q}\nA: {a}",
        metadata={"task_id": example.get("task_id"), "split": "train"},
    )


# Step 2: one Document per example, in dataset order.
docs = [_as_document(example) for example in dataset]
# 3. Initialize Supabase client & embedding model
# The credentials are read first so that a missing SUPABASE_URL or
# SUPABASE_SERVICE_KEY raises KeyError right away, before the embedding
# model is constructed.
supabase_url = os.environ["SUPABASE_URL"]
supabase_key = os.environ["SUPABASE_SERVICE_KEY"]
supabase = create_client(supabase_url, supabase_key)

# Sentence-embedding model used to vectorize the Documents.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# Step 4: upload the Documents to Supabase through the LangChain vector store.
# The target table and the query name are passed explicitly along with the
# embedding model and the Supabase client.
_store_options = {
    "embedding": embeddings,
    "client": supabase,
    "table_name": "documents",
    "query_name": "match_documents_langchain",
}
vectorstore = SupabaseVectorStore.from_documents(docs, **_store_options)

# Report how many Documents were handed to the vector store.
print(f"Seeded {len(docs)} GAIA examples into Supabase.")