import os

from datasets import load_dataset
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_core.documents import Document
from supabase import create_client

# 1. Load GAIA. The dataset is gated on the Hugging Face Hub (accept the terms
#    and log in first), requires a config name, and ships "validation" and
#    "test" splits -- there is no "train" split, and only validation carries
#    reference answers. Column names are capitalized in the source data.
dataset = load_dataset("gaia-benchmark/GAIA", "2023_all", split="validation")

# 2. Build Documents in "Q: ...\nA: ..." form, keeping the task_id as metadata
docs = []
for ex in dataset:
    q, a = ex["Question"], ex["Final answer"]
    docs.append(Document(
        page_content=f"Q: {q}\nA: {a}",
        metadata={"task_id": ex.get("task_id"), "split": "validation"},
    ))

# 3. Initialize the embedding model and the Supabase client
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
supabase_url = os.environ["SUPABASE_URL"]
supabase_key = os.environ["SUPABASE_SERVICE_KEY"]
supabase = create_client(supabase_url, supabase_key)

# 4. Embed and upload to Supabase. The "documents" table and the
#    "match_documents_langchain" RPC must already exist in the project
#    (see the LangChain Supabase integration setup).
vectorstore = SupabaseVectorStore.from_documents(
    docs,
    embedding=embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents_langchain",
)
print(f"Seeded {len(docs)} GAIA examples into Supabase.")
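
# --- Optional sanity check: a minimal sketch, not part of the seeding step. ---
# similarity_search() is the standard LangChain vector-store call: it embeds
# the query text with the same model and retrieves the nearest stored rows via
# the match_documents_langchain RPC. The query string here is a hypothetical
# placeholder; substitute any question resembling the seeded GAIA examples.
results = vectorstore.similarity_search("example GAIA-style question", k=1)
for doc in results:
    # Print a preview of the matched "Q: ...\nA: ..." text and its metadata
    print(doc.page_content[:200])
    print(doc.metadata)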