# NOTE: the original paste carried Hugging Face Space UI residue here
# ("Spaces: / Sleeping / Sleeping"); commented out so the file parses as Python.
import os

from datasets import load_dataset
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from supabase import create_client

# Seed a Supabase pgvector table with GAIA train Q/A pairs so they can be
# retrieved later (e.g. as few-shot examples for an agent).

# 1. Load the GAIA train split.
#    NOTE(review): gaia-benchmark/GAIA is a gated dataset and usually requires
#    a config name (e.g. "2023_all") plus an HF auth token — confirm the
#    access setup before running.
dataset = load_dataset("gaia-benchmark/GAIA", split="train")

# 2. Build one Document per example, formatted "Q: ...\nA: ..." so a single
#    embedding captures both the question and its reference answer.
#    NOTE(review): assumes the split exposes lowercase "question"/"answer"
#    keys — verify against the dataset card (some GAIA configs use
#    "Question"/"Final answer" instead).
docs = []
for ex in dataset:
    q, a = ex["question"], ex["answer"]
    docs.append(
        Document(
            page_content=f"Q: {q}\nA: {a}",
            metadata={"task_id": ex.get("task_id"), "split": "train"},
        )
    )

# 3. Initialize the embedding model and the Supabase client.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
supabase_url = os.environ["SUPABASE_URL"]          # fail fast (KeyError) if unset
supabase_key = os.environ["SUPABASE_SERVICE_KEY"]  # service-role key: write access
supabase = create_client(supabase_url, supabase_key)

# 4. Embed and upload all documents in one call. The target table and the
#    matching RPC function (query_name) must already exist in Supabase.
vectorstore = SupabaseVectorStore.from_documents(
    docs,
    embedding=embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents_langchain",
)

print(f"Seeded {len(docs)} GAIA examples into Supabase.")