Spaces:
Sleeping
Sleeping
Create seed_supabase.py
Browse files- seed_supabase.py +35 -0
seed_supabase.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datasets import load_dataset
|
3 |
+
from langchain.schema import Document
|
4 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
+
from langchain.vectorstores import SupabaseVectorStore
|
6 |
+
from supabase import create_client
|
7 |
+
|
8 |
+
# --- Step 1: fetch the GAIA examples -------------------------------------
# NOTE(review): confirm against the dataset card that "gaia-benchmark/GAIA"
# exposes a "train" split and lower-case "question" / "answer" columns; if it
# does not, this call or the indexing in step 2 will raise.
dataset = load_dataset("gaia-benchmark/GAIA", split="train")

# --- Step 2: wrap every example in a LangChain Document ------------------
# The text is laid out as "Q: <question>\nA: <answer>"; the task id (None
# when the row has no such key) and the split name travel along as metadata.
docs = [
    Document(
        page_content=f"Q: {example['question']}\nA: {example['answer']}",
        metadata={"task_id": example.get("task_id"), "split": "train"},
    )
    for example in dataset
]

# --- Step 3: embedding model and Supabase client -------------------------
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
# Both settings are mandatory: a missing variable raises KeyError here.
supabase_url = os.environ["SUPABASE_URL"]
supabase_key = os.environ["SUPABASE_SERVICE_KEY"]
supabase = create_client(supabase_url, supabase_key)

# --- Step 4: embed the documents and push them to Supabase ---------------
# presumably the "documents" table and the "match_documents_langchain"
# query already exist in the target project -- verify before running.
vectorstore = SupabaseVectorStore.from_documents(
    docs,
    embedding=embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents_langchain",
)

print(f"Seeded {len(docs)} GAIA examples into Supabase.")
|