import ast
import logging
import os

import pandas as pd
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import create_client

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SupabaseConnector:
    def __init__(self):
        load_dotenv()
        # Supabase client built from credentials in the environment / .env file
        self.supabase = create_client(
            os.environ.get("SUPABASE_URL"), os.environ.get("SUPABASE_SERVICE_KEY")
        )
        # Embedding model used to vectorize document text
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )
        # Vector store backed by the "documents" table; query_name refers to the
        # Postgres similarity-search function that must exist in the database
        self.vector_store = SupabaseVectorStore(
            client=self.supabase,
            embedding=self.embeddings,
            table_name="documents",
            query_name="match_documents_langchain",
        )

    def upload_csv(self, file_path: str, batch_size: int = 100):
        """
        Upload documents from a CSV file to the Supabase vector store.

        The file must provide 'content' and 'metadata' columns; metadata
        strings are parsed into dicts, and rows are embedded and inserted
        in batches of `batch_size`.
        """
        df = pd.read_csv(file_path)
        logger.info(f"Loaded {len(df)} records from {file_path}")

        # Parse the metadata column from its string representation into a dict
        df["metadata"] = df["metadata"].apply(
            lambda x: ast.literal_eval(x) if isinstance(x, str) else {}
        )

        # Embed and insert the rows in batches to keep individual requests small
        for i in range(0, len(df), batch_size):
            batch = df.iloc[i : i + batch_size]
            texts = batch["content"].tolist()
            metadatas = batch["metadata"].tolist()
            self.vector_store.add_texts(texts=texts, metadatas=metadatas)
            logger.info(f"Uploaded batch {i // batch_size + 1}")

        logger.info("CSV upload completed.")

if __name__ == "__main__":
    connector = SupabaseConnector()
    connector.upload_csv("supabase_docs.csv")
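
# Illustrative only: a minimal layout for supabase_docs.csv that this script can
# ingest. The metadata keys and URLs below are assumptions, not from the original;
# the only requirements upload_csv imposes are a 'content' column and a 'metadata'
# column holding a Python dict literal (it is parsed with ast.literal_eval).
#
# content,metadata
# "How to create a table in Supabase...","{'source': 'docs', 'url': 'https://example.com/tables'}"
# "Row Level Security lets you restrict...","{'source': 'docs', 'url': 'https://example.com/rls'}"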