davanstrien (HF staff) committed
Commit 79f2ae1 · 1 Parent(s): 1e1cb2a

refactor to duckdb

Files changed (2)
  1. Dockerfile +15 -8
  2. main.py +175 -194
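
For reference, a minimal sketch of how the two endpoints added in this refactor could be exercised once the Space is running. The base URL, example query, and example dataset id are illustrative assumptions, not part of the commit; only the endpoint paths, parameters, and response fields come from main.py below.

    import httpx  # any HTTP client works; httpx is assumed here

    BASE_URL = "http://localhost:7860"  # assumption: local run of the uvicorn CMD from the Dockerfile

    # Semantic text search over the DuckDB-backed embeddings
    resp = httpx.get(f"{BASE_URL}/search/datasets", params={"query": "historic newspapers", "k": 5})
    for r in resp.json()["results"]:
        print(r["dataset_id"], round(r["similarity"], 3), r["likes"], r["downloads"])

    # Nearest neighbours of an existing dataset id (hypothetical id, for illustration only)
    resp = httpx.get(f"{BASE_URL}/similarity/datasets", params={"dataset_id": "org/dataset", "k": 5})
    print(resp.json()["results"])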
Dockerfile CHANGED
@@ -1,7 +1,18 @@
-FROM python:3.12
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
+
+# Copy requirements file
+COPY ./requirements.txt /code/requirements.txt
+
+# Install dependencies using uv (while still root)
+RUN uv pip install --system --no-cache-dir -r /code/requirements.txt
+
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
 
+# Create data directory with proper permissions
+RUN mkdir -p /data && chown -R user:user /data
+RUN chown -R user:user /code
+
 # Switch to the "user" user
 USER user
 
@@ -9,14 +20,10 @@ USER user
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH
 
-# Set the working directory to the user's home directory
-WORKDIR $HOME/code
+# Set the working directory
 WORKDIR /code
 
-COPY ./requirements.txt /code/requirements.txt
-
-RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
-
-COPY . .
+# Copy the rest of the application
+COPY --chown=user:user . .
 
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--log-config=log_conf.yaml"]
main.py CHANGED
@@ -1,252 +1,233 @@
 import logging
-from contextlib import asynccontextmanager
-from typing import List, Optional
-
-import chromadb
-from cashews import cache
-from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
+import os
+from typing import List
+import sys
+import duckdb
+from cashews import cache  # Add this import
 from fastapi import FastAPI, HTTPException, Query
-from httpx import AsyncClient
-from huggingface_hub import DatasetCard
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from starlette.responses import RedirectResponse
-from starlette.status import (
-    HTTP_403_FORBIDDEN,
-    HTTP_404_NOT_FOUND,
-    HTTP_500_INTERNAL_SERVER_ERROR,
-)
-
-from load_card_data import card_embedding_function, refresh_card_data
-from load_viewer_data import refresh_viewer_data
-from utils import get_save_path, get_collection, get_chroma_client
+from sentence_transformers import SentenceTransformer
+from contextlib import asynccontextmanager
 
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER
 # Set up logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Set up caching
-cache.setup("mem://?check_interval=10&size=1000")
-
-# Initialize Chroma client
-client = get_chroma_client()
-
-async_client = AsyncClient(
-    follow_redirects=True,
-)
+LOCAL = False
+if sys.platform == "darwin":
+    LOCAL = True
+DATA_DIR = "data" if LOCAL else "/data"
+# Configure cache
+cache.setup("mem://", size_limit="4gb")
 
 
+# Initialize FastAPI app
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Startup: refresh data and initialize collection
-    logger.info("Starting up the application")
-    try:
-        # Refresh data
-        logger.info("Starting refresh of card data")
-        refresh_card_data()
-        logger.info("Card data refresh completed")
-        logger.info("Starting refresh of viewer data")
-        await refresh_viewer_data()
-        logger.info("Viewer data refresh completed")
-        logger.info("Data refresh completed successfully")
-    except Exception as e:
-        logger.error(f"Error during startup: {str(e)}")
-        logger.warning("Application starting with potential data issues")
+    # Startup: nothing special needed here since model and DB are initialized at module level
     yield
-
-    # Shutdown: perform any cleanup
-    logger.info("Shutting down the application")
-    # Add any cleanup code here if needed
+    # Cleanup
+    await cache.close()
+    con.close()
 
 
 app = FastAPI(lifespan=lifespan)
 
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "https://*.hf.space",  # Allow all Hugging Face Spaces
+        "https://*.huggingface.co",  # Allow all Hugging Face domains
+        # "http://localhost:5500",  # Allow localhost:5500 # TODO remove before prod
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
 
-@app.get("/", include_in_schema=False)
-def root():
-    return RedirectResponse(url="/docs")
-
-
-async def try_get_card(hub_id: str) -> Optional[str]:
+# Initialize model and DuckDB
+model = SentenceTransformer("nomic-ai/modernbert-embed-base", device="cpu")
+embedding_dim = model.get_sentence_embedding_dimension()
+
+# Database setup with fallback
+db_path = f"{DATA_DIR}/vector_store.db"
+try:
+    # Create directory if it doesn't exist
+    os.makedirs(os.path.dirname(db_path), exist_ok=True)
+    con = duckdb.connect(db_path)
+    logger.info(f"Connected to persistent database at {db_path}")
+except (OSError, PermissionError) as e:
+    logger.warning(
+        f"Could not create/access {db_path}. Falling back to in-memory database. Error: {e}"
+    )
+    con = duckdb.connect(":memory:")
+
+# Initialize VSS extension
+con.sql("INSTALL vss; LOAD vss;")
+con.sql("SET hnsw_enable_experimental_persistence=true;")
+
+
+def setup_database():
     try:
-        response = await async_client.get(
-            f"https://huggingface.co/datasets/{hub_id}/raw/main/README.md"
+        # Create table with properly typed embeddings
+        con.sql(f"""
+            CREATE TABLE IF NOT EXISTS model_cards AS
+            SELECT *, embeddings::FLOAT[{embedding_dim}] as embeddings_float
+            FROM 'hf://datasets/davanstrien/outputs-embeddings/**/*.parquet';
+        """)
+
+        # Check if index exists
+        index_exists = (
+            con.sql("""
+            SELECT COUNT(*) as count
+            FROM duckdb_indexes
+            WHERE index_name = 'my_hnsw_index';
+        """).fetchone()[0]
+            > 0
         )
-        if response.status_code == 200:
-            card = DatasetCard(response.text)
-            return card.text
+
+        if index_exists:
+            # Drop existing index
+            con.sql("DROP INDEX my_hnsw_index;")
+            logger.info("Dropped existing HNSW index")
+
+        # Create/Recreate HNSW index
+        con.sql("""
+            CREATE INDEX my_hnsw_index ON model_cards
+            USING HNSW (embeddings_float) WITH (metric = 'cosine');
+        """)
+        logger.info("Created/Recreated HNSW index")
+
+        # Log the number of rows in the database
+        row_count = con.sql("SELECT COUNT(*) as count FROM model_cards").fetchone()[0]
+        logger.info(f"Database initialized with {row_count:,} rows")
+
     except Exception as e:
-        logger.error(f"Error fetching card for hub_id {hub_id}: {str(e)}")
-        return None
+        logger.error(f"Setup error: {e}")
+
+
+# Run setup on startup
+setup_database()
 
 
 class QueryResult(BaseModel):
     dataset_id: str
     similarity: float
+    summary: str
+    likes: int
+    downloads: int
 
 
 class QueryResponse(BaseModel):
     results: List[QueryResult]
 
 
-class DatasetCardNotFoundError(HTTPException):
-    def __init__(self, dataset_id: str):
-        super().__init__(
-            status_code=HTTP_404_NOT_FOUND,
-            detail=f"No dataset card available for dataset: {dataset_id}",
-        )
-
+@app.get("/")
+async def redirect_to_docs():
+    from fastapi.responses import RedirectResponse
 
-class DatasetNotForAllAudiencesError(HTTPException):
-    def __init__(self, dataset_id: str):
-        super().__init__(
-            status_code=HTTP_403_FORBIDDEN,
-            detail=f"Dataset {dataset_id} is not for all audiences and not supported in this service.",
-        )
+    return RedirectResponse(url="/docs")
 
 
-@app.get("/similar", response_model=QueryResponse)
-@cache(ttl="1h")
-async def api_query_dataset(dataset_id: str, n: int = Query(default=10, ge=1, le=100)):
-    embedding_function = card_embedding_function()
-    collection = get_collection(client, embedding_function, "dataset_cards")
+@app.get("/search/datasets", response_model=QueryResponse)
+@cache(ttl="10m")
+async def search_datasets(query: str, k: int = Query(default=5, ge=1, le=100)):
     try:
-        logger.info(f"Querying dataset: {dataset_id}")
-        # Get the embedding for the given dataset_id
-        result = collection.get(ids=[dataset_id], include=["embeddings"])
-        if not result.get("embeddings"):
-            logger.info(f"Dataset not found: {dataset_id}")
-            try:
-                card = await try_get_card(dataset_id)
-                if card is None:
-                    raise DatasetCardNotFoundError(dataset_id)
-                embeddings = embedding_function(card)
-                collection.upsert(ids=[dataset_id], embeddings=embeddings[0])
-                logger.info(f"Dataset {dataset_id} added to collection")
-                result = collection.get(ids=[dataset_id], include=["embeddings"])
-                if result.get("not-for-all-audiences"):
-                    raise DatasetNotForAllAudiencesError(dataset_id)
-            except (DatasetCardNotFoundError, DatasetNotForAllAudiencesError):
-                raise
-            except Exception as e:
-                logger.error(
-                    f"Error adding dataset {dataset_id} to collection: {str(e)}"
-                )
-                raise DatasetCardNotFoundError(dataset_id) from e
-
-        embedding = result["embeddings"][0]
-
-        # Query the collection for similar datasets
-        query_result = collection.query(
-            query_embeddings=[embedding], n_results=n, include=["distances"]
-        )
-
-        if not query_result["ids"]:
-            logger.info(f"No similar datasets found for: {dataset_id}")
-            raise HTTPException(
-                status_code=HTTP_404_NOT_FOUND, detail="No similar datasets found."
-            )
-
-        # Prepare the response
+        query_embedding = model.encode(f"search_query: {query}").tolist()
+
+        # Updated SQL query to include likes and downloads
+        result = con.sql(f"""
+            SELECT
+                datasetId as dataset_id,
+                1 - array_cosine_distance(
+                    embeddings_float::FLOAT[{embedding_dim}],
+                    {query_embedding}::FLOAT[{embedding_dim}]
+                ) as similarity,
+                summary,
+                likes,
+                downloads
+            FROM model_cards
+            ORDER BY similarity DESC
+            LIMIT {k};
+        """).df()
+
+        # Updated result conversion
         results = [
-            QueryResult(dataset_id=id, similarity=1 - distance)
-            for id, distance in zip(
-                query_result["ids"][0], query_result["distances"][0]
+            QueryResult(
+                dataset_id=row["dataset_id"],
+                similarity=float(row["similarity"]),
+                summary=row["summary"],
+                likes=int(row["likes"]),
+                downloads=int(row["downloads"]),
             )
+            for _, row in result.iterrows()
         ]
 
-        logger.info(f"Found {len(results)} similar datasets for: {dataset_id}")
         return QueryResponse(results=results)
 
-    except (HTTPException, DatasetCardNotFoundError):
-        raise
     except Exception as e:
-        logger.error(f"Error querying dataset {dataset_id}: {str(e)}")
-        raise HTTPException(
-            status_code=HTTP_500_INTERNAL_SERVER_ERROR,
-            detail="An unexpected error occurred.",
-        ) from e
+        logger.error(f"Search error: {str(e)}")
+        raise HTTPException(status_code=500, detail="Search failed")
 
 
-@app.get("/similar-text", response_model=QueryResponse)
-@cache(ttl="1h")
-async def api_query_by_text(query: str, n: int = Query(default=10, ge=1, le=100)):
+@app.get("/similarity/datasets", response_model=QueryResponse)
+@cache(ttl="10m")
+async def find_similar_datasets(
+    dataset_id: str, k: int = Query(default=5, ge=1, le=100)
+):
     try:
-        logger.info(f"Querying datasets by text: {query}")
-        collection = client.get_collection(
-            name="dataset_cards", embedding_function=card_embedding_function()
-        )
-        print(query)
-        query_result = collection.query(
-            query_texts=query, n_results=n, include=["distances"]
-        )
-        print(query_result)
-
-        if not query_result["ids"]:
-            logger.info(f"No similar datasets found for query: {query}")
+        # First, get the embedding for the input dataset_id
+        reference_embedding = con.sql(f"""
+            SELECT embeddings_float
+            FROM model_cards
+            WHERE datasetId = '{dataset_id}'
+            LIMIT 1;
+        """).df()
+
+        if reference_embedding.empty:
             raise HTTPException(
-                status_code=HTTP_404_NOT_FOUND, detail="No similar datasets found."
+                status_code=404, detail=f"Dataset ID '{dataset_id}' not found"
             )
 
-        # Prepare the response
+        # Updated similarity search query to include likes and downloads
+        result = con.sql(f"""
+            SELECT
+                datasetId as dataset_id,
+                1 - array_cosine_distance(
+                    embeddings_float::FLOAT[{embedding_dim}],
+                    (SELECT embeddings_float FROM model_cards WHERE datasetId = '{dataset_id}' LIMIT 1)
+                ) as similarity,
+                summary,
+                likes,
+                downloads
+            FROM model_cards
+            WHERE datasetId != '{dataset_id}'
+            ORDER BY similarity DESC
+            LIMIT {k};
+        """).df()
+
+        # Updated result conversion
         results = [
-            QueryResult(dataset_id=str(id), similarity=float(1 - distance))
-            for id, distance in zip(
-                query_result["ids"][0], query_result["distances"][0]
+            QueryResult(
+                dataset_id=row["dataset_id"],
+                similarity=float(row["similarity"]),
+                summary=row["summary"],
+                likes=int(row["likes"]),
+                downloads=int(row["downloads"]),
             )
+            for _, row in result.iterrows()
         ]
-        logger.info(f"Found {len(results)} similar datasets for query: {query}")
-        return QueryResponse(results=results)
-
-    except Exception as e:
-        logger.error(f"Error querying datasets by text {query}: {str(e)}")
-        raise HTTPException(
-            status_code=HTTP_500_INTERNAL_SERVER_ERROR,
-            detail="An unexpected error occurred.",
-        ) from e
-
 
-@app.get("/search-viewer", response_model=QueryResponse)
-@cache(ttl="1h")
-async def api_search_viewer(query: str, n: int = Query(default=10, ge=1, le=100)):
-    try:
-        embedding_function = SentenceTransformerEmbeddingFunction(
-            model_name="davanstrien/query-to-dataset-viewer-descriptions",
-            trust_remote_code=True,
-        )
-        collection = client.get_collection(
-            name="dataset-viewer-descriptions",
-            embedding_function=embedding_function,
-        )
-        query = f"USER_QUERY: {query}"
-        query_result = collection.query(
-            query_texts=query, n_results=n, include=["distances"]
-        )
-        print(query_result)
-
-        if not query_result["ids"]:
-            logger.info(f"No similar datasets found for query: {query}")
-            raise HTTPException(
-                status_code=HTTP_404_NOT_FOUND, detail="No similar datasets found."
-            )
-
-        # Prepare the response
-        results = [
-            QueryResult(dataset_id=str(id), similarity=float(1 - distance))
-            for id, distance in zip(
-                query_result["ids"][0], query_result["distances"][0]
-            )
-        ]
-        logger.info(f"Found {len(results)} similar datasets for query: {query}")
         return QueryResponse(results=results)
 
+    except HTTPException:
+        raise
     except Exception as e:
-        logger.error(f"Error querying datasets by text {query}: {str(e)}")
-        raise HTTPException(
-            status_code=HTTP_500_INTERNAL_SERVER_ERROR,
-            detail="An unexpected error occurred.",
-        ) from e
+        logger.error(f"Similarity search error: {str(e)}")
+        raise HTTPException(status_code=500, detail="Similarity search failed")
 
 
 if __name__ == "__main__":