Spaces:

davanstrien
/

huggingface-datasets-search-v2

Running on CPU Upgrade

App Files Files Community

davanstrien HF Staff committed on Aug 13, 2024

Commit

4d185df

1 Parent(s): a5c714c

try adding missing cards

Browse files

Files changed (1) hide show

main.py +32 -4

main.py CHANGED Viewed

@@ -7,8 +7,9 @@ from cashews import cache
 from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel
 from starlette.responses import RedirectResponse
 from load_data import get_embedding_function, get_save_path, refresh_data
 # Set up logging
 logging.basicConfig(
@@ -24,6 +25,10 @@ SAVE_PATH = get_save_path()
 client = chromadb.PersistentClient(path=SAVE_PATH)
 collection = None
 class QueryResult(BaseModel):
     dataset_id: str
@@ -69,6 +74,19 @@ def root():
     return RedirectResponse(url="/docs")
 @app.get("/query", response_model=Optional[QueryResponse])
 @cache(ttl="1h")
 async def api_query_dataset(dataset_id: str, n: int = Query(default=10, ge=1, le=100)):
@@ -76,10 +94,20 @@ async def api_query_dataset(dataset_id: str, n: int = Query(default=10, ge=1, le
         logger.info(f"Querying dataset: {dataset_id}")
         # Get the embedding for the given dataset_id
         result = collection.get(ids=[dataset_id], include=["embeddings"])
-        if not result["embeddings"]:
             logger.info(f"Dataset not found: {dataset_id}")
-            raise HTTPException(status_code=404, detail="Dataset not found")
         embedding = result["embeddings"][0]

 from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel
 from starlette.responses import RedirectResponse
+from httpx import AsyncClient
 from load_data import get_embedding_function, get_save_path, refresh_data
+from huggingface_hub import DatasetCard
 # Set up logging
 logging.basicConfig(
 client = chromadb.PersistentClient(path=SAVE_PATH)
 collection = None
+async_client = AsyncClient(
+    follow_redirects=True,
+)
 class QueryResult(BaseModel):
     dataset_id: str
     return RedirectResponse(url="/docs")
+async def try_get_card(hub_id: str) -> Optional[str]:
+    try:
+        response = await async_client.get(
+            f"https://huggingface.co/datasets/{hub_id}/raw/main/README.md"
+        )
+        if response.status_code == 200:
+            card = DatasetCard(response.text)
+            return card.text
+    except Exception as e:
+        logger.error(f"Error fetching card for hub_id {hub_id}: {str(e)}")
+        return None
 @app.get("/query", response_model=Optional[QueryResponse])
 @cache(ttl="1h")
 async def api_query_dataset(dataset_id: str, n: int = Query(default=10, ge=1, le=100)):
         logger.info(f"Querying dataset: {dataset_id}")
         # Get the embedding for the given dataset_id
         result = collection.get(ids=[dataset_id], include=["embeddings"])
+        if not result.get("embeddings"):
             logger.info(f"Dataset not found: {dataset_id}")
+            try:
+                embedding_function = get_embedding_function()
+                card = await try_get_card(dataset_id)
+                embeddings = embedding_function(card)
+                collection.upsert(ids=[dataset_id], embeddings=embeddings[0])
+                logger.info(f"Dataset {dataset_id} added to collection")
+                result = collection.get(ids=[dataset_id], include=["embeddings"])
+            except Exception as e:
+                logger.error(
+                    f"Error adding dataset {dataset_id} to collection: {str(e)}"
+                )
+                raise HTTPException(status_code=404, detail="Dataset not found") from e
         embedding = result["embeddings"][0]