Spaces:

davanstrien
/

huggingface-datasets-search-v2

Running on CPU Upgrade

App Files Files Community

davanstrien HF staff committed about 16 hours ago

Commit

ed553e8

1 Parent(s): 16eb3c6

improve refresh logic

Browse files

Files changed (1) hide show

main.py +44 -19

main.py CHANGED Viewed

@@ -13,6 +13,7 @@ import polars as pl
 from huggingface_hub import HfApi
 from transformers import AutoTokenizer
 import torch
 # Configuration constants
 MODEL_NAME = "davanstrien/SmolLM2-360M-tldr-sft-2025-02-12_15-13"
@@ -89,15 +90,11 @@ def get_embedding_function():
 def setup_database():
     try:
         embedding_function = get_embedding_function()
-        # Create dataset collection
         dataset_collection = client.get_or_create_collection(
             embedding_function=embedding_function,
             name="dataset_cards",
             metadata={"hnsw:space": "cosine"},
         )
-        # Create model collection
         model_collection = client.get_or_create_collection(
             embedding_function=embedding_function,
             name="model_cards",
@@ -111,26 +108,52 @@ def setup_database():
         df = df.filter(
             pl.col("datasetId").str.contains_any(["open-llm-leaderboard-old/"]).not_()
         )
-        row_count = df.select(pl.len()).collect().item()
-        logger.info(f"Row count of dataset data: {row_count}")
-        # Check if we need to update the collection
-        current_count = dataset_collection.count()
-        logger.info(f"Current dataset collection count: {current_count}")
-        if current_count < row_count:
             logger.info(
-                f"Updating dataset collection with {row_count - current_count} new records"
-            )
-            # Load parquet files and upsert into ChromaDB
-            df = df.select(
-                ["datasetId", "summary", "likes", "downloads", "last_modified"]
             )
-            df = df.collect()
-            total_rows = len(df)
             for i in range(0, total_rows, BATCH_SIZE):
                 batch_df = df.slice(i, min(BATCH_SIZE, total_rows - i))
                 dataset_collection.upsert(
                     ids=batch_df.select(["datasetId"]).to_series().to_list(),
@@ -148,9 +171,11 @@ def setup_database():
                         )
                     ],
                 )
-                logger.info(f"Processed {i + len(batch_df):,} / {total_rows:,} rows")
-        logger.info(f"Database initialized with {dataset_collection.count():,} rows")
         # Load model data
         model_df = pl.scan_parquet(

 from huggingface_hub import HfApi
 from transformers import AutoTokenizer
 import torch
+import dateutil.parser
 # Configuration constants
 MODEL_NAME = "davanstrien/SmolLM2-360M-tldr-sft-2025-02-12_15-13"
 def setup_database():
     try:
         embedding_function = get_embedding_function()
         dataset_collection = client.get_or_create_collection(
             embedding_function=embedding_function,
             name="dataset_cards",
             metadata={"hnsw:space": "cosine"},
         )
         model_collection = client.get_or_create_collection(
             embedding_function=embedding_function,
             name="model_cards",
         df = df.filter(
             pl.col("datasetId").str.contains_any(["open-llm-leaderboard-old/"]).not_()
         )
+        # Get the most recent last_modified date from the collection
+        latest_update = None
+        if dataset_collection.count() > 0:
+            metadata = dataset_collection.get(include=["metadatas"]).get("metadatas")
+            logger.info(f"Found {len(metadata)} existing records in collection")
+            last_modifieds = [
+                dateutil.parser.parse(m.get("last_modified")) for m in metadata
+            ]
+            latest_update = max(last_modifieds)
+            logger.info(f"Most recent record in DB from: {latest_update}")
+            logger.info(f"Oldest record in DB from: {min(last_modifieds)}")
+        # Filter and process only newer records
+        df = df.select(["datasetId", "summary", "likes", "downloads", "last_modified"])
+        # Log some stats about the incoming data
+        sample_dates = df.select("last_modified").limit(5).collect()
+        logger.info(f"Sample of incoming dates: {sample_dates}")
+        total_incoming = df.select(pl.len()).collect().item()
+        logger.info(f"Total incoming records: {total_incoming}")
+        if latest_update:
+            logger.info(f"Filtering records newer than {latest_update}")
+            df = df.filter(pl.col("last_modified") > latest_update)
+            filtered_count = df.select(pl.len()).collect().item()
+            logger.info(f"Found {filtered_count} records to update after filtering")
+        df = df.collect()
+        total_rows = len(df)
+        if total_rows > 0:
+            logger.info(f"Updating dataset collection with {total_rows} new records")
             logger.info(
+                f"Date range of updates: {df['last_modified'].min()} to {df['last_modified'].max()}"
             )
             for i in range(0, total_rows, BATCH_SIZE):
                 batch_df = df.slice(i, min(BATCH_SIZE, total_rows - i))
+                batch_size = len(batch_df)
+                logger.info(
+                    f"Processing batch {i // BATCH_SIZE + 1}: {batch_size} records "
+                    f"({batch_df['last_modified'].min()} to {batch_df['last_modified'].max()})"
+                )
                 dataset_collection.upsert(
                     ids=batch_df.select(["datasetId"]).to_series().to_list(),
                         )
                     ],
                 )
+                logger.info(f"Processed {i + batch_size:,} / {total_rows:,} records")
+        logger.info(
+            f"Database initialized with {dataset_collection.count():,} total rows"
+        )
         # Load model data
         model_df = pl.scan_parquet(