Commit 6874d8b
Parent: 61f25c3
phase 1 - data storage in qdrant and retrieval
Files changed:
- .gitignore +3 -0
- database/README.md +27 -0
- database/ingest.py +115 -0
- database/qdrant_manager.py +137 -0
- database/requirements.txt +15 -0
- database/test_retrieval.py +93 -0
- database/utils.py +117 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.env
+venv/
+__pycache__/
database/README.md
ADDED
@@ -0,0 +1,27 @@
+# Database Module - Math Agentic RAG
+
+This module handles knowledge base creation and retrieval for the Math Agentic RAG system.
+
+## Files Overview
+
+### Core Files
+- **`utils.py`** - Utility functions for embedding generation and data processing
+- **`qdrant_manager.py`** - Qdrant vector database client wrapper
+- **`ingest.py`** - Main ingestion script for loading the dataset into Qdrant (includes config)
+- **`test_retrieval.py`** - Testing script for validating retrieval functionality (includes config)
+
+### Dependencies
+- **`requirements.txt`** - Python package dependencies
+
+## Usage
+
+1. **Set Up Environment Variables**: Ensure the `.env` file has Qdrant credentials
+2. **Install Dependencies**: `pip install -r requirements.txt`
+3. **Ingest Data**: `python ingest.py`
+4. **Test Retrieval**: `python test_retrieval.py`
+
+## Current Status
+- ✅ Dataset: NuminaMath (5,000 mathematical problems)
+- ✅ Vector DB: Qdrant Cloud
+- ✅ Embedding Model: all-MiniLM-L6-v2 (384 dimensions)
+- ✅ Status: Ready for Phase 2
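
Note: the Usage steps above assume a `.env` file whose variable names match what `ingest.py` and `test_retrieval.py` read. A minimal sketch, with placeholder values rather than real credentials:

# Placeholder values - replace with your own Qdrant Cloud credentials
QDRANT_URL=https://your-cluster-url.qdrant.io:6333
QDRANT_API_KEY=your-qdrant-api-key
QDRANT_COLLECTION=nuinamath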
database/ingest.py
ADDED
@@ -0,0 +1,115 @@
+"""
+Main ingestion script for loading the NuminaMath dataset into Qdrant.
+"""
+import logging
+import os
+from datasets import load_dataset
+from tqdm import tqdm
+import time
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Configuration settings
+QDRANT_URL = os.getenv("QDRANT_URL")
+QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "nuinamath")
+DATASET_NAME = "AI-MO/NuminaMath-CoT"
+DATASET_SPLIT = "train"
+EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+VECTOR_SIZE = 384
+DISTANCE_METRIC = "Cosine"
+BATCH_SIZE = 100
+MAX_SAMPLES = None
+
+# Validation
+if not QDRANT_URL or not QDRANT_API_KEY:
+    raise ValueError("Please set QDRANT_URL and QDRANT_API_KEY in your .env file")
+
+from utils import EmbeddingGenerator, batch_process_dataset
+from qdrant_manager import QdrantManager
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+def main():
+    """Main ingestion pipeline."""
+    try:
+        # Initialize components
+        logger.info("Initializing components...")
+        embedding_generator = EmbeddingGenerator(EMBEDDING_MODEL)
+        qdrant_manager = QdrantManager(QDRANT_URL, QDRANT_API_KEY)
+
+        # Load dataset
+        logger.info(f"Loading dataset: {DATASET_NAME}")
+        if MAX_SAMPLES:
+            dataset = load_dataset(DATASET_NAME, split=f"{DATASET_SPLIT}[:{MAX_SAMPLES}]")
+            logger.info(f"Loaded {len(dataset)} samples (limited)")
+        else:
+            dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
+            logger.info(f"Loaded full dataset: {len(dataset)} samples")
+
+        # Create Qdrant collection
+        logger.info(f"Creating collection: {QDRANT_COLLECTION}")
+        success = qdrant_manager.create_collection(
+            collection_name=QDRANT_COLLECTION,
+            vector_size=VECTOR_SIZE,
+            distance=DISTANCE_METRIC
+        )
+
+        if not success:
+            logger.error("Failed to create collection")
+            return
+
+        # Process dataset in batches
+        logger.info("Processing dataset in batches...")
+        batches = batch_process_dataset(dataset, BATCH_SIZE)
+
+        total_processed = 0
+        total_batches = len(batches)
+
+        for batch_idx, batch_data in enumerate(tqdm(batches, desc="Processing batches")):
+            try:
+                # Extract texts for embedding
+                texts = [item['text'] for item in batch_data]
+
+                # Generate embeddings
+                logger.info(f"Generating embeddings for batch {batch_idx + 1}/{total_batches}")
+                embeddings = embedding_generator.embed_text(texts)
+
+                # Upsert to Qdrant
+                logger.info(f"Uploading batch {batch_idx + 1} to Qdrant...")
+                qdrant_manager.upsert_points(
+                    collection_name=QDRANT_COLLECTION,
+                    points_data=batch_data,
+                    embeddings=embeddings
+                )
+
+                total_processed += len(batch_data)
+                logger.info(f"Progress: {total_processed}/{len(dataset)} items processed")
+
+                # Small delay to avoid overwhelming the API
+                time.sleep(0.5)
+
+            except Exception as e:
+                logger.error(f"Error processing batch {batch_idx + 1}: {e}")
+                continue
+
+        # Final summary
+        logger.info("Ingestion completed!")
+        logger.info(f"Total items processed: {total_processed}")
+
+        # Get collection info
+        collection_info = qdrant_manager.get_collection_info(QDRANT_COLLECTION)
+        if collection_info:
+            logger.info(f"Collection status: {collection_info.status}")
+            logger.info(f"Vectors count: {collection_info.vectors_count}")
+
+    except Exception as e:
+        logger.error(f"Fatal error in ingestion pipeline: {e}")
+        raise
+
+if __name__ == "__main__":
+    main()
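
Note: with `MAX_SAMPLES = None`, the script above embeds the entire NuminaMath-CoT train split, which can take a long time on CPU. For a quick smoke test, one could override the setting, for example:

# Hypothetical smoke-test override (not part of this commit):
MAX_SAMPLES = 500  # ingest only the first 500 samples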
database/qdrant_manager.py
ADDED
@@ -0,0 +1,137 @@
+"""
+Qdrant client wrapper for vector database operations.
+"""
+import logging
+from typing import List, Dict, Any
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+import time
+
+logger = logging.getLogger(__name__)
+
+class QdrantManager:
+    """Manages Qdrant vector database operations."""
+
+    def __init__(self, url: str, api_key: str):
+        """Initialize Qdrant client."""
+        self.client = QdrantClient(url=url, api_key=api_key)
+        logger.info(f"Connected to Qdrant at {url}")
+
+    def create_collection(self, collection_name: str, vector_size: int, distance: str = "Cosine"):
+        """
+        Create a new collection in Qdrant.
+
+        Args:
+            collection_name: Name of the collection
+            vector_size: Dimension of vectors
+            distance: Distance metric (Cosine, Euclidean, Dot)
+        """
+        try:
+            # Check if collection already exists
+            collections = self.client.get_collections().collections
+            existing_names = [col.name for col in collections]
+
+            if collection_name in existing_names:
+                logger.info(f"Collection '{collection_name}' already exists")
+                return True
+
+            # Create new collection
+            distance_map = {
+                "Cosine": Distance.COSINE,
+                "Euclidean": Distance.EUCLID,
+                "Dot": Distance.DOT
+            }
+
+            self.client.create_collection(
+                collection_name=collection_name,
+                vectors_config=VectorParams(
+                    size=vector_size,
+                    distance=distance_map.get(distance, Distance.COSINE)
+                )
+            )
+            logger.info(f"Created collection '{collection_name}' with vector size {vector_size}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error creating collection: {e}")
+            return False
+
+    def upsert_points(self, collection_name: str, points_data: List[Dict[str, Any]],
+                      embeddings: List[List[float]], max_retries: int = 3):
+        """
+        Upsert points into Qdrant collection with retry logic.
+
+        Args:
+            collection_name: Name of the collection
+            points_data: List of point data dictionaries
+            embeddings: List of embedding vectors
+            max_retries: Maximum number of retry attempts
+        """
+        points = []
+        for i, (data, embedding) in enumerate(zip(points_data, embeddings)):
+            point = PointStruct(
+                id=data['id'],
+                vector=embedding,
+                payload={
+                    'problem': data['problem'],
+                    'solution': data['solution'],
+                    'source': data['source']
+                }
+            )
+            points.append(point)
+
+        # Retry logic for network issues
+        for attempt in range(max_retries):
+            try:
+                self.client.upsert(
+                    collection_name=collection_name,
+                    points=points
+                )
+                logger.info(f"Successfully upserted {len(points)} points")
+                return True
+
+            except Exception as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(2 ** attempt)  # Exponential backoff
+                else:
+                    logger.error(f"Failed to upsert points after {max_retries} attempts")
+                    raise e
+
+    def search_similar(self, collection_name: str, query_vector: List[float],
+                       limit: int = 3, score_threshold: float = 0.0):
+        """
+        Search for similar vectors in the collection.
+
+        Args:
+            collection_name: Name of the collection
+            query_vector: Query embedding vector
+            limit: Number of results to return
+            score_threshold: Minimum similarity score
+
+        Returns:
+            Search results from Qdrant
+        """
+        try:
+            results = self.client.search(
+                collection_name=collection_name,
+                query_vector=query_vector,
+                limit=limit,
+                score_threshold=score_threshold
+            )
+            logger.info(f"Found {len(results)} similar results")
+            return results
+
+        except Exception as e:
+            logger.error(f"Error searching collection: {e}")
+            return []
+
+    def get_collection_info(self, collection_name: str):
+        """Get information about a collection."""
+        try:
+            info = self.client.get_collection(collection_name)
+            logger.info(f"Collection info: {info}")
+            return info
+        except Exception as e:
+            logger.error(f"Error getting collection info: {e}")
+            return None
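
For orientation, here is a minimal sketch of using `QdrantManager` together with `EmbeddingGenerator` from `utils.py` outside the ingestion pipeline. The collection name "demo" and the sample problem are illustrative; credentials come from the same `.env` variables as above:

import os
import uuid
from dotenv import load_dotenv
from utils import EmbeddingGenerator
from qdrant_manager import QdrantManager

load_dotenv()
manager = QdrantManager(os.getenv("QDRANT_URL"), os.getenv("QDRANT_API_KEY"))
manager.create_collection("demo", vector_size=384, distance="Cosine")

# Embed one problem/solution pair the same way ingest.py does
gen = EmbeddingGenerator("all-MiniLM-L6-v2")
item = {
    'id': str(uuid.uuid4()),
    'problem': 'Solve 3x + 5 = 20',
    'solution': 'x = 5',
    'source': 'demo',
}
vector = gen.embed_single_text(f"Question: {item['problem']}\nAnswer: {item['solution']}")
manager.upsert_points("demo", [item], [vector])

# Query it back
hits = manager.search_similar("demo", gen.embed_single_text("What is x if 3x + 5 = 20?"), limit=1)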
database/requirements.txt
ADDED
@@ -0,0 +1,15 @@
+# Dataset loading and processing
+datasets==2.18.0
+pandas
+
+# For embedding generation
+sentence-transformers==2.2.2
+
+# For Qdrant client (VectorDB)
+qdrant-client==1.8.0
+
+# For environment variables
+python-dotenv
+
+# For progress tracking
+tqdm
database/test_retrieval.py
ADDED
@@ -0,0 +1,93 @@
+"""
+Test script for retrieving similar math problems from Qdrant.
+"""
+import logging
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Configuration settings
+QDRANT_URL = os.getenv("QDRANT_URL")
+QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "nuinamath")
+EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+
+from utils import EmbeddingGenerator, format_retrieval_results
+from qdrant_manager import QdrantManager
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def test_retrieval():
+    """Test the retrieval system with sample math questions."""
+
+    # Sample test questions
+    test_questions = [
+        "What is the value of x in 3x + 5 = 20?",
+        "How do you find the area of a triangle given 3 sides?",
+        "Solve for y: 2y - 7 = 15",
+        "What is the derivative of x^2 + 3x?",
+        "Find the arithmetic sequence common difference"
+    ]
+
+    try:
+        # Initialize components
+        logger.info("Initializing retrieval system...")
+        embedding_generator = EmbeddingGenerator(EMBEDDING_MODEL)
+        qdrant_manager = QdrantManager(QDRANT_URL, QDRANT_API_KEY)
+
+        # Test each question
+        for i, question in enumerate(test_questions, 1):
+            print(f"\n{'='*60}")
+            print(f"TEST QUERY {i}: {question}")
+            print('='*60)
+
+            # Generate embedding for the question
+            query_embedding = embedding_generator.embed_single_text(question)
+
+            # Search for similar problems
+            results = qdrant_manager.search_similar(
+                collection_name=QDRANT_COLLECTION,
+                query_vector=query_embedding,
+                limit=3,
+                score_threshold=0.1
+            )
+
+            # Format and display results
+            formatted_results = format_retrieval_results(results)
+            print(formatted_results)
+
+    except Exception as e:
+        logger.error(f"Error in retrieval test: {e}")
+
+def test_collection_status():
+    """Check the status of the Qdrant collection."""
+    try:
+        qdrant_manager = QdrantManager(QDRANT_URL, QDRANT_API_KEY)
+
+        print(f"\n{'='*40}")
+        print("COLLECTION STATUS")
+        print('='*40)
+
+        info = qdrant_manager.get_collection_info(QDRANT_COLLECTION)
+        if info:
+            print(f"Collection Name: {QDRANT_COLLECTION}")
+            print(f"Status: {info.status}")
+            print(f"Vectors Count: {info.vectors_count}")
+            print(f"Vector Size: {info.config.params.vectors.size}")
+            print(f"Distance Metric: {info.config.params.vectors.distance}")
+        else:
+            print("Collection not found or error occurred")
+
+    except Exception as e:
+        logger.error(f"Error checking collection status: {e}")
+
+if __name__ == "__main__":
+    print("Testing Qdrant Collection Status...")
+    test_collection_status()
+
+    print("\n\nTesting Retrieval System...")
+    test_retrieval()
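
Given `format_retrieval_results` in `utils.py`, each test query prints result blocks of the following shape. The score and retrieved text below are placeholders; actual values depend on the live collection:

--- Result 1 (Score: 0.7312) ---
Question: <retrieved problem text>
Answer: <first 200 characters of the retrieved solution>...
--------------------------------------------------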
database/utils.py
ADDED
@@ -0,0 +1,117 @@
+"""
+Utility functions for data processing and embedding generation.
+"""
+import logging
+from typing import List, Dict, Any
+from sentence_transformers import SentenceTransformer
+from datasets import Dataset
+import uuid
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class EmbeddingGenerator:
+    """Handles text embedding generation using sentence transformers."""
+
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
+        """Initialize the embedding model."""
+        logger.info(f"Loading embedding model: {model_name}")
+        self.model = SentenceTransformer(model_name)
+        self.model_name = model_name
+
+    def embed_text(self, texts: List[str]) -> List[List[float]]:
+        """Generate embeddings for a list of texts."""
+        logger.info(f"Generating embeddings for {len(texts)} texts")
+        embeddings = self.model.encode(texts, show_progress_bar=True)
+        return embeddings.tolist()
+
+    def embed_single_text(self, text: str) -> List[float]:
+        """Generate embedding for a single text."""
+        embedding = self.model.encode([text])
+        return embedding[0].tolist()
+
+def preprocess_dataset_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Preprocess a single dataset entry to create combined text for embedding.
+
+    Args:
+        entry: Dictionary containing 'problem' and 'solution' keys
+
+    Returns:
+        Processed entry with 'text' field for embedding
+    """
+    problem = entry.get('problem', '')
+    solution = entry.get('solution', '')
+
+    # Create combined text for embedding
+    combined_text = f"Question: {problem}\nAnswer: {solution}"
+
+    return {
+        'id': str(uuid.uuid4()),
+        'text': combined_text,
+        'problem': problem,
+        'solution': solution,
+        'source': entry.get('source', 'unknown')
+    }
+
+def batch_process_dataset(dataset: Dataset, batch_size: int = 100) -> List[List[Dict[str, Any]]]:
+    """
+    Process dataset in batches for memory efficiency.
+
+    Args:
+        dataset: HuggingFace dataset
+        batch_size: Number of items per batch
+
+    Returns:
+        List of batches, each containing processed entries
+    """
+    batches = []
+    total_items = len(dataset)
+
+    logger.info(f"Processing {total_items} items in batches of {batch_size}")
+
+    for i in range(0, total_items, batch_size):
+        batch_end = min(i + batch_size, total_items)
+        batch_data = dataset[i:batch_end]
+
+        # Process each item in the batch
+        processed_batch = []
+        for j in range(len(batch_data['problem'])):
+            entry = {
+                'problem': batch_data['problem'][j],
+                'solution': batch_data['solution'][j],
+                'source': batch_data['source'][j]
+            }
+            processed_entry = preprocess_dataset_entry(entry)
+            processed_batch.append(processed_entry)
+
+        batches.append(processed_batch)
+        logger.info(f"Processed batch {len(batches)}/{(total_items + batch_size - 1) // batch_size}")
+
+    return batches
+
+def format_retrieval_results(results: List[Dict]) -> str:
+    """
+    Format retrieval results for display.
+
+    Args:
+        results: List of search results from Qdrant
+
+    Returns:
+        Formatted string for display
+    """
+    if not results:
+        return "No results found."
+
+    output = []
+    for i, result in enumerate(results, 1):
+        payload = result.payload
+        score = result.score
+
+        output.append(f"\n--- Result {i} (Score: {score:.4f}) ---")
+        output.append(f"Question: {payload['problem']}")
+        output.append(f"Answer: {payload['solution'][:200]}...")  # Truncate long answers
+        output.append("-" * 50)
+
+    return "\n".join(output)
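
To make the stored payload concrete, here is a small sketch of what `preprocess_dataset_entry` returns for a toy entry. The problem text is illustrative, and the id is a fresh random UUID on each call:

from utils import preprocess_dataset_entry

entry = {'problem': 'Solve 2y - 7 = 15', 'solution': 'y = 11', 'source': 'synthetic'}
processed = preprocess_dataset_entry(entry)
# processed['text'] == 'Question: Solve 2y - 7 = 15\nAnswer: y = 11'
# plus 'id' (random UUID) and the original 'problem', 'solution', 'source' fields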