"""Model manager for handling multiple LLMs in Hugging Face Spaces."""

import logging
import os
from dataclasses import dataclass
from enum import Enum
from typing import Dict, Optional

import huggingface_hub
from llama_cpp import Llama

class ModelType(Enum):
    """Types of models and their specific tasks."""
    REASONING = "reasoning"
    CODE = "code"
    CHAT = "chat"
    PLANNING = "planning"
    ANALYSIS = "analysis"

@dataclass
class ModelConfig:
    """Configuration for a specific model."""
    repo_id: str
    filename: str
    model_type: ModelType
    context_size: int = 4096
    gpu_layers: int = 35
    batch_size: int = 512
    threads: int = 8

class ModelManager:
    """Manages multiple LLM models for different tasks in Spaces."""
    
    def __init__(self, model_dir: Optional[str] = None):
        # In Spaces, models are stored in the cache directory
        self.model_dir = model_dir or os.getenv('SPACE_CACHE_DIR', '/tmp/models')
        self.models: Dict[str, Llama] = {}
        self.logger = logging.getLogger(__name__)
        
        # Define model configurations
        self.model_configs = {
            "reasoning": ModelConfig(
                repo_id="rrbale/pruned-qwen-moe",
                filename="model-Q6_K.gguf",
                model_type=ModelType.REASONING
            ),
            "code": ModelConfig(
                repo_id="YorkieOH10/deepseek-coder-6.7B-kexer-Q8_0-GGUF",
                filename="model.gguf",
                model_type=ModelType.CODE
            ),
            "chat": ModelConfig(
                repo_id="Nidum-Llama-3.2-3B-Uncensored-GGUF",
                filename="model-Q6_K.gguf",
                model_type=ModelType.CHAT
            ),
            "planning": ModelConfig(
                repo_id="deepseek-ai/JanusFlow-1.3B",
                filename="model.gguf",
                model_type=ModelType.PLANNING
            ),
            "analysis": ModelConfig(
                repo_id="prithivMLmods/QwQ-4B-Instruct",
                filename="model.gguf",
                model_type=ModelType.ANALYSIS,
                context_size=8192,
                gpu_layers=40
            ),
            "general": ModelConfig(
                repo_id="gpt-omni/mini-omni2",
                filename="mini-omni2.gguf",
                model_type=ModelType.CHAT
            )
        }
        
        os.makedirs(self.model_dir, exist_ok=True)
        
    async def initialize_model(self, model_key: str) -> Optional[Llama]:
        """Initialize a specific model in Spaces."""
        try:
            config = self.model_configs.get(model_key)
            if config is None:
                self.logger.error(f"Unknown model key: {model_key}")
                return None
            cache_dir = os.path.join(self.model_dir, model_key)
            os.makedirs(cache_dir, exist_ok=True)
            
            # Download the GGUF weights through the Hugging Face Hub cache.
            # Note: hf_hub_download is synchronous, so it blocks the event
            # loop for the duration of the download.
            self.logger.info(f"Downloading {model_key} model...")
            model_path = huggingface_hub.hf_hub_download(
                repo_id=config.repo_id,
                filename=config.filename,
                repo_type="model",
                cache_dir=cache_dir,
                local_dir_use_symlinks=False
            )
            
            # Configure for Spaces GPU environment
            try:
                model = Llama(
                    model_path=model_path,
                    n_ctx=config.context_size,
                    n_batch=config.batch_size,
                    n_threads=config.threads,
                    n_gpu_layers=config.gpu_layers,
                    main_gpu=0,
                    tensor_split=None  # Single-GPU setup: keep all offloaded layers on main_gpu
                )
                self.logger.info(f"{model_key} model loaded with GPU acceleration!")
            except Exception as e:
                self.logger.warning(f"GPU loading failed for {model_key}: {e}, falling back to CPU...")
                model = Llama(
                    model_path=model_path,
                    n_ctx=2048,
                    n_batch=256,
                    n_threads=4,
                    n_gpu_layers=0
                )
                self.logger.info(f"{model_key} model loaded in CPU-only mode")
            
            self.models[model_key] = model
            return model
            
        except Exception as e:
            self.logger.error(f"Error initializing {model_key} model: {e}")
            return None
    
    async def get_model(self, model_key: str) -> Optional[Llama]:
        """Get a model, initializing it if necessary."""
        if model_key not in self.models:
            return await self.initialize_model(model_key)
        return self.models[model_key]
    
    async def initialize_all_models(self):
        """Initialize all configured models."""
        for model_key in self.model_configs.keys():
            await self.initialize_model(model_key)
    
    def get_best_model_for_task(self, task_type: str) -> str:
        """Get the best model key for a specific task type."""
        task_model_mapping = {
            "reasoning": "reasoning",
            "code": "code",
            "chat": "chat",
            "planning": "planning",
            "analysis": "analysis",
            "general": "general"
        }
        return task_model_mapping.get(task_type, "general")
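

# --- Illustrative usage sketch (not part of the original module; prompt and
# generation parameters below are placeholder assumptions) ---
# Shows how the async API above is expected to be driven: pick the configured
# model key for a task, lazily load it, and run a completion through
# llama-cpp-python's callable interface.
if __name__ == "__main__":
    import asyncio

    logging.basicConfig(level=logging.INFO)

    async def _demo() -> None:
        manager = ModelManager()
        model_key = manager.get_best_model_for_task("code")
        model = await manager.get_model(model_key)
        if model is None:
            print(f"Failed to load model for key '{model_key}'")
            return
        # Llama objects are callable and return an OpenAI-style completion dict.
        result = model("def fibonacci(n):", max_tokens=64, temperature=0.2)
        print(result["choices"][0]["text"])

    asyncio.run(_demo())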