Spaces:
Runtime error
Runtime error
"""Model manager for handling multiple LLMs in Hugging Face Spaces.""" | |
import os | |
from typing import Dict, Any, Optional, List | |
import logging | |
from dataclasses import dataclass | |
from enum import Enum | |
import huggingface_hub | |
from llama_cpp import Llama | |
class ModelType(Enum): | |
"""Types of models and their specific tasks.""" | |
REASONING = "reasoning" | |
CODE = "code" | |
CHAT = "chat" | |
PLANNING = "planning" | |
ANALYSIS = "analysis" | |
class ModelConfig: | |
"""Configuration for a specific model.""" | |
repo_id: str | |
filename: str | |
model_type: ModelType | |
context_size: int = 4096 | |
gpu_layers: int = 35 | |
batch_size: int = 512 | |
threads: int = 8 | |
class ModelManager: | |
"""Manages multiple LLM models for different tasks in Spaces.""" | |
def __init__(self, model_dir: Optional[str] = None): | |
# In Spaces, models are stored in the cache directory | |
self.model_dir = model_dir or os.getenv('SPACE_CACHE_DIR', '/tmp/models') | |
self.models: Dict[str, Llama] = {} | |
self.logger = logging.getLogger(__name__) | |
# Define model configurations | |
self.model_configs = { | |
"reasoning": ModelConfig( | |
repo_id="rrbale/pruned-qwen-moe", | |
filename="model-Q6_K.gguf", | |
model_type=ModelType.REASONING | |
), | |
"code": ModelConfig( | |
repo_id="YorkieOH10/deepseek-coder-6.7B-kexer-Q8_0-GGUF", | |
filename="model.gguf", | |
model_type=ModelType.CODE | |
), | |
"chat": ModelConfig( | |
repo_id="Nidum-Llama-3.2-3B-Uncensored-GGUF", | |
filename="model-Q6_K.gguf", | |
model_type=ModelType.CHAT | |
), | |
"planning": ModelConfig( | |
repo_id="deepseek-ai/JanusFlow-1.3B", | |
filename="model.gguf", | |
model_type=ModelType.PLANNING | |
), | |
"analysis": ModelConfig( | |
repo_id="prithivMLmods/QwQ-4B-Instruct", | |
filename="model.gguf", | |
model_type=ModelType.ANALYSIS, | |
context_size=8192, | |
gpu_layers=40 | |
), | |
"general": ModelConfig( | |
repo_id="gpt-omni/mini-omni2", | |
filename="mini-omni2.gguf", | |
model_type=ModelType.CHAT | |
) | |
} | |
os.makedirs(self.model_dir, exist_ok=True) | |
async def initialize_model(self, model_key: str) -> Optional[Llama]: | |
"""Initialize a specific model in Spaces.""" | |
try: | |
config = self.model_configs[model_key] | |
cache_dir = os.path.join(self.model_dir, model_key) | |
os.makedirs(cache_dir, exist_ok=True) | |
# Download model using HF Hub | |
self.logger.info(f"Downloading {model_key} model...") | |
model_path = huggingface_hub.hf_hub_download( | |
repo_id=config.repo_id, | |
filename=config.filename, | |
repo_type="model", | |
cache_dir=cache_dir, | |
local_dir_use_symlinks=False | |
) | |
# Configure for Spaces GPU environment | |
try: | |
model = Llama( | |
model_path=model_path, | |
n_ctx=config.context_size, | |
n_batch=config.batch_size, | |
n_threads=config.threads, | |
n_gpu_layers=config.gpu_layers, | |
main_gpu=0, | |
tensor_split=None # Let it use all available GPU memory | |
) | |
self.logger.info(f"{model_key} model loaded with GPU acceleration!") | |
except Exception as e: | |
self.logger.warning(f"GPU loading failed for {model_key}: {e}, falling back to CPU...") | |
model = Llama( | |
model_path=model_path, | |
n_ctx=2048, | |
n_batch=256, | |
n_threads=4, | |
n_gpu_layers=0 | |
) | |
self.logger.info(f"{model_key} model loaded in CPU-only mode") | |
self.models[model_key] = model | |
return model | |
except Exception as e: | |
self.logger.error(f"Error initializing {model_key} model: {e}") | |
return None | |
async def get_model(self, model_key: str) -> Optional[Llama]: | |
"""Get a model, initializing it if necessary.""" | |
if model_key not in self.models: | |
return await self.initialize_model(model_key) | |
return self.models[model_key] | |
async def initialize_all_models(self): | |
"""Initialize all configured models.""" | |
for model_key in self.model_configs.keys(): | |
await self.initialize_model(model_key) | |
def get_best_model_for_task(self, task_type: str) -> str: | |
"""Get the best model key for a specific task type.""" | |
task_model_mapping = { | |
"reasoning": "reasoning", | |
"code": "code", | |
"chat": "chat", | |
"planning": "planning", | |
"analysis": "analysis", | |
"general": "general" | |
} | |
return task_model_mapping.get(task_type, "general") | |