"""Model manager for handling multiple LLMs in Hugging Face Spaces."""
import os
from typing import Dict, Optional
import logging
from dataclasses import dataclass
from enum import Enum
import huggingface_hub
from llama_cpp import Llama


class ModelType(Enum):
    """Types of models and their specific tasks."""

    REASONING = "reasoning"
    CODE = "code"
    CHAT = "chat"
    PLANNING = "planning"
    ANALYSIS = "analysis"


@dataclass
class ModelConfig:
    """Configuration for a specific model."""

    repo_id: str
    filename: str
    model_type: ModelType
    context_size: int = 4096
    gpu_layers: int = 35
    batch_size: int = 512
    threads: int = 8
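
# The dataclass defaults above can be overridden per model. A minimal sketch
# of a low-VRAM, CPU-only config (the repo_id and filename here are
# hypothetical placeholders, not real repositories):
#
#   low_vram = ModelConfig(
#       repo_id="example-org/example-7b-gguf",   # hypothetical repo
#       filename="example-7b-Q4_K_M.gguf",       # hypothetical file
#       model_type=ModelType.CHAT,
#       context_size=2048,   # smaller context to reduce KV-cache memory
#       gpu_layers=0,        # keep all layers on CPU
#       threads=4,
#   )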


class ModelManager:
    """Manages multiple LLM models for different tasks in Spaces."""

    def __init__(self, model_dir: Optional[str] = None):
        # In Spaces, models are stored in the cache directory
        self.model_dir = model_dir or os.getenv('SPACE_CACHE_DIR', '/tmp/models')
        self.models: Dict[str, Llama] = {}
        self.logger = logging.getLogger(__name__)
        # Define model configurations
        self.model_configs = {
            "reasoning": ModelConfig(
                repo_id="rrbale/pruned-qwen-moe",
                filename="model-Q6_K.gguf",
                model_type=ModelType.REASONING
            ),
            "code": ModelConfig(
                repo_id="YorkieOH10/deepseek-coder-6.7B-kexer-Q8_0-GGUF",
                filename="model.gguf",
                model_type=ModelType.CODE
            ),
            "chat": ModelConfig(
                repo_id="Nidum-Llama-3.2-3B-Uncensored-GGUF",
                filename="model-Q6_K.gguf",
                model_type=ModelType.CHAT
            ),
            "planning": ModelConfig(
                repo_id="deepseek-ai/JanusFlow-1.3B",
                filename="model.gguf",
                model_type=ModelType.PLANNING
            ),
            "analysis": ModelConfig(
                repo_id="prithivMLmods/QwQ-4B-Instruct",
                filename="model.gguf",
                model_type=ModelType.ANALYSIS,
                context_size=8192,
                gpu_layers=40
            ),
            "general": ModelConfig(
                repo_id="gpt-omni/mini-omni2",
                filename="mini-omni2.gguf",
                model_type=ModelType.CHAT
            )
        }
        os.makedirs(self.model_dir, exist_ok=True)

    async def initialize_model(self, model_key: str) -> Optional[Llama]:
        """Initialize a specific model in Spaces."""
        try:
            config = self.model_configs[model_key]
            cache_dir = os.path.join(self.model_dir, model_key)
            os.makedirs(cache_dir, exist_ok=True)

            # Download model using HF Hub. Note: hf_hub_download is a
            # synchronous call, so it blocks the event loop despite the
            # async method signature.
            self.logger.info(f"Downloading {model_key} model...")
            model_path = huggingface_hub.hf_hub_download(
                repo_id=config.repo_id,
                filename=config.filename,
                repo_type="model",
                cache_dir=cache_dir,
                local_dir_use_symlinks=False
            )

            # Configure for Spaces GPU environment
            try:
                model = Llama(
                    model_path=model_path,
                    n_ctx=config.context_size,
                    n_batch=config.batch_size,
                    n_threads=config.threads,
                    n_gpu_layers=config.gpu_layers,
                    main_gpu=0,
                    tensor_split=None  # Let it use all available GPU memory
                )
                self.logger.info(f"{model_key} model loaded with GPU acceleration!")
            except Exception as e:
                self.logger.warning(f"GPU loading failed for {model_key}: {e}, falling back to CPU...")
                # Conservative CPU-only fallback settings
                model = Llama(
                    model_path=model_path,
                    n_ctx=2048,
                    n_batch=256,
                    n_threads=4,
                    n_gpu_layers=0
                )
                self.logger.info(f"{model_key} model loaded in CPU-only mode")

            self.models[model_key] = model
            return model
        except Exception as e:
            self.logger.error(f"Error initializing {model_key} model: {e}")
            return None

    async def get_model(self, model_key: str) -> Optional[Llama]:
        """Get a model, initializing it if necessary."""
        if model_key not in self.models:
            return await self.initialize_model(model_key)
        return self.models[model_key]

    async def initialize_all_models(self):
        """Initialize all configured models."""
        for model_key in self.model_configs.keys():
            await self.initialize_model(model_key)

    def get_best_model_for_task(self, task_type: str) -> str:
        """Get the best model key for a specific task type."""
        task_model_mapping = {
            "reasoning": "reasoning",
            "code": "code",
            "chat": "chat",
            "planning": "planning",
            "analysis": "analysis",
            "general": "general"
        }
        return task_model_mapping.get(task_type, "general")
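

# A minimal usage sketch (an assumption, not part of the original module: it
# presumes an asyncio event loop is available and that the configured repos
# actually host the listed GGUF files, which is not verified here):
if __name__ == "__main__":
    import asyncio

    async def _demo():
        manager = ModelManager()
        model_key = manager.get_best_model_for_task("code")
        model = await manager.get_model(model_key)
        if model is not None:
            # llama-cpp-python returns an OpenAI-style completion dict
            result = model("Write a Python hello world.", max_tokens=64)
            print(result["choices"][0]["text"])

    asyncio.run(_demo())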