"""Model manager for handling multiple LLMs in Hugging Face Spaces."""

import logging
import os
from dataclasses import dataclass
from enum import Enum
from typing import Dict, Optional

import huggingface_hub
from llama_cpp import Llama

class ModelType(Enum):
    """Types of models and their specific tasks."""
    REASONING = "reasoning"
    CODE = "code"
    CHAT = "chat"
    PLANNING = "planning"
    ANALYSIS = "analysis"

@dataclass
class ModelConfig:
    """Configuration for a specific model."""
    repo_id: str
    filename: str
    model_type: ModelType
    context_size: int = 4096
    gpu_layers: int = 35
    batch_size: int = 512
    threads: int = 8

class ModelManager:
    """Manages multiple LLM models for different tasks in Spaces."""
    
    def __init__(self, model_dir: Optional[str] = None):
        # In Spaces, models are stored in the cache directory
        self.model_dir = model_dir or os.getenv('SPACE_CACHE_DIR', '/tmp/models')
        self.models: Dict[str, Llama] = {}
        self.logger = logging.getLogger(__name__)
        
        # Define model configurations
        self.model_configs = {
            "reasoning": ModelConfig(
                repo_id="rrbale/pruned-qwen-moe",
                filename="model-Q6_K.gguf",
                model_type=ModelType.REASONING
            ),
            "code": ModelConfig(
                repo_id="YorkieOH10/deepseek-coder-6.7B-kexer-Q8_0-GGUF",
                filename="model.gguf",
                model_type=ModelType.CODE
            ),
            "chat": ModelConfig(
                repo_id="Nidum-Llama-3.2-3B-Uncensored-GGUF",
                filename="model-Q6_K.gguf",
                model_type=ModelType.CHAT
            ),
            "planning": ModelConfig(
                repo_id="deepseek-ai/JanusFlow-1.3B",
                filename="model.gguf",
                model_type=ModelType.PLANNING
            ),
            "analysis": ModelConfig(
                repo_id="prithivMLmods/QwQ-4B-Instruct",
                filename="model.gguf",
                model_type=ModelType.ANALYSIS,
                context_size=8192,
                gpu_layers=40
            ),
            "general": ModelConfig(
                repo_id="gpt-omni/mini-omni2",
                filename="mini-omni2.gguf",
                model_type=ModelType.CHAT
            )
        }
        
        os.makedirs(self.model_dir, exist_ok=True)
        
    async def initialize_model(self, model_key: str) -> Optional[Llama]:
        """Initialize a specific model in Spaces."""
        try:
            config = self.model_configs.get(model_key)
            if config is None:
                self.logger.error(f"Unknown model key: {model_key}")
                return None
            cache_dir = os.path.join(self.model_dir, model_key)
            os.makedirs(cache_dir, exist_ok=True)
            
            # Download the GGUF weights through the Hugging Face Hub cache.
            # Note: hf_hub_download is synchronous, so it blocks the event
            # loop for the duration of the download.
            self.logger.info(f"Downloading {model_key} model...")
            model_path = huggingface_hub.hf_hub_download(
                repo_id=config.repo_id,
                filename=config.filename,
                repo_type="model",
                cache_dir=cache_dir,
                local_dir_use_symlinks=False
            )
            
            # Configure for Spaces GPU environment
            try:
                model = Llama(
                    model_path=model_path,
                    n_ctx=config.context_size,
                    n_batch=config.batch_size,
                    n_threads=config.threads,
                    n_gpu_layers=config.gpu_layers,
                    main_gpu=0,
                    tensor_split=None  # Single-GPU setup: keep all offloaded layers on main_gpu
                )
                self.logger.info(f"{model_key} model loaded with GPU acceleration!")
            except Exception as e:
                self.logger.warning(f"GPU loading failed for {model_key}: {e}, falling back to CPU...")
                model = Llama(
                    model_path=model_path,
                    n_ctx=2048,
                    n_batch=256,
                    n_threads=4,
                    n_gpu_layers=0
                )
                self.logger.info(f"{model_key} model loaded in CPU-only mode")
            
            self.models[model_key] = model
            return model
            
        except Exception as e:
            self.logger.error(f"Error initializing {model_key} model: {e}")
            return None
    
    async def get_model(self, model_key: str) -> Optional[Llama]:
        """Get a model, initializing it if necessary."""
        if model_key not in self.models:
            return await self.initialize_model(model_key)
        return self.models[model_key]
    
    async def initialize_all_models(self):
        """Initialize all configured models."""
        for model_key in self.model_configs.keys():
            await self.initialize_model(model_key)
    
    def get_best_model_for_task(self, task_type: str) -> str:
        """Get the best model key for a specific task type."""
        task_model_mapping = {
            "reasoning": "reasoning",
            "code": "code",
            "chat": "chat",
            "planning": "planning",
            "analysis": "analysis",
            "general": "general"
        }
        return task_model_mapping.get(task_type, "general")
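

# --- Illustrative usage sketch (not part of the original module; prompt and
# generation parameters below are placeholder assumptions) ---
# Shows how the async API above is expected to be driven: pick the configured
# model key for a task, lazily load it, and run a completion through
# llama-cpp-python's callable interface.
if __name__ == "__main__":
    import asyncio

    logging.basicConfig(level=logging.INFO)

    async def _demo() -> None:
        manager = ModelManager()
        model_key = manager.get_best_model_for_task("code")
        model = await manager.get_model(model_key)
        if model is None:
            print(f"Failed to load model for key '{model_key}'")
            return
        # Llama objects are callable and return an OpenAI-style completion dict.
        result = model("def fibonacci(n):", max_tokens=64, temperature=0.2)
        print(result["choices"][0]["text"])

    asyncio.run(_demo())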