|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""MegrezO model configuration""" |
|
|
|
from typing import Optional |
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.models.llama.configuration_llama import LlamaConfig |
|
from transformers.utils import logging |
|
|
|
from .modeling_navit_siglip import SiglipVisionConfig |
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
class AudioConfig(PretrainedConfig):
    """Hyper-parameter container for the audio component of MegrezO.

    Each named constructor argument is stored unchanged as an instance
    attribute of the same name. Any additional keyword arguments are
    forwarded to ``PretrainedConfig.__init__``.

    NOTE(review): the meaning of the individual fields is not defined in
    this file. The names suggest mel-bin count (``n_mels``), context length
    (``n_ctx``), hidden width (``n_state``), attention heads (``n_head``),
    layer count (``n_layer``) and projection width (``output_dim``) --
    confirm against the audio encoder that consumes this config.

    Args:
        n_mels: Stored as ``self.n_mels``.
        n_ctx: Stored as ``self.n_ctx``.
        n_state: Stored as ``self.n_state``.
        n_head: Stored as ``self.n_head``.
        n_layer: Stored as ``self.n_layer``.
        output_dim: Stored as ``self.output_dim``.
        avg_pool: Stored as ``self.avg_pool``.
        add_audio_bos_eos_token: Stored as ``self.add_audio_bos_eos_token``.
        **kwargs: Passed through to the parent constructor.
    """

    # NOTE(review): this is the same string as MegrezOConfig.model_type.
    model_type = "megrezo"

    def __init__(
        self,
        n_mels: int = 128,
        n_ctx: int = 1500,
        n_state: int = 1280,
        n_head: int = 20,
        n_layer: int = 32,
        output_dim: int = 2560,
        avg_pool: bool = True,
        add_audio_bos_eos_token: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Assigned in signature order so the attribute insertion order on the
        # instance is the same as with one explicit assignment per field.
        audio_fields = (
            ("n_mels", n_mels),
            ("n_ctx", n_ctx),
            ("n_state", n_state),
            ("n_head", n_head),
            ("n_layer", n_layer),
            ("output_dim", output_dim),
            ("avg_pool", avg_pool),
            ("add_audio_bos_eos_token", add_audio_bos_eos_token),
        )
        for field_name, field_value in audio_fields:
            setattr(self, field_name, field_value)
|
|
|
|
|
class MegrezOConfig(LlamaConfig):
    """Configuration for MegrezO: a ``LlamaConfig`` plus audio and vision sub-configs.

    All keyword arguments other than ``audio_config`` and ``vision_config``
    are forwarded to ``LlamaConfig.__init__``. After construction the instance
    always carries ``audio_config`` (an ``AudioConfig``) and ``vision_config``
    (a ``SiglipVisionConfig``).

    Args:
        audio_config: ``None`` to build an ``AudioConfig`` from
            ``_default_audio_config``, a ``dict`` of ``AudioConfig`` keyword
            arguments, or a ready ``AudioConfig`` instance (used as-is).
        vision_config: ``None`` to build a ``SiglipVisionConfig`` from
            ``_default_vision_config``, a ``dict`` of ``SiglipVisionConfig``
            keyword arguments, or a ready ``SiglipVisionConfig`` instance
            (used as-is).
        **kwargs: Passed through to ``LlamaConfig.__init__``.

    Raises:
        TypeError: If ``audio_config`` or ``vision_config`` is not ``None``,
            a ``dict``, or an instance of the expected config class.
    """

    model_type = "megrezo"
    keys_to_ignore_at_inference = ["past_key_values"]
    is_composition = True

    # Keyword arguments used to build ``audio_config`` when none is supplied.
    _default_audio_config = {
        "n_mels": 128,
        "n_ctx": 1500,
        "n_state": 1280,
        "n_head": 20,
        "n_layer": 32,
        "output_dim": 2560,
        "avg_pool": True,
        "add_audio_bos_eos_token": True,
    }

    # Keyword arguments used to build ``vision_config`` when none is supplied.
    _default_vision_config = {
        "intermediate_size": 4304,
        "num_hidden_layers": 27,
        "num_attention_heads": 16,
        "image_size": 980,
        "hidden_size": 1152,
        "patch_size": 16,
        "model_type": "siglip_vision_model",
    }

    def __init__(
        self,
        audio_config: Optional[AudioConfig] = None,
        vision_config: Optional[SiglipVisionConfig] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.audio_config = self._resolve_sub_config(
            "audio_config", audio_config, AudioConfig, self._default_audio_config
        )
        self.vision_config = self._resolve_sub_config(
            "vision_config", vision_config, SiglipVisionConfig, self._default_vision_config
        )

    @staticmethod
    def _resolve_sub_config(name, value, config_cls, defaults):
        """Turn ``value`` (None, dict, or ``config_cls`` instance) into a ``config_cls`` instance."""
        if value is None:
            return config_cls(**defaults)
        if isinstance(value, dict):
            return config_cls(**value)
        if isinstance(value, config_cls):
            return value
        # Previously an unsupported type fell through every branch and left the
        # attribute unset; fail here, where the bad argument is still visible.
        raise TypeError(
            f"`{name}` must be None, a dict, or an instance of "
            f"{config_cls.__name__}; got {type(value).__name__}."
        )
|
|