File size: 2,484 Bytes
c102e8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config


class KimiAudioConfig(Qwen2Config):
    def __init__(
        self,
        vocab_size=163840,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        rope_theta=10000.0,
        rope_scaling=None,
        tie_word_embeddings=False,
        kimia_mimo_layers: int = 6,
        kimia_mimo_audiodelaytokens: int = 5,
        kimia_mimo_transformer_from_layer_index: int = 21,
        kimia_audio_output_vocab: int = 16896,
        kimia_text_output_vocab: int = 152064,
        num_audio_special_tokens: int = 512,
        num_base_tokens: int = 151643,
        kimia_token_offset: int = 152064,
        use_whisper_feature: bool = True,
        kimia_adaptor_input_dim: int = 5120,
        kimia_media_begin: int = 151661,
        kimia_media_end: int = 151663,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            hidden_act=hidden_act,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            tie_word_embeddings=tie_word_embeddings,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            **kwargs,
        )

        self.kimia_mimo_layers = kimia_mimo_layers
        self.kimia_mimo_audiodelaytokens = kimia_mimo_audiodelaytokens
        # vocab
        self.kimia_mimo_transformer_from_layer_index = (
            kimia_mimo_transformer_from_layer_index
        )
        self.kimia_audio_output_vocab = kimia_audio_output_vocab
        self.kimia_text_output_vocab = kimia_text_output_vocab
        self.num_audio_special_tokens = num_audio_special_tokens
        self.num_base_tokens = num_base_tokens
        self.kimia_token_offset = kimia_token_offset
        self.use_whisper_feature = use_whisper_feature
        self.kimia_adaptor_input_dim = kimia_adaptor_input_dim
        # special tokens
        self.kimia_media_begin = kimia_media_begin
        self.kimia_media_end = kimia_media_end