- Layers 0 and -1 (the first and last decoder layers) are dense; the layers in between use MoE MLPs.
- Each MoE layer has a shared causal convolution expert that is always active.
- Routed experts are selected with top-k noisy routing plus load rebalancing (DeepSeek-style); see the sketch below.
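The router printed in the module tree (`BiBoMoERouter`) only exposes a 1024 → 10 gate projection, so the following is just an illustrative sketch of noisy top-k gating with a Switch/DeepSeek-style load-balancing auxiliary loss. The class name, `top_k`, `noise_std`, and the loss formulation are assumptions for illustration, not the model's actual implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyTopKRouter(nn.Module):
    """Illustrative noisy top-k router with a load-balancing auxiliary loss.

    Only the 1024 -> 10 gate projection matches the printed BiBoMoERouter;
    the noise, top_k, and the Switch-style auxiliary loss are assumptions.
    """

    def __init__(self, hidden_size: int = 1024, num_experts: int = 10,
                 top_k: int = 2, noise_std: float = 1.0):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, num_experts, bias=False)
        self.top_k = top_k
        self.noise_std = noise_std

    def forward(self, hidden_states: torch.Tensor):
        # hidden_states: (num_tokens, hidden_size)
        logits = self.gate_proj(hidden_states)                          # (T, E)
        if self.training and self.noise_std > 0:
            # Noise encourages exploration so experts do not collapse early.
            logits = logits + torch.randn_like(logits) * self.noise_std

        probs = logits.softmax(dim=-1)
        topk_weight, topk_idx = probs.topk(self.top_k, dim=-1)
        # Renormalize so the selected experts' weights sum to 1 per token.
        topk_weight = topk_weight / topk_weight.sum(dim=-1, keepdim=True)

        # Load rebalancing: penalize correlation between the fraction of tokens
        # dispatched to an expert and its mean routing probability.
        num_experts = probs.size(-1)
        dispatch = F.one_hot(topk_idx, num_experts).sum(dim=1).float()  # (T, E)
        aux_loss = num_experts * (dispatch.mean(dim=0) * probs.mean(dim=0)).sum()

        return topk_idx, topk_weight, aux_loss
```

The full module tree as printed by PyTorch: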
BiBoForCausalLM(
  (model): BiBoModel(
    (embed_tokens): Embedding(128000, 1024)
    (layers): ModuleList(
      (0): BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMLP(
          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
      )
      (1-10): 10 x BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMoELayer(
          (routed_experts): ModuleList(
            (0-8): 9 x MLPExpert(
              (gate_proj): Linear(in_features=1024, out_features=512, bias=False)
              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
              (act_fn): SiLU()
            )
            (9): IdentityExpert()
          )
          (shared_experts_list): ModuleList(
            (0): ModifiedConvolutionalExpert(
              (gate_conv): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), bias=False)
              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
              (act_fn): SiLU()
            )
          )
          (gate): BiBoMoERouter(
            (gate_proj): Linear(in_features=1024, out_features=10, bias=False)
          )
        )
      )
      (11): BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMLP(
          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
      )
    )
    (norm): BiBoRMSNorm((1024,), eps=1e-06)
    (rotary_emb): BiBoRotaryEmbedding()
  )
  (lm_head): Linear(in_features=1024, out_features=128000, bias=False)
)
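The most distinctive piece of the MoE block is the shared `ModifiedConvolutionalExpert`, whose gate branch is a `Conv1d(1024, 512, kernel_size=3)` over the sequence dimension rather than a pointwise linear. The printed shapes only fix the layer dimensions, so the sketch below is an assumed implementation: causality is obtained by left-padding the sequence by `kernel_size - 1`, and the combine step (top-k routed experts, one of which is the no-op `IdentityExpert`, plus the always-on shared expert) follows a generic shared-expert MoE pattern, not confirmed BiBo source code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalConvExpert(nn.Module):
    """Sketch of the shared ModifiedConvolutionalExpert (behaviour assumed).

    SwiGLU-style expert whose gate branch is a causal Conv1d over the sequence
    dimension; dimensions match the printed module (1024 -> 512 -> 1024).
    """

    def __init__(self, hidden_size: int = 1024, intermediate_size: int = 512,
                 kernel_size: int = 3):
        super().__init__()
        self.kernel_size = kernel_size
        self.gate_conv = nn.Conv1d(hidden_size, intermediate_size,
                                   kernel_size=kernel_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, hidden). Conv1d wants (batch, channels, seq_len);
        # left-padding by kernel_size - 1 keeps the convolution causal.
        conv_in = F.pad(x.transpose(1, 2), (self.kernel_size - 1, 0))
        gate = self.act_fn(self.gate_conv(conv_in)).transpose(1, 2)
        return self.down_proj(gate * self.up_proj(x))


def moe_layer_forward(x, routed_experts, shared_expert, router):
    """Assumed combine: weighted sum of the top-k routed experts (index 9 is
    the no-op IdentityExpert) plus the always-active shared conv expert."""
    bsz, seq_len, hidden = x.shape
    flat = x.reshape(-1, hidden)
    topk_idx, topk_weight, aux_loss = router(flat)  # e.g. the router sketch above
    routed_out = torch.zeros_like(flat)
    for slot in range(topk_idx.size(-1)):
        for e, expert in enumerate(routed_experts):
            mask = topk_idx[:, slot] == e
            if mask.any():
                routed_out[mask] += topk_weight[mask, slot, None] * expert(flat[mask])
    routed_out = routed_out.reshape(bsz, seq_len, hidden)
    return routed_out + shared_expert(x), aux_loss
```

The `IdentityExpert` presumably gives the router a cheap opt-out: a token whose top-k selection lands on expert 9 effectively skips that slice of routed computation.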