# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from typing import Optional

from torch import nn

from torchtitan.components.tokenizer import Tokenizer
from torchtitan.config_manager import JobConfig
from torchtitan.protocols.train_spec import BaseModelArgs
from torchtitan.tools.logging import logger


@dataclass
class TransformerModelArgs(BaseModelArgs):
    dim: int = 4096
    n_layers: int = 32
    n_heads: int = 32
    n_kv_heads: Optional[int] = None
    vocab_size: int = -1  # defined later by tokenizer
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    ffn_dim_multiplier: Optional[float] = None
    norm_eps: float = 1e-5
    rope_theta: float = 10000
    max_seq_len: int = 2048
    # If `True`, then each transformer block init uses its layer ID, and if
    # `False`, each uses the total number of transformer blocks
    depth_init: bool = True
    norm_type: str = "rmsnorm"
    use_flex_attn: bool = False
    attn_mask_type: str = "causal"
    eos_id: int = 0

    # MoE args
    moe_enabled: bool = True
    num_experts: int = 8
    use_shared_expert: bool = True
    auto_scale_hidden_dim: bool = True
    # frequency of using MoE layer instead of feedforward layer in a transformer block
    interleave_moe_layer_step: int = 2
    # token-choice
    top_k: int = 1
    use_grouped_mm: bool = True  # grouped mm or for-loop for the experts computation

    def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
        self.norm_type = job_config.model.norm_type
        self.vocab_size = tokenizer.n_words
        self.max_seq_len = job_config.training.seq_len
        self.use_flex_attn = job_config.model.use_flex_attn

    def get_nparams_and_flops(
        self, model: nn.Module, seq_len: int
    ) -> tuple[int, float]:
        nparams_embedding = 0
        nparams_moe_router = 0
        nparams_shared_expert = 0
        nparams_experts = 0
        nparams_dense = 0

        for name, p in model.named_parameters():
            if "embedding" in name:
                nparams_embedding += p.numel()
                nparams_dense += p.numel()
            elif "moe.shared_expert" in name:
                nparams_shared_expert += p.numel()
            elif "moe.router" in name:
                nparams_moe_router += p.numel()
            elif "moe.experts" in name:
                nparams_experts += p.numel()
            else:
                nparams_dense += p.numel()

        nparams_sparse = nparams_moe_router + nparams_shared_expert + nparams_experts
        nparams = nparams_dense + nparams_sparse
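
        # Only `top_k` of the `num_experts` routed experts are active for each token,
        # so the active count scales the routed-expert parameters by top_k / num_experts.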
        nparams_sparse_active = (
            nparams_moe_router
            + nparams_shared_expert
            + nparams_experts * self.top_k // self.num_experts
        )

        logger.info(
            f"Total parameter count: dense {nparams_dense:,}, "
            f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}"
        )

        l, h, q, t = (
            self.n_layers,
            self.n_heads,
            self.dim // self.n_heads,
            seq_len,
        )
        # Reasoning behind the factor of 12 for the self-attention part of the formula:
        # 1. each self-attention has 2 matmuls in the forward pass and 4 in the backward (6)
        # 2. flash attention does 1 more matmul recomputation in the backward,
        #    but recomputation should not be counted in calculating MFU (+0)
        # 3. each matmul performs 1 multiplication and 1 addition (*2)
        # 4. we follow the convention and do not account for sparsity in causal attention
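        # The 6x factor on the parameter term counts roughly 2 FLOPs per active,
        # non-embedding parameter per token in the forward pass and 4 in the backward.
        # For example, with the defaults (n_layers=32, n_heads=32, head_dim=4096/32=128,
        # seq_len=2048), the attention term alone is 12 * 32 * 32 * 128 * 2048 ≈ 3.2e9
        # FLOPs per token.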
        num_flops_per_token = (
            6 * (nparams_dense - nparams_embedding + nparams_sparse_active)
            + 12 * l * h * q * t
        )

        return nparams, num_flops_per_token
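

# A minimal, illustrative sketch of how `get_nparams_and_flops` partitions parameters by
# name, assuming TransformerModelArgs can be instantiated directly as a plain dataclass.
# The toy module below (`_ToyMoEBlock`) is hypothetical and not part of torchtitan; it
# only mimics the `embedding`, `moe.router`, `moe.shared_expert`, and `moe.experts`
# naming that the counting logic keys on.
if __name__ == "__main__":

    class _ToyMoEBlock(nn.Module):
        def __init__(self, args: TransformerModelArgs) -> None:
            super().__init__()
            # Parameter names containing "embedding" count as dense (and embedding).
            self.tok_embedding = nn.Embedding(1024, args.dim)
            # Submodules named like the MoE pieces count toward the sparse totals.
            self.moe = nn.Module()
            self.moe.router = nn.Linear(args.dim, args.num_experts, bias=False)
            self.moe.shared_expert = nn.Linear(args.dim, args.dim, bias=False)
            self.moe.experts = nn.ModuleList(
                nn.Linear(args.dim, args.dim, bias=False) for _ in range(args.num_experts)
            )

    toy_args = TransformerModelArgs(dim=256, n_layers=2, n_heads=4, vocab_size=1024)
    nparams, flops_per_token = toy_args.get_nparams_and_flops(
        _ToyMoEBlock(toy_args), seq_len=toy_args.max_seq_len
    )
    print(f"total params: {nparams:,}, approx FLOPs/token: {flops_per_token:,.0f}")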