Enable `cache_params` to work with `generate()` from `GenerationMixin`
IMPORTANT: The cache doesn't seem to work very well in my tests, and given that it was disabled and still contained a `breakpoint()` call, I assume it simply wasn't ready yet. Even so, enabling it matters for understanding how fast the model can run when the cache is used.
In my tests, the model in Q4 BF16 goes from generating about 3.3 tok/s to about 19.3 tok/s on the same RTX 5090. I understand that ideally this model should be run in native FP4 on the 5090, but that is not yet supported in PyTorch, so I guess it is only possible with the NeMo engine for now.
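For context, this is roughly how I measured throughput. Treat it as a sketch rather than the exact script: the checkpoint id, prompt, and token count below are placeholders, not the values from my runs.

```python
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint id; substitute the Nemotron-H checkpoint being tested.
model_id = "nvidia/Nemotron-H-8B-Base-8K"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda", trust_remote_code=True
)

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

for use_cache in (False, True):
    torch.cuda.synchronize()
    start = time.time()
    out = model.generate(**inputs, max_new_tokens=128, use_cache=use_cache)
    torch.cuda.synchronize()
    new_tokens = out.shape[1] - inputs["input_ids"].shape[1]
    print(f"use_cache={use_cache}: {new_tokens / (time.time() - start):.1f} tok/s")
```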
- modeling_nemotron_h.py +20 -15
```diff
--- a/modeling_nemotron_h.py
+++ b/modeling_nemotron_h.py
@@ -31,6 +31,9 @@ from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
 )
 from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_outputs import (
+    MoeCausalLMOutputWithPast,
+)
 from transformers.utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -168,12 +171,14 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
 
     def __init__(self, config, batch_size, dtype=torch.float16, device=None):
         super().__init__()
+        self.device = device
         self.dtype = dtype
         self.hybrid_override_pattern = config.hybrid_override_pattern
         self.has_previous_state = False  # only used by mamba
-        intermediate_size = config.expand * config.hidden_size
-        ssm_state_size = config.ssm_state_size
-        conv_kernel_size = config.conv_kernel
+        self.intermediate_size = config.expand * config.hidden_size
+        self.ssm_state_size = config.ssm_state_size
+        self.conv_kernel_size = config.conv_kernel
+        self.conv_dim = self.intermediate_size + 2 * config.n_groups * config.ssm_state_size
         self.conv_states = []
         self.ssm_states = []
         self.transformer_layers = []
@@ -181,10 +186,10 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
             if self.hybrid_override_pattern[i] == "M":
                 # Mamba layer
                 self.conv_states += [
-                    torch.zeros(batch_size,
+                    torch.zeros(batch_size, self.conv_dim, self.conv_kernel_size, device=device, dtype=dtype)
                 ]
                 self.ssm_states += [
-                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
+                    torch.zeros(batch_size, self.intermediate_size, self.ssm_state_size, device=device, dtype=dtype)
                 ]
             else:
                 # Attention or MLP layer
@@ -245,14 +250,14 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
         self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False
     ) -> torch.Tensor:
         if cache_init:
-            self.conv_states[layer_idx] = new_conv_state.to(self.
+            self.conv_states[layer_idx] = new_conv_state.to(self.device)
         else:
             self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1)
-            self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states.device)
+            self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states[layer_idx].device)
         return self.conv_states[layer_idx]
 
     def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor):
-        self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states.device)
+        self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device)
         return self.ssm_states[layer_idx]
 
     def reset(self):
@@ -413,7 +418,7 @@ class NemotronHMamba2Mixer(nn.Module):
                 dt_softplus=True,
             )
             hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
-            breakpoint()
+            # TODO: why was there a breakpoint() call here?
             hidden_states = self.norm(hidden_states, gate)
 
             # 4. Final linear projection
@@ -560,7 +565,7 @@ class NemotronHMamba2Mixer(nn.Module):
         A = -torch.exp(self.A_log.float())  # [num_heads]
         if cache_params is not None and cache_position is not None and cache_position[0] > 0:
             # We need to guarantee that anything regarding the cache is on the same device
-            cache_device = cache_params.ssm_states.device
+            cache_device = cache_params.ssm_states[0].device if len(cache_params.ssm_states) > 0 else cache_params.device
 
             # Note: there is no need to pad parameter matrices here, as there is just one new token
             # for batched generation
@@ -1185,7 +1190,7 @@ class NemotronHOutput(ModelOutput):
 
 @dataclass
 # Copied from transformers.models.mamba2.modeling_mamba2.MambaCausalLMOutput with Mamba2->NemotronH
-class NemotronHCausalLMOutput(ModelOutput):
+class NemotronHCausalLMOutput(MoeCausalLMOutputWithPast):
     """
     Base class for causal language model (or autoregressive) outputs.
 
@@ -1208,7 +1213,7 @@ class NemotronHCausalLMOutput(ModelOutput):
 
     loss: Optional[torch.FloatTensor] = None
     logits: Optional[torch.FloatTensor] = None
-    cache_params: Optional[HybridMambaAttentionDynamicCache] = None
+    past_key_values: Optional[HybridMambaAttentionDynamicCache] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 
@@ -1568,7 +1573,7 @@ class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
         input_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
@@ -1593,7 +1598,7 @@ class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
 
         nemotron_h_outputs = self.backbone(
             input_ids,
-            cache_params=cache_params,
+            cache_params=past_key_values,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
@@ -1626,7 +1631,7 @@ class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
         return NemotronHCausalLMOutput(
             loss=loss,
             logits=logits,
-            cache_params=nemotron_h_outputs.cache_params,
+            past_key_values=nemotron_h_outputs.cache_params,
             hidden_states=nemotron_h_outputs.hidden_states,
             attentions=nemotron_h_outputs.attentions,
         )
```
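The rename from `cache_params` to `past_key_values` in the forward signature and in `NemotronHCausalLMOutput` is what lets `GenerationMixin.generate()` pick the cache up from one step and feed it back into the next without custom plumbing. One other detail worth calling out: `self.conv_states` and `self.ssm_states` are plain Python lists, so `.device` only exists on the per-layer tensors, not on the list itself, hence the `[layer_idx]` / `[0]` indexing in the updated code. A minimal, self-contained sketch of that pattern (the tensor shapes here are made up for illustration):

```python
import torch

# Per-layer conv states, mimicking HybridMambaAttentionDynamicCache.conv_states:
# one (batch, conv_dim, conv_kernel_size) tensor per Mamba layer.
conv_states = [torch.zeros(1, 8, 4) for _ in range(2)]

# conv_states.device would raise AttributeError: a list has no .device,
# which is why the old cache code failed once it was actually exercised.

layer_idx = 0
new_conv_state = torch.randn(1, 1, 8)  # one decode step, shape (batch, 1, conv_dim)

# Shift the rolling window left by one and write the new column,
# staying on the device of that layer's tensor (as in the fix).
conv_states[layer_idx] = conv_states[layer_idx].roll(shifts=-1, dims=-1)
conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(conv_states[layer_idx].device)
print(conv_states[layer_idx].shape)  # torch.Size([1, 8, 4])
```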