update modeling_sarashina2_vision.py
modeling_sarashina2_vision.py
CHANGED
@@ -70,7 +70,6 @@ class Sarashina2VisionForCausalLM(Sarashina2VisionPreTrainedModel, GenerationMixin):
         self.visual = Qwen2VisionTransformerPretrainedModel._from_config(config.vision_config)
         self.norm = nn.LayerNorm(config.text_config.hidden_size)
         self.llm = LlamaForCausalLM._from_config(config.text_config)
-        self._attn_implementation = config._attn_implementation

         # Initialize weights and apply final processing
         self.post_init()
@@ -113,6 +112,7 @@ class Sarashina2VisionForCausalLM(Sarashina2VisionPreTrainedModel, GenerationMixin):
         pixel_values: torch.FloatTensor = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
         **lm_kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         """
@@ -130,6 +130,11 @@ class Sarashina2VisionForCausalLM(Sarashina2VisionPreTrainedModel, GenerationMixin):
             pixel_values (torch.FloatTensor, optional): The tensors corresponding to the input images. Defaults to None.
             image_grid_thw (Optional[torch.LongTensor], optional): The temporal, height and width of feature shape of each image in LLM. Defaults to None.
             cache_position (Optional[torch.LongTensor], optional): Indices depicting the position of the input sequence tokens in the sequence. Defaults to None.
+            logits_to_keep (Union[int, torch.Tensor]): If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+                This is useful when using packed tensor format (single dimension for batch and sequence length).
        Returns:
            CausalLMOutputWithPast: The output of the model.
        """
@@ -173,6 +178,7 @@ class Sarashina2VisionForCausalLM(Sarashina2VisionPreTrainedModel, GenerationMixin):
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
            **lm_kwargs,
        )
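For context on what the new argument enables (not part of this commit): a minimal, hypothetical sketch of a manual forward pass that keeps only the last position's logits. The checkpoint id, the use of AutoTokenizer, and the text-only call are assumptions for illustration; real use would go through the model's processor with images.

# Hypothetical usage sketch; checkpoint id and tokenizer loading are assumptions, not from this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "sbintuitions/sarashina2-vision-8b"  # assumed Sarashina2Vision checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

input_ids = tokenizer("A photo of", return_tensors="pt").input_ids
with torch.no_grad():
    # logits_to_keep=1 materializes logits only for the final position, so
    # out.logits has shape (batch, 1, vocab_size) instead of (batch, seq_len, vocab_size).
    out = model(input_ids=input_ids, logits_to_keep=1)
next_token_id = out.logits[:, -1].argmax(dim=-1)

With the default logits_to_keep=0, logits are still computed for the whole sequence, so existing callers are unaffected.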