nvidia/Llama-3_1-Nemotron-51B-Instruct

Text Generation

Model card | Files and versions | Community

v4.46 support

#7

by itlevy - opened Sep 26

base: refs/heads/main

←

from: refs/pr/7

Discussion Files changed

Files changed (1) hide show

variable_cache.py +11 -9

variable_cache.py CHANGED Viewed

@@ -32,18 +32,20 @@ class VariableCache(Cache_4_44_2, Cache):
     The cache of each layer is allocated to the same gpu as the layer itself.
     """
-    def __init__(self,
-                 config: DeciLMConfig,
-                 max_batch_size: int,
-                 max_cache_len: int | None,
-                 device: torch.device | str | None = None,
-                 dtype: torch.dtype | None = None,
-                 **kwargs: Any,
-                 ):
         Cache_4_44_2.__init__(self)
         self.config = config
-        self.max_batch_size = max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype

     The cache of each layer is allocated to the same gpu as the layer itself.
     """
+    def __init__(
+            self,
+            config: DeciLMConfig,
+            batch_size: int = None,
+            max_cache_len: int = None,
+            device: torch.device = None,
+            dtype: torch.dtype = torch.float32,
+            max_batch_size: Optional[int] = None,
+            **kwargs: Any,
+    ) -> None:
         Cache_4_44_2.__init__(self)
         self.config = config
+        self.max_batch_size = batch_size or max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype