cleanup-inference-code #2
Files changed (4)
  1. layers.py +7 -6
  2. moondream.py +12 -14
  3. text.py +8 -9
  4. weights.py +19 -77
layers.py CHANGED
@@ -36,26 +36,27 @@ class QuantizedLinear(nn.Module):
         self,
         in_features: int,
         out_features: int,
-        dtype: torch.dtype,
+        group_size: int = 128,
+        dtype: torch.dtype = torch.uint8,
     ):
-        # TODO: Take group_size as an input instead of hardcoding it here.
         super().__init__()
         self.in_features = in_features
         self.out_features = out_features
+        self.group_size = group_size
         self.weight = nn.ParameterDict(
             {
                 "packed": nn.Parameter(
                     torch.empty(
-                        out_features * in_features // (128 * 2), 128, dtype=torch.uint8
+                        out_features * in_features // (group_size * 2), group_size, dtype=dtype
                     ),
                     requires_grad=False,
                 ),
                 "scale": nn.Parameter(
-                    torch.empty(out_features * in_features // 128, 1),
+                    torch.empty(out_features * in_features // group_size, 1),
                     requires_grad=False,
                 ),
                 "zero_point": nn.Parameter(
-                    torch.empty(out_features * in_features // 128, 1),
+                    torch.empty(out_features * in_features // group_size, 1),
                     requires_grad=False,
                 ),
             }
@@ -86,7 +87,7 @@ class QuantizedLinear(nn.Module):
         )
 
         del self.weight, self.bias
-        quantize_(self, int4_weight_only(group_size=128))
+        quantize_(self, int4_weight_only(group_size=self.group_size))
         self.unpacked = True
         torch.cuda.empty_cache()
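The shape bookkeeping behind the new group_size parameter is easy to sanity-check; below is a minimal sketch with hypothetical sizes (the defaults group_size=128 and dtype=torch.uint8 reproduce the previously hard-coded layout):

```python
import torch

# Hypothetical layer dimensions; any sizes divisible by the group size work.
out_features, in_features, group_size = 2048, 2048, 128

# Two int4 values are packed per uint8 byte, so the packed buffer holds
# out_features * in_features // (group_size * 2) rows of group_size bytes.
packed = torch.empty(
    out_features * in_features // (group_size * 2), group_size, dtype=torch.uint8
)

# One scale and one zero point per quantization group of `group_size` weights.
scale = torch.empty(out_features * in_features // group_size, 1)

assert packed.numel() * 2 == out_features * in_features  # two int4 per byte
assert scale.shape[0] * group_size == out_features * in_features
```

Threading the same value into quantize_(self, int4_weight_only(group_size=self.group_size)) keeps the unpacking step consistent with whatever group size the buffers were allocated for.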
 
moondream.py CHANGED
@@ -77,38 +77,36 @@ class MoondreamModel(nn.Module):
         self.vision = build_vision_model(config.vision, dtype)
         self.text = build_text_model(config.text, dtype)
 
-        # Region Model
-        linear_cls = (
-            QuantizedLinear if config.region.group_size is not None else nn.Linear
-        )
         self.region = nn.ModuleDict(
             {
-                "coord_encoder": linear_cls(
-                    config.region.coord_feat_dim, config.region.dim, dtype=dtype
+                "coord_encoder": QuantizedLinear(
+                    config.region.coord_feat_dim, config.region.dim, group_size=config.text.group_size, dtype=dtype
                 ),
                 "coord_decoder": nn.ModuleDict(
                     {
-                        "fc1": linear_cls(
-                            config.region.dim, config.region.inner_dim, dtype=dtype
+                        "fc1": QuantizedLinear(
+                            config.region.dim, config.region.inner_dim, group_size=config.text.group_size, dtype=dtype
                         ),
-                        "fc2": linear_cls(
+                        "fc2": QuantizedLinear(
                             config.region.inner_dim,
                             config.region.coord_out_dim,
+                            group_size=config.text.group_size,
                             dtype=dtype,
                         ),
                     }
                 ),
-                "size_encoder": linear_cls(
-                    config.region.size_feat_dim, config.region.dim, dtype=dtype
+                "size_encoder": QuantizedLinear(
+                    config.region.size_feat_dim, config.region.dim, group_size=config.text.group_size, dtype=dtype
                 ),
                 "size_decoder": nn.ModuleDict(
                     {
-                        "fc1": linear_cls(
-                            config.region.dim, config.region.inner_dim, dtype=dtype
+                        "fc1": QuantizedLinear(
+                            config.region.dim, config.region.inner_dim, group_size=config.text.group_size, dtype=dtype
                         ),
-                        "fc2": linear_cls(
+                        "fc2": QuantizedLinear(
                             config.region.inner_dim,
                             config.region.size_out_dim,
+                            group_size=config.text.group_size,
                             dtype=dtype,
                         ),
                     }
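Every region-head projection is now constructed the same way, so the repeated pattern above can be read as the following helper (hypothetical, not part of the PR; it assumes QuantizedLinear is importable from layers.py via the package-relative path used elsewhere in the repo):

```python
import torch
import torch.nn as nn

from .layers import QuantizedLinear  # assumed package-relative import path


def region_linear(in_features: int, out_features: int, config, dtype: torch.dtype) -> nn.Module:
    # All region layers quantize with the text model's group size instead of
    # branching on config.region.group_size between QuantizedLinear and nn.Linear.
    return QuantizedLinear(
        in_features, out_features, group_size=config.text.group_size, dtype=dtype
    )
```

One consequence worth flagging in review: the region head is now always quantized, since the nn.Linear fallback gated by config.region.group_size is gone.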
text.py CHANGED
@@ -152,9 +152,8 @@ def _lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
     return logits
 
 
-def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
+def build_text_model(config: TextConfig, dtype: torch.dtype = torch.float16) -> nn.Module:
     qkv_dim = int(config.dim * (1 + 2 * config.n_kv_heads / config.n_heads))
-    linear_cls = QuantizedLinear if config.group_size is not None else nn.Linear
 
     text = nn.ModuleDict(
         {
@@ -165,19 +164,19 @@ def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
                     "ln": nn.LayerNorm(config.dim, dtype=dtype),
                     "attn": nn.ModuleDict(
                         {
-                            "qkv": linear_cls(config.dim, qkv_dim, dtype=dtype),
-                            "proj": linear_cls(
-                                config.dim, config.dim, dtype=dtype
+                            "qkv": QuantizedLinear(config.dim, qkv_dim, group_size=config.group_size, dtype=dtype),
+                            "proj": QuantizedLinear(
+                                config.dim, config.dim, group_size=config.group_size, dtype=dtype
                             ),
                         }
                     ),
                     "mlp": nn.ModuleDict(
                         {
-                            "fc1": linear_cls(
-                                config.dim, config.ff_dim, dtype=dtype
+                            "fc1": QuantizedLinear(
+                                config.dim, config.ff_dim, group_size=config.group_size, dtype=dtype
                             ),
-                            "fc2": linear_cls(
-                                config.ff_dim, config.dim, dtype=dtype
+                            "fc2": QuantizedLinear(
+                                config.ff_dim, config.dim, group_size=config.group_size, dtype=dtype
                             ),
                         }
                     ),
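A usage sketch of the new build_text_model signature; the imports mirror the ones removed from weights.py, and the default-constructed TextConfig is an assumption for illustration:

```python
import torch

from .config import TextConfig  # same import path weights.py used before
from .text import build_text_model

config = TextConfig()  # hypothetical: assumes usable defaults

text = build_text_model(config)                             # dtype now defaults to float16
text_fp16 = build_text_model(config, dtype=torch.float16)   # equivalent explicit call

# Every qkv/proj/fc1/fc2 projection is a QuantizedLinear built with
# group_size=config.group_size; the nn.Linear fallback is gone.
```

As in moondream.py, this assumes config.group_size is always set; a config with group_size=None previously selected nn.Linear, and that branch no longer exists.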
weights.py CHANGED
@@ -6,9 +6,6 @@ import re
 from contextlib import contextmanager
 from typing import Callable, List
 
-from .text import build_text_model
-from .config import TextConfig
-
 
 # Our custom linear has an module named linear, so we add linear to the name
 def add_linear_to_key(k: str) -> str:
@@ -46,7 +43,6 @@ def safetensors_open(safetensors_file: str):
 def _load_weights(
     get_tensor: Callable[[str], torch.Tensor],
     model: nn.Module,
-    is_quantized: bool = False,
 ) -> None:
     """Internal function to load weights using a tensor getter function."""
     model = model.to(dtype=torch.float16)
@@ -111,42 +107,23 @@ def _load_weights(
         }
     )
 
-    if not is_quantized:
-        for i in range(len(model.text["blocks"])):
-            prefix = f"text_model.transformer.h.{i}"
-            blk = model.text["blocks"][i]
-            weight_map.update(
-                {
-                    f"{prefix}.ln.weight": blk["ln"].weight,
-                    f"{prefix}.ln.bias": blk["ln"].bias,
-                    f"{prefix}.mixer.Wqkv.weight": blk["attn"]["qkv"].weight,
-                    f"{prefix}.mixer.Wqkv.bias": blk["attn"]["qkv"].bias,
-                    f"{prefix}.mixer.out_proj.weight": blk["attn"]["proj"].weight,
-                    f"{prefix}.mixer.out_proj.bias": blk["attn"]["proj"].bias,
-                    f"{prefix}.mlp.fc1.weight": blk["mlp"]["fc1"].weight,
-                    f"{prefix}.mlp.fc1.bias": blk["mlp"]["fc1"].bias,
-                    f"{prefix}.mlp.fc2.weight": blk["mlp"]["fc2"].weight,
-                    f"{prefix}.mlp.fc2.bias": blk["mlp"]["fc2"].bias,
-                }
-            )
-    else:  # add special quantized path. this is specific to how bitblas expects weights to be loaded (.qweight)
-        for i in range(len(model.text["blocks"])):
-            prefix = f"text_model.transformer.h.{i}"
-            blk = model.text["blocks"][i]
-            weight_map.update(
-                {
-                    f"{prefix}.ln.qweight": blk["ln"].weight,
-                    f"{prefix}.ln.bias": blk["ln"].bias,
-                    f"{prefix}.mixer.Wqkv.qweight": blk["attn"]["qkv"].weight,
-                    f"{prefix}.mixer.Wqkv.bias": blk["attn"]["qkv"].bias,
-                    f"{prefix}.mixer.out_proj.qweight": blk["attn"]["proj"].weight,
-                    f"{prefix}.mixer.out_proj.bias": blk["attn"]["proj"].bias,
-                    f"{prefix}.mlp.fc1.qweight": blk["mlp"]["fc1"].weight,
-                    f"{prefix}.mlp.fc1.bias": blk["mlp"]["fc1"].bias,
-                    f"{prefix}.mlp.fc2.qweight": blk["mlp"]["fc2"].weight,
-                    f"{prefix}.mlp.fc2.bias": blk["mlp"]["fc2"].bias,
-                }
-            )
+    for i in range(len(model.text["blocks"])):
+        prefix = f"text_model.transformer.h.{i}"
+        blk = model.text["blocks"][i]
+        weight_map.update(
+            {
+                f"{prefix}.ln.weight": blk["ln"].weight,
+                f"{prefix}.ln.bias": blk["ln"].bias,
+                f"{prefix}.mixer.Wqkv.weight": blk["attn"]["qkv"].weight,
+                f"{prefix}.mixer.Wqkv.bias": blk["attn"]["qkv"].bias,
+                f"{prefix}.mixer.out_proj.weight": blk["attn"]["proj"].weight,
+                f"{prefix}.mixer.out_proj.bias": blk["attn"]["proj"].bias,
+                f"{prefix}.mlp.fc1.weight": blk["mlp"]["fc1"].weight,
+                f"{prefix}.mlp.fc1.bias": blk["mlp"]["fc1"].bias,
+                f"{prefix}.mlp.fc2.weight": blk["mlp"]["fc2"].weight,
+                f"{prefix}.mlp.fc2.bias": blk["mlp"]["fc2"].bias,
+            }
+        )
 
     for key, tensor in weight_map.items():
         tensor.data.copy_(get_tensor(key))
@@ -162,24 +139,6 @@ def load_weights_from_safetensors(weights_file: str, model: nn.Module) -> None:
     with safetensors_open(weights_file) as get_tensor:
         all_keys = get_tensor.keys()
 
-        is_quantized = any(
-            ".qweight" in key or "_quantized" in key or "quant." in key
-            for key in all_keys
-        )
-
-        if "text_model.transformer.h.0.ln.weight" in all_keys:
-            layernorm_dtype = get_tensor("text_model.transformer.h.0.ln.weight").dtype
-        else:
-            layernorm_dtype = torch.float16
-
-        linear_dtype = torch.int8 if is_quantized else torch.float16
-
-        model.text = build_text_model(
-            TextConfig, linear_dtype=linear_dtype, layernorm_dtype=layernorm_dtype
-        )
-        if model.setup_caches_flag:
-            model._setup_caches()
-
         if (
             "vision.blocks.0.attn.proj.bias" in all_keys
             or "model.vision.blocks.0.attn.proj.bias" in all_keys
@@ -193,7 +152,6 @@ def load_weights_from_safetensors(weights_file: str, model: nn.Module) -> None:
         _load_weights(
             lambda x: get_tensor(name_map[x]).to(dtype=torch.float16),
             model,
-            is_quantized,
         )
 
 
@@ -201,22 +159,6 @@ def load_weights_from_pt(weights_file: str, model: nn.Module) -> None:
     """Load weights from a PyTorch file into a MoondreamModel instance."""
     tensors = torch.load(weights_file, map_location="cpu", weights_only=True)
     all_keys = tensors.keys()
-    is_quantized = any(
-        ".qweight" in key or "_quantized" in key or "quant." in key for key in all_keys
-    )
-
-    if "text.blocks.0.ln.weight" in all_keys:
-        layernorm_dtype = tensors["text.blocks.0.ln.weight"].dtype
-    else:
-        layernorm_dtype = torch.float16
-
-    linear_dtype = torch.int8 if is_quantized else torch.float16
-    model.text = build_text_model(
-        TextConfig, linear_dtype=linear_dtype, layernorm_dtype=layernorm_dtype
-    )
-    if model.setup_caches_flag:
-        model._setup_caches()
-
     if (
         "vision.blocks.0.attn.proj.bias" in all_keys
         or "model.vision.blocks.0.attn.proj.bias" in all_keys
@@ -228,7 +170,7 @@ def load_weights_from_pt(weights_file: str, model: nn.Module) -> None:
         k.replace("._orig_mod", ""): v.to(dtype=torch.float16)
         for k, v in tensors.items()
     }
-    _load_weights(lambda x: tensors[x], model, is_quantized)
+    _load_weights(lambda x: tensors[x], model)
 
 
 def load_weights_into_model(weights_file: str, model: nn.Module) -> None:
@@ -246,4 +188,4 @@ def load_weights_into_model(weights_file: str, model: nn.Module) -> None:
 
     # Make all parameters contiguous
     for param in model.parameters():
-        param.data = param.data.contiguous()
+        param.data = param.data.contiguous()
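Finally, an end-to-end loading sketch after the cleanup. Module paths and the MoondreamConfig constructor are assumptions based on the file names in this PR, not verified against the repo layout:

```python
from .config import MoondreamConfig  # assumed config class and location
from .moondream import MoondreamModel
from .weights import load_weights_into_model

# The model is built with QuantizedLinear text/region blocks up front, so the
# loader no longer inspects keys for .qweight or rebuilds model.text.
model = MoondreamModel(MoondreamConfig())
load_weights_into_model("model.safetensors", model)  # a .pt checkpoint works too

# Internally, _load_weights builds a single weight_map (no is_quantized branch)
# and copies each tensor in place via tensor.data.copy_(get_tensor(key)).
```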