Upload HfMoondream

- layers.py +6 -2
- model.safetensors +2 -2
- packing.py +26 -34
layers.py
CHANGED

@@ -43,7 +43,10 @@ class QuantizedLinear(nn.Module):
                 ),
                 requires_grad=False,
             ),
-            "
+            "scale": nn.Parameter(
+                torch.empty(out_features, in_features // 128), requires_grad=False
+            ),
+            "zero_point": nn.Parameter(
                 torch.empty(out_features, in_features // 128), requires_grad=False
             ),
         }
@@ -55,7 +58,8 @@ class QuantizedLinear(nn.Module):
         self.weight = nn.Parameter(
             dequantize_tensor(
                 self.weight["packed"],
-                self.weight["
+                self.weight["scale"],
+                self.weight["zero_point"],
                 (self.weight["packed"].shape[0], self.weight["packed"].shape[1] * 128),
                 128,
                 torch.bfloat16,
model.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:325876fadb939f7c65f545d5d37b03f5035681b87bad1073f6d2e804ce2f4068
+size 1881750512
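
The safetensors entry is a Git LFS pointer, so the commit records only the new blob's SHA-256 and byte size. A minimal way to check a downloaded copy against the pointer (assuming the file sits in the current directory) might look like:

```python
import hashlib, os

path = "model.safetensors"  # local download to verify
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == "325876fadb939f7c65f545d5d37b03f5035681b87bad1073f6d2e804ce2f4068"
assert os.path.getsize(path) == 1881750512
```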
packing.py
CHANGED

@@ -1,43 +1,35 @@
 import torch
 
 
+def unpack_int4(packed: torch.Tensor, original_length: int) -> torch.Tensor:
+    orig_shape = packed.shape
+    last_dim = orig_shape[-1]
+    batch_shape = orig_shape[:-1]
+    flat_packed = packed.reshape(-1, last_dim)
+    batch_size = flat_packed.shape[0]
+    flat_bytes = flat_packed.reshape(-1)
+    lower = flat_bytes & 0xF
+    upper = (flat_bytes >> 4) & 0xF
+    unpacked = torch.stack([lower, upper], dim=1).reshape(batch_size, last_dim * 2)
+    unpacked = unpacked[:, :original_length]
+    unpacked = unpacked.reshape(*batch_shape, original_length)
+    return unpacked.to(torch.int8)
+
+
 def dequantize_tensor(
     packed: torch.Tensor,
     scales: torch.Tensor,
+    zero_points: torch.Tensor,
     orig_shape: torch.Size,
     block_size: int,
-    dtype: torch.dtype,
+    dtype: torch.dtype = torch.bfloat16,
 ):
-
-
-
-
-
-
-
-
-
-    pr = packed.view(num_blocks, num_bytes)
-
-    # prepare output in the target dtype
-    out = torch.empty((num_blocks, block_size), device=packed.device, dtype=dtype)
-
-    # ---- lower nibble ----
-    lower = pr & 0xF  # [blocks, bytes]
-    lower = lower.to(torch.int8)  # cast to signed
-    lower[lower >= 8] -= 16  # sign-correct
-
-    lo_count = (block_size + 1) // 2
-    out[:, 0:block_size:2] = lower[:, :lo_count].to(dtype) * scales.view(-1, 1)
-
-    # ---- upper nibble ----
-    pr >>= 4  # in-place shift of the original packed bytes
-    upper = pr & 0xF
-    upper = upper.to(torch.int8)
-    upper[upper >= 8] -= 16
-
-    hi_count = block_size // 2
-    out[:, 1:block_size:2] = upper[:, :hi_count].to(dtype) * scales.view(-1, 1)
-
-    # restore original shape
-    return out.view(orig_shape)
+    out_features, num_blocks, _ = packed.shape
+    unpacked = unpack_int4(packed, block_size)
+    scales_view = scales.unsqueeze(2)  # Shape: [out_features, num_blocks, 1]
+    zero_points_view = zero_points.unsqueeze(2)  # Shape: [out_features, num_blocks, 1]
+    dequantized = (unpacked.float() - zero_points_view) * scales_view
+    dequantized = dequantized.reshape(out_features, num_blocks * block_size)
+    dequantized = dequantized[:, : orig_shape[1]]
+    dequantized = dequantized.reshape(orig_shape)
+    return dequantized.to(dtype)
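
Because `unpack_int4` emits each byte's low nibble before its high nibble, a matching packer has to interleave values in the same order. Below is a round-trip sketch under that assumption: `pack_int4` is illustrative, not code from this commit, and it assumes the new packing.py is importable as `packing`. The shapes mirror what layers.py passes in (`block_size=128`, so 64 packed bytes per block):

```python
import torch
from packing import unpack_int4, dequantize_tensor  # the file added above

def pack_int4(q: torch.Tensor) -> torch.Tensor:
    # Pack uint4 values (0..15) two per byte: even index -> low nibble,
    # odd index -> high nibble, mirroring unpack_int4's [lower, upper] order.
    *batch, n = q.shape
    if n % 2:  # pad odd lengths; unpack_int4 trims back via original_length
        q = torch.cat([q, q.new_zeros(*batch, 1)], dim=-1)
    pairs = q.reshape(*batch, -1, 2).to(torch.uint8)
    return pairs[..., 0] | (pairs[..., 1] << 4)

# block_size=128 packs to 64 bytes per block, as layers.py assumes.
out_features, num_blocks, block_size = 4, 3, 128
q = torch.randint(0, 16, (out_features, num_blocks, block_size), dtype=torch.uint8)
packed = pack_int4(q)  # [4, 3, 64]
assert torch.equal(unpack_int4(packed, block_size), q.to(torch.int8))

# Full dequantize call with per-block scale / zero_point, as in QuantizedLinear.
scales = torch.rand(out_features, num_blocks)
zero_points = torch.randint(0, 16, (out_features, num_blocks)).float()
w = dequantize_tensor(
    packed, scales, zero_points,
    torch.Size([out_features, num_blocks * block_size]), block_size,
)
assert w.shape == (out_features, num_blocks * block_size) and w.dtype == torch.bfloat16
```

Note that the rewrite trades the old in-place nibble arithmetic (signed int4, strided writes into a preallocated output) for a vectorized unsigned unpack followed by a single `(q - zero_point) * scale` broadcast, which is why the symmetric sign-correction code disappears from the diff.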