vikhyatk committed · Commit 28e93ab · verified · 1 Parent(s): 1ccf5fd

Upload HfMoondream

Files changed (1): packing.py (+31 -40)
packing.py CHANGED
@@ -1,20 +1,6 @@
 import torch
 
 
-def unpack_int4(packed: torch.Tensor, original_length: int) -> torch.Tensor:
-    """
-    Unpack a tensor of uint8 packed bytes (two 4-bit values per byte) into a 1D tensor of int8 values,
-    vectorized over the entire input.
-    """
-    lower = packed & 0xF
-    upper = (packed >> 4) & 0xF
-    # Interleave lower and upper nibbles
-    nibbles = torch.stack([lower, upper], dim=-1).view(-1)[:original_length]
-    nibbles = nibbles.to(torch.int8)
-    nibbles[nibbles >= 8] -= 16
-    return nibbles
-
-
 def dequantize_tensor(
     packed: torch.Tensor,
     scales: torch.Tensor,
@@ -23,30 +9,35 @@ def dequantize_tensor(
     dtype: torch.dtype,
 ):
     """
-    Dequantizes a packed int4 tensor (with given per-block scales) back to bfloat16,
-    using vectorized operations to avoid Python loops.
+    In-place–friendly dequantization of int4-packed data back to `dtype`,
+    mutating `packed` (and reading `scales`) to avoid extra big intermediates.
     """
-    num_bytes_per_block = (block_size + 1) // 2  # number of packed bytes per block
-    num_blocks_total = packed.numel() // num_bytes_per_block
-    # Reshape to (num_blocks_total, num_bytes_per_block)
-    packed_rows = packed.view(num_blocks_total, num_bytes_per_block)
-
-    # Vectorized unpacking: compute lower and upper nibbles for all rows at once.
-    lower = packed_rows & 0xF
-    upper = (packed_rows >> 4) & 0xF
-    # Create a new dimension for the two nibbles and then flatten.
-    nibbles = torch.stack([lower, upper], dim=2).view(num_blocks_total, -1)
-    # Slice to get exactly block_size values per block.
-    quantized_flat = nibbles[:, :block_size].to(torch.int8)
-    quantized_flat[quantized_flat >= 8] -= 16
-
-    # Reshape to original block structure.
-    last_dim = orig_shape[-1]
-    num_blocks = last_dim // block_size
-    new_shape = orig_shape[:-1] + (num_blocks, block_size)
-    quantized = quantized_flat.view(new_shape)
-
-    # Dequantize using scales.
-    dequantized = quantized.to(torch.float32) * scales.unsqueeze(-1)
-    dequantized = dequantized.view(orig_shape)
-    return dequantized.to(dtype)
+    # how many bytes encode each block of `block_size` 4-bit values
+    num_bytes = (block_size + 1) // 2
+    num_blocks = packed.numel() // num_bytes
+
+    # view as [blocks, bytes_per_block]
+    pr = packed.view(num_blocks, num_bytes)
+
+    # prepare output in the target dtype
+    out = torch.empty((num_blocks, block_size), device=packed.device, dtype=dtype)
+
+    # ---- lower nibble ----
+    lower = pr & 0xF  # [blocks, bytes]
+    lower = lower.to(torch.int8)  # cast to signed
+    lower[lower >= 8] -= 16  # sign-correct
+
+    lo_count = (block_size + 1) // 2
+    out[:, 0:block_size:2] = lower[:, :lo_count].to(dtype) * scales.view(-1, 1)
+
+    # ---- upper nibble ----
+    pr >>= 4  # in-place shift of the original packed bytes
+    upper = pr & 0xF
+    upper = upper.to(torch.int8)
+    upper[upper >= 8] -= 16
+
+    hi_count = block_size // 2
+    out[:, 1:block_size:2] = upper[:, :hi_count].to(dtype) * scales.view(-1, 1)
+
+    # restore original shape
+    return out.view(orig_shape)
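
For reference, a minimal round-trip sketch of how the new `dequantize_tensor` can be exercised. Assumptions not visible in the diff: the two parameters elided between the hunks are `orig_shape` and `block_size` in that order (inferred from the function body), values are packed lower-nibble-first as in the removed `unpack_int4`, and the `pack_int4` helper here is hypothetical, written only for this check:

```python
import torch

def pack_int4(values: torch.Tensor) -> torch.Tensor:
    # Hypothetical inverse of the unpacking logic: pair signed int4 values
    # (range [-8, 7]) into uint8 bytes, lower nibble first.
    assert values.numel() % 2 == 0
    nibbles = (values.to(torch.int16) & 0xF).to(torch.uint8)  # two's-complement nibbles
    pairs = nibbles.view(-1, 2)
    return pairs[:, 0] | (pairs[:, 1] << 4)

orig_shape = (4, 32)
block_size = 16
w = torch.randn(orig_shape)

# Per-block symmetric quantization to int4.
blocks = w.view(-1, block_size)                               # [num_blocks, block_size]
scales = blocks.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 7
q = (blocks / scales).round().clamp(-8, 7).to(torch.int8)

packed = pack_int4(q.view(-1))                                # uint8, two values per byte

# Assumed parameter order: (packed, scales, orig_shape, block_size, dtype).
out = dequantize_tensor(packed, scales.view(-1), orig_shape, block_size, torch.bfloat16)

assert out.shape == orig_shape and out.dtype == torch.bfloat16
# Max error should be on the order of half a quantization step (scale / 2),
# plus bfloat16 rounding.
print((out.float() - w).abs().max().item())
```

Note that `dequantize_tensor` shifts the packed bytes in place (`pr >>= 4` mutates a view of `packed`), so the packed buffer is consumed by the call and cannot be dequantized twice.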