Skip to content

Optimize sparse 2:4 compression performance #358

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,20 @@ def from_dense(
:return: instantiated compressed tensor
"""
shape = list(tensor.shape)

# Perform compression on the original device (CPU or GPU)
# This avoids unnecessary device transfers during compression
compressed, bitmask = sparse24_bitmask_compress(
tensor.cpu(), sparsity_structure=sparsity_structure
tensor, sparsity_structure=sparsity_structure
)

# Move to CPU only for storage after compression is complete
# This is required by the storage format but we delay it until the end
# to maximize GPU utilization during compression
return Sparse24BitMaskTensor(
shape=shape,
compressed=compressed,
bitmask=bitmask,
compressed=compressed.cpu() if compressed.is_cuda else compressed,
bitmask=bitmask.cpu() if bitmask.is_cuda else bitmask,
)

@staticmethod
Expand Down Expand Up @@ -206,7 +213,38 @@ def sparse24_bitmask_decompress(
return decompressed_tensor


def get_24_bytemasks(tensor):
def _validate_24_sparsity_tensor(tensor: torch.Tensor) -> None:
"""
Validate that tensor is suitable for 2:4 sparsity.

:param tensor: Input tensor to validate
:raises ValueError: If tensor size is not a multiple of 4
"""
if tensor.numel() % 4 != 0:
raise ValueError(
f"Tensor size must be a multiple of 4 for 2:4 sparsity, "
f"got {tensor.numel()} elements"
)


def _get_topk_mask(reshaped_tensor: torch.Tensor, k: int = 2) -> torch.Tensor:
"""
Get mask for top-k elements per group based on absolute values.

:param reshaped_tensor: Tensor reshaped into groups
:param k: Number of elements to keep per group
:return: Boolean mask tensor
"""
abs_tensor = reshaped_tensor.abs()
# sorted=False provides performance improvement without affecting correctness
topk_indices = abs_tensor.topk(k, dim=1, largest=True, sorted=False).indices

mask = torch.zeros_like(reshaped_tensor, dtype=torch.bool)
mask.scatter_(1, topk_indices, True)
return mask


def get_24_bytemasks(tensor: torch.Tensor) -> torch.Tensor:
    """
    Generate a 2:4 sparsity mask for the given tensor.

    Keeps the two largest-magnitude elements out of every contiguous group of
    four elements and masks out the remaining two.

    :param tensor: input tensor; its total element count must be a multiple
        of 4
    :return: boolean mask with the same shape as ``tensor``, True where the
        element is kept
    :raises ValueError: if the total number of elements in the tensor is not a
        multiple of 4
    """
    # Validate before any reshaping so the error message reports the raw size
    _validate_24_sparsity_tensor(tensor)

    original_dtype = tensor.dtype
    original_shape = tensor.shape

    # Reinterpret FP8 bytes as int8 so topk/abs can run on the data.
    # NOTE(review): int8 reinterpretation is the preexisting magnitude proxy
    # for FP8 — kept as-is; the view preserves shape since both dtypes are
    # one byte wide.
    if tensor.dtype == FP8_DTYPE:
        tensor = tensor.view(torch.int8)

    # Group elements in fours and keep the top-2 magnitudes per group
    reshaped_tensor = tensor.view(-1, 4)
    mask = _get_topk_mask(reshaped_tensor, k=2)

    # Restore the caller-visible shape; the local `tensor` rebinding above
    # never touched the caller's tensor, so no dtype restore is needed
    mask = mask.view(original_shape)

    return mask
104 changes: 97 additions & 7 deletions src/compressed_tensors/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,18 +293,108 @@ def combine_shards(shards, dim=0):
return combined


def pack_bitmasks(bytemasks: torch.Tensor) -> torch.Tensor:
def _validate_bitmask_shape(bytemasks: torch.Tensor) -> None:
"""
Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC will be
compressed to R x ceil(C/8)
Validates input tensor shape for bitmask packing.

:param bytemasks: Input tensor to validate
:raises ValueError: If tensor is not 2D
"""
if len(bytemasks.shape) != 2:
raise ValueError(
f"pack_bitmasks expects a 2D tensor, got shape {bytemasks.shape}"
)

:param bytemasks: mask tensor where each byte corresponds to a weight
:return: mask tensor where each bit corresounds to a weight

def _pack_bits_torch(bytemasks_uint8: torch.Tensor, rows: int, cols: int,
device: torch.device) -> torch.Tensor:
"""
Pack bits using PyTorch operations.

:param bytemasks_uint8: Boolean mask converted to uint8
:param rows: Number of rows in the mask
:param cols: Number of columns in the mask
:param device: Device to create the packed tensor on
:return: Packed bitmask tensor
"""
# Calculate packed array size: ceil(cols/8)
# This ensures we have enough bytes to store all bits without padding
packed_cols = (cols + 7) // 8

# Reshape to process 8 bits at a time
# If cols is not divisible by 8, pad with zeros
if cols % 8 != 0:
padding = 8 - (cols % 8)
bytemasks_uint8 = torch.nn.functional.pad(bytemasks_uint8, (0, padding))

# Reshape to (rows, packed_cols, 8)
reshaped = bytemasks_uint8.view(rows, packed_cols, 8)

# Create bit shift pattern [1, 2, 4, 8, 16, 32, 64, 128]
bit_shifts = (1 << torch.arange(8, device=device, dtype=torch.uint8))

# Multiply each bit by its position value and sum
# This packs 8 bits into a single byte
packed = (reshaped * bit_shifts).sum(dim=2, dtype=torch.uint8)

return packed


def _pack_bits_numpy_fallback(bytemasks: torch.Tensor) -> torch.Tensor:
"""
Fallback to NumPy implementation for compatibility.

:param bytemasks: Input boolean mask tensor
:return: Packed bitmask tensor
"""
if bytemasks.is_cuda:
bytemasks = bytemasks.cpu()

packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little")
packed_bits_torch = torch.from_numpy(packed_bits_numpy)
return torch.from_numpy(packed_bits_numpy)

return packed_bits_torch

def pack_bitmasks(bytemasks: torch.Tensor) -> torch.Tensor:
    """
    Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC
    will be compressed to R x ceil(C/8).

    CPU tensors are packed with NumPy's C-optimized ``packbits``; GPU tensors
    are packed with PyTorch ops to avoid a device transfer, with a NumPy
    fallback if the PyTorch path fails (e.g. older PyTorch versions or
    unsupported hardware).

    :param bytemasks: 2D boolean mask tensor where each element corresponds
        to a weight
    :return: packed mask tensor where each bit corresponds to a weight
    :raises ValueError: if input tensor is not 2D
    """
    # Validate input shape
    _validate_bitmask_shape(bytemasks)

    # Normalize to bool so uint8 or other mask dtypes pack identically
    if bytemasks.dtype != torch.bool:
        bytemasks = bytemasks.bool()

    if bytemasks.device.type == "cpu":
        # NumPy's packbits is highly optimized C code — fastest on CPU.
        # No try/except here: on failure the old fallback would just re-run
        # the identical NumPy call, masking the original traceback.
        return _pack_bits_numpy_fallback(bytemasks)

    try:
        # On GPU, packing with PyTorch avoids a host transfer, which
        # outweighs the raw packing speed of NumPy
        rows, cols = bytemasks.shape
        bytemasks_uint8 = bytemasks.to(torch.uint8)
        return _pack_bits_torch(bytemasks_uint8, rows, cols, bytemasks.device)
    except Exception:
        # Compatibility fallback: transfer to CPU and pack with NumPy
        # (e.g. if an op is unavailable on this device/PyTorch version)
        return _pack_bits_numpy_fallback(bytemasks)


def unpack_bitmasks(
Expand Down
Loading
Loading