Commit 04a4ae2

tsunghsienlee authored and facebook-github-bot committed
Open-sourced update on 08/12/2024
Summary:
1. `shampoo_dist_utils.py` factors out utility functions used in the distributed setting.
2. Fix the incorrect import line in `matrix_functions_test.py`.
3. Various improvements to `shampoo_quantization.py`.
4. Fix the bug in `merge_small_dims()` when encountering an empty tensor.

Reviewed By: hjmshi

Differential Revision: D61168562

fbshipit-source-id: 75868e44ef53222f34f5585b0d842185360497ef
1 parent 461cebf commit 04a4ae2

11 files changed: +350 −143 lines
@@ -0,0 +1,73 @@
+"""
+Copyright (c) Meta Platforms, Inc. and affiliates.
+All rights reserved.
+
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+
+"""
+
+#!/usr/bin/env python3
+
+
+from unittest import mock
+
+import torch
+
+from distributed_shampoo.utils import shampoo_dist_utils
+from distributed_shampoo.utils.shampoo_dist_utils import get_device_mesh
+from torch.distributed.device_mesh import DeviceMesh
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+
+
+class ShampooDistUtilsTest(DTensorTestBase):
+    @property
+    def world_size(self) -> int:
+        return 4
+
+    def _verify_deivce_mesh(self, device_mesh: DeviceMesh) -> None:
+        replicate_mesh = device_mesh["replicate"]
+        shard_mesh = device_mesh["shard"]
+
+        self.assertEqual(device_mesh.get_group(0), device_mesh.get_group("replicate"))
+        self.assertEqual(device_mesh.get_group(1), device_mesh.get_group("shard"))
+
+        self.assertEqual(device_mesh.get_group("shard"), shard_mesh.get_group())
+        self.assertEqual(device_mesh.get_group("replicate"), replicate_mesh.get_group())
+
+        self.assertCountEqual(
+            device_mesh.get_all_groups(),
+            (shard_mesh.get_group(), replicate_mesh.get_group()),
+        )
+
+    @with_comms
+    def test_get_device_mesh(self) -> None:
+        mesh = torch.tensor(range(self.world_size)).view(-1, self.world_size // 2)
+
+        self._verify_deivce_mesh(
+            device_mesh=get_device_mesh(
+                device_type=self.device_type,
+                mesh=mesh,
+                mesh_dim_names=("replicate", "shard"),
+            )
+        )
+
+        # Test the caching property of get_device_mesh() by mocking DeviceMesh.__init__().
+        # DeviceMesh.__init__() should not be called due to caching, and the output of
+        # get_device_mesh() should be the same as the previous one.
+        with mock.patch.object(
+            shampoo_dist_utils.DeviceMesh,
+            "__init__",
+        ) as mock_device_mesh_init:
+            device_mesh = get_device_mesh(
+                device_type=self.device_type,
+                mesh=mesh,
+                mesh_dim_names=("replicate", "shard"),
+            )
+
+            mock_device_mesh_init.assert_not_called()
+
+            self._verify_deivce_mesh(device_mesh=device_mesh)

distributed_shampoo/utils/shampoo_ddp_distributor.py

+5 −12

@@ -9,7 +9,7 @@
 
 import heapq
 import logging
-from functools import cache, partial
+from functools import partial
 from typing import Any, Dict, Optional, Tuple
 
 import torch
@@ -20,6 +20,7 @@
     PARAMS,
 )
 from distributed_shampoo.utils.shampoo_block_info import DDPBlockInfo
+from distributed_shampoo.utils.shampoo_dist_utils import get_device_mesh
 from distributed_shampoo.utils.shampoo_distributor import DistributorInterface
 from distributed_shampoo.utils.shampoo_utils import (
     compress_list,
@@ -478,19 +479,11 @@ def _allocate_zeros_distributed_tensor(
             )
         )
 
-        @cache
-        def get_device_mesh(device_mesh_ranks: Tuple[int, ...]) -> dtensor.DeviceMesh:
-            """Returns device mesh from provided ranks. This function will cache previous meshes according to the input ranks.
-
-            Args:
-                device_mesh_ranks ([Tuple[int, ...]): Ranks to use in device mesh of desired tensor.
-
-            """
-            return dtensor.DeviceMesh(device_type=device.type, mesh=device_mesh_ranks)
-
         return dtensor_zeros(
             shape,
             dtype=dtype,
-            device_mesh=get_device_mesh(device_mesh_ranks),
+            device_mesh=get_device_mesh(
+                device_type=device.type, mesh=device_mesh_ranks
+            ),
             placements=[dtensor.Replicate()],
         )
distributed_shampoo/utils/shampoo_dist_utils.py

+38 −0

@@ -0,0 +1,38 @@
+"""
+Copyright (c) Meta Platforms, Inc. and affiliates.
+All rights reserved.
+
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+
+"""
+
+from functools import cache
+from typing import Optional
+
+import torch
+
+from torch.distributed import _tensor as dtensor
+from torch.distributed._tensor import DeviceMesh
+
+
+@cache
+def get_device_mesh(
+    device_type: str,
+    mesh: torch.Tensor | tuple[int, ...],
+    mesh_dim_names: Optional[tuple[str, ...]] = None,
+) -> dtensor.DeviceMesh:
+    """Returns device mesh from provided ranks. This function will cache previous meshes according to the input ranks.
+
+    Args:
+        device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
+        mesh (torch.Tensor | tuple[int, ...]): A multi-dimensional array or an integer tensor describing the layout
+            of devices, where the IDs are global IDs of the default process group.
+        mesh_dim_names (Optional[tuple[str, ...]]): Names of mesh dimensions.
+
+    Returns:
+        device_mesh (dtensor.DeviceMesh): Device mesh.
+
+
+    """
+    return DeviceMesh(device_type=device_type, mesh=mesh, mesh_dim_names=mesh_dim_names)
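For orientation (not part of the commit): because get_device_mesh() is wrapped in functools.cache, repeated calls with the same hashable arguments return the same DeviceMesh rather than constructing a new one, which is what lets the DDP and HSDP distributors drop their locally cached helpers below. A minimal sketch, assuming a default process group is already initialized (e.g., under torchrun) and using "cpu" as a placeholder device type:

import torch.distributed as dist

from distributed_shampoo.utils.shampoo_dist_utils import get_device_mesh

# A 1D mesh over all ranks; a tuple is hashable, so it is a valid cache key.
ranks = tuple(range(dist.get_world_size()))

mesh_a = get_device_mesh(device_type="cpu", mesh=ranks)
mesh_b = get_device_mesh(device_type="cpu", mesh=ranks)

# The second call is served from the functools.cache entry; no new DeviceMesh is built.
assert mesh_a is mesh_b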

distributed_shampoo/utils/shampoo_hsdp_distributor.py

+7 −32

@@ -9,7 +9,6 @@
 
 import heapq
 import logging
-from functools import cache
 from math import prod
 from typing import Any, Dict, List, Tuple
 
@@ -23,6 +22,7 @@
     USE_MERGE_DIMS,
 )
 from distributed_shampoo.utils.shampoo_block_info import DDPBlockInfo
+from distributed_shampoo.utils.shampoo_dist_utils import get_device_mesh
 from distributed_shampoo.utils.shampoo_distributor import DistributorInterface
 from distributed_shampoo.utils.shampoo_utils import (
     compress_list,
@@ -137,9 +137,10 @@ def __init__(
         # Instantiates this by using DeviceMesh.
         ranks_in_all_replicated_groups = self._hsdp_device_mesh.mesh.T
         for ranks_in_replicated_group in ranks_in_all_replicated_groups:
-            device_mesh = self._get_device_mesh(
+            device_mesh = get_device_mesh(
                 device_type=self._hsdp_device_mesh.device_type,
-                ranks_in_replicated_group=ranks_in_replicated_group,
+                mesh=ranks_in_replicated_group.view(-1, self._dist_group_size),
+                mesh_dim_names=("replicate", "shard"),
             )
             if dist.get_rank() in ranks_in_replicated_group:
                 self._dist_group = device_mesh.get_group("shard")
@@ -837,33 +838,6 @@ def block_within_tensor_shard_recovery(
             block_end_idx=end_idx,
         )
 
-    def _get_device_mesh(
-        self,
-        device_type: str,
-        ranks_in_replicated_group: Tensor,
-    ) -> dtensor.DeviceMesh:
-        """Returns 2D device mesh from the provided device type and ranks in replicated group.
-        The 2D device mesh is formed in the way where the shard dimension is the same as self._dist_group_size.
-
-        Args:
-            device_type (str): Device type (specified as a string).
-            ranks_in_replicated_group (Tensor): Ranks in replicated group.
-
-        Returns:
-            device_mesh (dtensor.DeviceMesh): Device mesh.
-
-        """
-
-        @cache
-        def get_device_mesh(ranks_in_replicated_group: Tensor) -> dtensor.DeviceMesh:
-            return dtensor.DeviceMesh(
-                device_type=device_type,
-                mesh=ranks_in_replicated_group.view(-1, self._dist_group_size),
-                mesh_dim_names=("replicate", "shard"),
-            )
-
-        return get_device_mesh(ranks_in_replicated_group)
-
     def _allocate_zeros_distributed_tensor(
         self,
         shape: Tuple[int, ...],
@@ -887,9 +861,10 @@ def _allocate_zeros_distributed_tensor(
         ranks_in_replicated_group = torch.tensor(
             dist.get_process_group_ranks(self._hsdp_device_mesh.get_group(0))
         )
-        device_mesh_2d = self._get_device_mesh(
+        device_mesh_2d = get_device_mesh(
             device_type=device.type,
-            ranks_in_replicated_group=ranks_in_replicated_group,
+            mesh=ranks_in_replicated_group.view(-1, self._dist_group_size),
+            mesh_dim_names=("replicate", "shard"),
         )
 
         return dtensor_zeros(
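To make the reshaped mesh argument above concrete, here is a standalone sketch with assumed values (8 ranks in a replicated group and shard groups of size 4 standing in for self._dist_group_size); only torch is needed:

import torch

ranks_in_replicated_group = torch.arange(8)
dist_group_size = 4  # stands in for self._dist_group_size

# view(-1, dist_group_size) arranges the flat ranks into the 2D layout that
# get_device_mesh() labels ("replicate", "shard").
mesh = ranks_in_replicated_group.view(-1, dist_group_size)
# tensor([[0, 1, 2, 3],
#         [4, 5, 6, 7]])  -> dim 0 = "replicate", dim 1 = "shard"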

distributed_shampoo/utils/shampoo_preconditioner_list.py

+31 −1

@@ -12,7 +12,8 @@
 from dataclasses import dataclass, field
 
 from itertools import chain
-from typing import Any, DefaultDict, Sequence, Tuple, Union
+from types import TracebackType
+from typing import Any, DefaultDict, Optional, Sequence, Tuple, Type, Union
 
 import torch
 from distributed_shampoo.utils.shampoo_block_info import BlockInfo
@@ -775,3 +776,32 @@ def compute_root_inverse_residuals(
             tuple(relative_errors),
             tuple(relative_residuals),
         )
+
+
+class DequantizePreconditionersContext:
+    """DequantizePreconditionersContext is used to automatically dequantize and then quantize the preconditioners used within this context.
+
+    Args:
+        preconditioner_list (PreconditionerList): Preconditioner list which contains the preconditioners to be dequantized and quantized.
+
+    Examples:
+        >>> with DequantizePreconditionersContext(preconditioner_list):
+        >>>     # Do something with the preconditioners, and preconditioner_list will be dequantized.
+        >>>     # After the context is exited, the preconditioners will be quantized.
+
+    """
+
+    def __init__(self, preconditioner_list: PreconditionerList) -> None:
+        self._preconditioner_list = preconditioner_list
+
+    def __enter__(self) -> "DequantizePreconditionersContext":
+        self._preconditioner_list.dequantize_preconditioners()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        self._preconditioner_list.quantize_preconditioners()
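A hedged usage sketch of the new context manager; only DequantizePreconditionersContext itself comes from this diff, while the wrapper function and the update_preconditioners() call below are assumptions about the surrounding PreconditionerList API. Because re-quantization happens in __exit__, it also runs when the body raises, and the exception still propagates since __exit__ returns None:

from distributed_shampoo.utils.shampoo_preconditioner_list import (
    DequantizePreconditionersContext,
    PreconditionerList,
)


def update_with_dequantized_state(
    preconditioner_list: PreconditionerList, masked_grad_list, step
) -> None:
    # Hypothetical wrapper: dequantize on entry, work in full precision, re-quantize on exit.
    with DequantizePreconditionersContext(preconditioner_list=preconditioner_list):
        # The exact method and arguments here are an assumption, not part of this diff.
        preconditioner_list.update_preconditioners(
            masked_grad_list=masked_grad_list, step=step
        )
    # On exit (even on exceptions), quantize_preconditioners() has been called.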

distributed_shampoo/utils/shampoo_quantization.py

+6 −5

@@ -76,7 +76,8 @@ def dequantize(self, dequantized_dtype: torch.dtype) -> Tensor:
                 self.quantized_values, dtype=dequantized_dtype
             )
             QuantizedTensor._convert_float_to_float(
-                dequantized_values, self.quantized_values
+                src=self.quantized_values,
+                dest=dequantized_values,
             )
             return dequantized_values
         else:
@@ -189,7 +190,7 @@ def dequantize(self) -> Tuple[Tensor, ...]:
         )
 
     def dequantize_(self) -> None:
-        if self.dequantized_value_list is not None:
+        if self.is_dequantized_stored():
             logger.warning(
                 "Dequantized values are already stored; overwriting these values..."
             )
@@ -217,7 +218,7 @@ def quantize(self, tensor_list: Tuple[Tensor, ...]) -> None:
         )
 
     def quantize_(self) -> None:
-        if self.dequantized_value_list is None:
+        if not self.is_dequantized_stored():
             logger.warning(
                 f"No stored dequantized values {self.dequantized_value_list=}. Must first call dequantize_()."
             )
@@ -232,7 +233,7 @@ def quantize_(self) -> None:
 
     @property
     def dequantized_value(self) -> Tuple[Tensor, ...]:
-        assert self.dequantized_value_list is not None
+        assert self.dequantized_value_list is not None  # make type checker happy
         return self.dequantized_value_list
 
     @property
@@ -243,7 +244,7 @@ def is_dequantized_stored(self) -> bool:
         return self.dequantized_value_list is not None
 
     def compress(self, selector: Tuple[bool, ...]) -> "QuantizedTensorList":
-        assert self.dequantized_value_list is None
+        assert not self.is_dequantized_stored()
         masked_quantized_value_list = compress_list(self.quantized_value_list, selector)
         masked_min_values = compress_list(self._min_values, selector)
         masked_max_values = compress_list(self._max_values, selector)
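For orientation, a small sketch of the call protocol these guards encode (the helper below is hypothetical; qtl stands for an already constructed QuantizedTensorList, whose construction is omitted, and only methods visible in this diff are used):

def run_in_dequantized_precision(qtl, fn) -> None:
    # Hypothetical helper: dequantize, operate, then re-quantize, mirroring the
    # is_dequantized_stored() checks that guard dequantize_()/quantize_() above.
    assert not qtl.is_dequantized_stored()  # no dequantized copies stored yet
    qtl.dequantize_()                       # materialize dequantized tensors
    try:
        fn(qtl.dequantized_value)           # operate on the dequantized values
    finally:
        qtl.quantize_()                     # write values back into quantized storage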

distributed_shampoo/utils/shampoo_utils.py

+1 −1

@@ -29,7 +29,7 @@ def merge_small_dims(tensor_shape: Sequence[int], threshold: int) -> Tuple[int,
 
     # Squeeze tensor shape to remove dimension with 1; if all dimensions are 1,
     # then add a 1 to the tensor shape.
-    squeezed_tensor_shape = list(filter(lambda t: t > 1, tensor_shape)) or [1]
+    squeezed_tensor_shape = list(filter(lambda t: t != 1, tensor_shape)) or [1]
     new_tensor_shape = [squeezed_tensor_shape[0]]
     for next_tensor_shape in squeezed_tensor_shape[1:]:
         if (new_dimension := new_tensor_shape[-1] * next_tensor_shape) <= threshold:
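This one-character change is the empty-tensor fix called out in the commit summary: a zero-sized dimension must survive the squeeze so the merged shape still describes an empty tensor. A standalone illustration of just the filter step (not the library function itself):

tensor_shape = (0, 5)  # shape of an empty tensor

# Old behavior: 0 is dropped along with the 1s, so the empty tensor is
# mistakenly treated as if it had shape [5].
old = list(filter(lambda t: t > 1, tensor_shape)) or [1]   # -> [5]

# New behavior: only size-1 dimensions are squeezed; the 0 is kept, and the
# merged shape still has zero elements.
new = list(filter(lambda t: t != 1, tensor_shape)) or [1]  # -> [0, 5]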
