Add utilities for conv1d support (#78)

skyw · web-flow · commit e363d73bf363 · 2025-11-21T11:32:45.000-08:00
* add flattened conv1d

Signed-off-by: Hao Wu <skyw@nvidia.com>
diff --git a/docs/apidocs/utils.md b/docs/apidocs/utils.md
@@ -14,4 +14,10 @@ emerging_optimizers.utils.eig
 =============================
 .. automodule:: emerging_optimizers.utils.eig
     :members:
+
+
+emerging_optimizers.utils.modules
+=================================
+.. automodule:: emerging_optimizers.utils.modules
+    :members:
 ```
diff --git a/emerging_optimizers/utils/modules.py b/emerging_optimizers/utils/modules.py
@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Any, Self
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Conv1dFlatWeights(nn.Conv1d):
+    """Conv1d with weights+bias stored in a single 2D tensor.
+
+    Conv1d layers are used in some LLMs, in the Mamba mixer for example. Because the weight is not 2D, we cannot apply
+    many of the emerging optimizers originally introduced for 2D weights of Linear layers without bias. Since
+    convolution can be viewed as a matrix multiplication with im2col (either implicit or explicit), we can flatten
+    the weight into a single 2D tensor and then apply the emerging optimizers to it.
+
+    Bias is not commonly used in most LLMs anymore, but it is often included in this type of conv1d.
+    Since bias is mathematically the 0th-order term of the polynomial, we can combine weight and bias into a
+    single 2D tensor; the bias is stored as the last column.
+
+    Arguments are the same as :class:`torch.nn.Conv1d`.
+
+    Note:
+        This implementation potentially introduces a small overhead because of splitting the weight and combining its
+        gradients. This should be trivial compared to the computational cost of LLM training. If it becomes a concern,
+        a kernel can be developed to eliminate the overhead.
+
+    Note:
+        Similar flattening logic can be applied to N-D convolution. But since we don't have use cases for them in LLMs
+        yet, they are not supported even though the __init__() function is generalized enough to support them.
+
+    Raises:
+        ValueError: If ``padding_mode`` is not ``"zeros"``.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # Raise instead of assert so the check survives ``python -O``; forward() never looks at padding_mode.
+        if self.padding_mode != "zeros":
+            raise ValueError(f"Only zeros padding is supported, got padding_mode={self.padding_mode!r}")
+
+        self.weight: nn.Parameter[torch.Tensor]
+        self.bias: nn.Parameter[torch.Tensor] | None | str
+        self.has_bias = self.bias is not None
+
+        fan_in = math.prod(self.weight.shape[1:])  # number of columns taken by the flattened weight
+        flat_weight_shape = [self.out_channels, fan_in + int(self.has_bias)]
+        flat_weight_buffer = torch.empty(flat_weight_shape, device=self.weight.device, dtype=self.weight.dtype)
+        with torch.no_grad():  # plain value copy; keep it out of the autograd graph
+            flat_weight_buffer[..., :fan_in].copy_(self.weight.view(self.out_channels, -1))
+            if self.has_bias:
+                flat_weight_buffer[..., -1].copy_(self.bias)  # bias is stored as the extra last column
+        if self.has_bias:
+            del self.bias
+            self.bias = "dummy"  # Trick conv1d.extra_repr() to not print bias=False
+        del self.weight
+        self.weight = nn.Parameter(flat_weight_buffer)
+
+    @classmethod
+    def from_conv1d(cls, conv1d: nn.Conv1d) -> Self:
+        """Create a flat-weight module with the configuration, device, dtype and parameter values of ``conv1d``."""
+        conv1d_flat = cls(
+            in_channels=conv1d.in_channels,
+            out_channels=conv1d.out_channels,
+            kernel_size=conv1d.kernel_size,
+            bias=conv1d.bias is not None,
+            stride=conv1d.stride,
+            padding=conv1d.padding,
+            dilation=conv1d.dilation,
+            groups=conv1d.groups,
+            padding_mode=conv1d.padding_mode,
+            device=conv1d.weight.device,
+            dtype=conv1d.weight.dtype,
+        )
+        with torch.no_grad():
+            src_weight = conv1d.weight.reshape(conv1d.out_channels, -1)  # reshape: source may be non-contiguous
+            conv1d_flat.weight[..., : src_weight.shape[1]].copy_(src_weight)
+            if conv1d.bias is not None:
+                conv1d_flat.weight[..., -1].copy_(conv1d.bias)
+        return conv1d_flat
+
+    @property
+    def weight_shape(self) -> tuple[int, int, int]:
+        """Shape of the equivalent unflattened weight: ``(out_channels, in_channels // groups, kernel_size)``."""
+        return (self.out_channels, self.in_channels // self.groups, self.kernel_size[0])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the convolution, slicing weight (and bias, if any) out of the flat parameter as views."""
+        if self.has_bias:
+            weight, bias = self.weight[..., :-1], self.weight[..., -1]
+        else:
+            weight, bias = self.weight, None
+        return F.conv1d(x, weight.view(self.weight_shape), bias, self.stride, self.padding, self.dilation, self.groups)
+
+    def extra_repr(self) -> str:
+        """Return ``nn.Conv1d``'s repr fields followed by the shape of the flattened parameter."""
+        base_repr = super().extra_repr()
+        return f"{base_repr}, flattened_param_shape={tuple(self.weight.shape)}"
diff --git a/tests/ci/L0_Tests_GPU.sh b/tests/ci/L0_Tests_GPU.sh
@@ -30,5 +30,6 @@ coverage run -p --source=emerging_optimizers tests/normalized_optimizer_converge
 coverage run -p --source=emerging_optimizers tests/test_psgd_contractions.py --device=cuda  -v -2 || error=1
 coverage run -p --source=emerging_optimizers tests/test_psgd_utils.py --device=cuda  -v -2 || error=1
 coverage run -p --source=emerging_optimizers tests/test_psgd_convergence.py --device=cuda  -v -2 || error=1
+coverage run -p --source=emerging_optimizers tests/test_utils_modules.py -v -2 || error=1
 
 exit "${error}"
diff --git a/tests/test_utils_modules.py b/tests/test_utils_modules.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+from absl.testing import absltest, parameterized
+
+from emerging_optimizers.utils.modules import Conv1dFlatWeights
+
+
+class TestConv1dFlatWeights(parameterized.TestCase):
+    """Tests for ``Conv1dFlatWeights``: exact agreement with ``nn.Conv1d`` and the repr of the flat module."""
+
+    def _assert_matches_reference(self, conv, conv_flat, x, bias):
+        """Run forward and backward through both modules and require exact equality of outputs and gradients."""
+        y_ref = conv(x)
+        y_test = conv_flat(x)
+        torch.testing.assert_close(y_ref, y_test, atol=0, rtol=0)
+
+        y_ref.sum().backward()
+        y_test.sum().backward()
+        flat_grad = conv_flat.weight.grad
+        if bias:
+            # The last column of the flat gradient belongs to the bias, the rest to the weight.
+            torch.testing.assert_close(conv.bias.grad, flat_grad[:, -1], atol=0, rtol=0)
+            flat_grad = flat_grad[:, :-1]
+        torch.testing.assert_close(conv.weight.grad.view(-1), flat_grad.reshape(-1), atol=0, rtol=0)
+
+    @parameterized.product(
+        in_channels=[3, 5, 7],
+        out_channels=[4, 6, 8],
+        kernel_size=[2, 3, 4],
+        bias=[False, True],
+        batch_size=[4, 5, 6],
+    )
+    def test_matches_conv1d(self, in_channels, out_channels, kernel_size, bias, batch_size):
+        kwargs = dict(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, bias=bias, device="cuda"
+        )
+        # Re-seeding before each constructor makes both modules draw identical initial parameters.
+        torch.manual_seed(42)
+        conv = nn.Conv1d(**kwargs)
+        torch.manual_seed(42)
+        conv_flat = Conv1dFlatWeights(**kwargs)
+
+        self.assertEqual(conv_flat.weight.dim(), 2)
+
+        x = torch.randn(batch_size, in_channels, kernel_size, device="cuda")
+        self._assert_matches_reference(conv, conv_flat, x, bias)
+
+    @parameterized.product(
+        bias=[False, True],
+    )
+    def test_extra_repr(self, bias):
+        conv_flat = Conv1dFlatWeights(in_channels=3, out_channels=4, kernel_size=2, bias=bias)
+        repr_str = repr(conv_flat)
+        # 3 input channels * kernel size 2 = 6 columns, plus one for the bias when present.
+        self.assertIn(f"flattened_param_shape=(4, {6 + int(bias)})", repr_str)
+        # "bias=False" must show up exactly when the module was built without a bias.
+        self.assertEqual("bias=False" in repr_str, not bias)
+
+    @parameterized.product(
+        in_channels=[3, 5, 7],
+        out_channels=[4, 6, 8],
+        kernel_size=[2, 3, 4],
+        bias=[False, True],
+        batch_size=[4, 5, 6],
+    )
+    def test_from_conv1d(self, in_channels, out_channels, kernel_size, bias, batch_size):
+        kwargs = dict(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, bias=bias, device="cuda"
+        )
+        torch.manual_seed(42)
+        conv = nn.Conv1d(**kwargs)
+        # No re-seed here: the values must come from from_conv1d's copy, not from an identical random init.
+        conv_flat = Conv1dFlatWeights.from_conv1d(conv)
+        self.assertEqual(conv_flat.weight.dim(), 2)
+
+        x = torch.randn(batch_size, in_channels, kernel_size, device="cuda")
+        self._assert_matches_reference(conv, conv_flat, x, bias)
+
+
+if __name__ == "__main__":  # lets the CI script run this file directly with ``coverage run``
+    absltest.main()