Minimal LoRA implementation #182

Merged
merged 35 commits on Mar 27, 2025

Commits (35)
05a988a
Basic LoRA
jlamypoirier Mar 11, 2025
01e465a
cleanup
jlamypoirier Mar 11, 2025
18a3a53
misc
jlamypoirier Mar 11, 2025
0b93e9b
wip
jlamypoirier Mar 11, 2025
7121067
wip
jlamypoirier Mar 12, 2025
a06d678
fixes
jlamypoirier Mar 13, 2025
863bcf7
fixes
jlamypoirier Mar 13, 2025
1926da1
fix
jlamypoirier Mar 14, 2025
2e416b1
fix
jlamypoirier Mar 14, 2025
811739a
fix
jlamypoirier Mar 14, 2025
420bedc
separate shard wip
jlamypoirier Mar 15, 2025
9313c88
separate shards
jlamypoirier Mar 18, 2025
b9b017f
fix
jlamypoirier Mar 18, 2025
e086908
fixes
jlamypoirier Mar 18, 2025
59c1f8d
fix
jlamypoirier Mar 18, 2025
cc192d5
fix
jlamypoirier Mar 18, 2025
e878656
Add test
jlamypoirier Mar 18, 2025
81e39a9
Merge remote-tracking branch 'origin/main' into frozen_weights
jlamypoirier Mar 18, 2025
3f79798
fix
jlamypoirier Mar 18, 2025
963db68
Merge remote-tracking branch 'origin/main' into lora_small
jlamypoirier Mar 18, 2025
475fc4e
Merge branch 'frozen_weights' into lora_small
jlamypoirier Mar 18, 2025
399fde7
misc
jlamypoirier Mar 19, 2025
a3065a9
fix
jlamypoirier Mar 19, 2025
a55e1a2
fixes
jlamypoirier Mar 20, 2025
bd184c7
fixes
jlamypoirier Mar 20, 2025
cdc8945
Override module
jlamypoirier Mar 21, 2025
94416a2
Separate key and value
jlamypoirier Mar 21, 2025
f9f2883
Add warning
jlamypoirier Mar 21, 2025
d959b61
Merge branch 'frozen_weights' into lora_small
jlamypoirier Mar 21, 2025
e9113c7
Update fast_llm/engine/checkpoint/distributed.py
RaymondLi0 Mar 26, 2025
29af13d
Merge branch 'main' into frozen_weights
jlamypoirier Mar 27, 2025
e20a908
Merge remote-tracking branch 'origin/main' into frozen_weights
jlamypoirier Mar 27, 2025
a75f9c7
Merge branch 'frozen_weights' into lora_small
jlamypoirier Mar 27, 2025
84d7dd1
Merge remote-tracking branch 'origin/main' into lora_small
jlamypoirier Mar 27, 2025
6d6b112
fixes
jlamypoirier Mar 27, 2025
4 changes: 4 additions & 0 deletions fast_llm/engine/multi_stage/stage_base.py
@@ -85,6 +85,10 @@ def __init__(
# TODO: Separate fsdp for tied weights?
self._fsdp_index = {name: i for i, fsdp in enumerate(self._fsdps) for name in fsdp.parameter_names}

@property
def requires_grad(self):
return any(fsdp.requires_grad for fsdp in self._fsdps)

@property
def mode(self) -> StageMode:
assert self._is_setup
2 changes: 1 addition & 1 deletion fast_llm/engine/schedule/runner.py
@@ -406,7 +406,7 @@ def _forward(self, context: BatchContext, step: Step) -> None:
losses=context.losses,
metrics=context.metrics,
)
if context.is_training:
if step.backward_step is not None:
context.contexts[step.backward_step.global_index] = grad_context
self._record_compute(context, step)
return output
41 changes: 29 additions & 12 deletions fast_llm/engine/schedule/schedule.py
@@ -141,7 +141,7 @@ def __init__(
phase=self._phase,
)

self._steps = self._create_steps()
self._steps, self._first_grad_stage = self._create_steps()

self._create_index()

@@ -214,8 +214,8 @@ def _create_index(self) -> None:
# Consistency checks
step_map = self._step_map.copy()
for data_index in range(self._batch_config.num_inputs):
for type_ in (StepType.forward, StepType.backward) if self._is_training else (StepType.forward,):
for stage in range(self._num_stages):
for type_ in (StepType.forward, StepType.backward):
for stage in range(0 if type_ == StepType.forward else self._first_grad_stage, self._num_stages):
assert (
step_map.pop((type_, stage, data_index), None) is not None
), f"Missing {type_.value} step with stage={stage}, data_index={data_index}"
@@ -225,7 +225,8 @@ def _create_index(self) -> None:
for i, step in enumerate(self._steps):
if self._is_training:
if step.type_ == StepType.forward:
step.backward_step = self.get_step(StepType.backward, *step.map_index[1:])
if step.stage >= self._first_grad_stage:
step.backward_step = self.get_step(StepType.backward, *step.map_index[1:])
else:
step.forward_step = self.get_step(StepType.forward, *step.map_index[1:])
if step.type_ == StepType.forward and step.stage == 0:
@@ -236,7 +237,8 @@ def _create_index(self) -> None:
step.prev_step = self.get_step(
step.type_, step.stage + (1 if step.type_ == StepType.backward else -1), *step.map_index[2:]
)
if step.type_ == StepType.backward and step.stage == 0:

if step.type_ == StepType.backward and step.stage == self._first_grad_stage:
step.next_step = None
elif step.type_ == StepType.forward and step.stage == self._num_stages - 1:
step.next_step = self.get_step(StepType.backward, *step.map_index[1:]) if self._is_training else None
@@ -249,11 +251,15 @@ def _create_index(self) -> None:
for step in self._steps:
if self._is_training:
if step.type_ == StepType.forward:
Assert.gt(step.backward_step.global_index, step.global_index)
Assert.is_(step.backward_step.forward_step, step)
if step.stage >= self._first_grad_stage:
Assert.gt(step.backward_step.global_index, step.global_index)
Assert.is_(step.backward_step.forward_step, step)
else:
assert step.backward_step is None
else:
Assert.lt(step.forward_step.global_index, step.global_index)
Assert.is_(step.forward_step.backward_step, step)
if step.stage >= self._first_grad_stage:
Assert.is_(step.forward_step.backward_step, step)
if step.next_step is not None:
Assert.gt(step.next_step.global_index, step.global_index)
Assert.is_(step.next_step.prev_step, step)
@@ -303,7 +309,10 @@ def _setup_reduce_steps(self, grad_buffer_indices: dict[int, int]) -> None:
reduce_step.reduce_accumulate = reduction_count[reduce_step.stage] > 0
reduction_count[reduce_step.stage] += 1
for stage, count in enumerate(reduction_count):
assert (count > 0) == (stage % self._distributed.pipeline_parallel == self._distributed.pipeline_rank)
assert (count > 0) == (
stage >= self._first_grad_stage
and (stage % self._distributed.pipeline_parallel == self._distributed.pipeline_rank)
)

def _setup_timeline(self) -> None:
# TODO: Include network time
@@ -468,8 +477,16 @@ def get_data_index_split(
micro_sequence,
)

def _create_steps(self) -> list[Step]:
def _create_steps(self) -> tuple[list[Step], int]:
steps = []
if self._is_training:
# The first stage(s) may not have any trainable parameters,
# in which case we shouldn't run the backward pass.
first_grad_stage = 0
while first_grad_stage < self._num_stages and not self._multi_stage.stages[first_grad_stage].requires_grad:
first_grad_stage += 1
else:
first_grad_stage = self._num_stages
for depth_first_micro_batch in range(self._batch_config.depth_first_micro_batches):
for stage in range(self._num_stages):
for breadth_first_micro_batch in range(self._batch_config.breadth_first_micro_batches):
@@ -485,7 +502,7 @@ def _create_steps(self) -> list[Step]:
)
)
if self._is_training:
for stage in reversed(range(self._num_stages)):
for stage in reversed(range(first_grad_stage, self._num_stages)):
for breadth_first_micro_batch in range(self._batch_config.breadth_first_micro_batches):
for micro_sequence in reversed(range(self._batch_config.num_micro_sequences)):
steps.append(
@@ -498,4 +515,4 @@ def _create_steps(self) -> list[Step]:
type_=StepType.backward,
)
)
return steps
return steps, first_grad_stage
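
As a reading aid, the scheduling rule introduced above can be stated in isolation: backward steps are only created from the first stage that has trainable parameters, so leading frozen stages (for example, a frozen embedding stage under LoRA) never run a backward pass. A minimal standalone sketch, with hypothetical names that are not part of this diff:

```python
def find_first_grad_stage(stage_requires_grad: list[bool]) -> int:
    # First stage with trainable parameters; backward steps start here.
    stage = 0
    while stage < len(stage_requires_grad) and not stage_requires_grad[stage]:
        stage += 1
    return stage

# Stages 0-1 fully frozen, stages 2-3 hold trainable (e.g. LoRA) parameters.
assert find_first_grad_stage([False, False, True, True]) == 2
# Everything frozen: no backward steps are created at all.
assert find_first_grad_stage([False, False, False]) == 3
```
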
70 changes: 42 additions & 28 deletions fast_llm/functional/triton/normalization.py
@@ -68,6 +68,7 @@ def triton_normalization_backward_kernel_1(
n_cols,
n_rows,
has_bias: tl_constexpr,
parameter_grad: tl_constexpr,
zero_centered: tl_constexpr,
block_size: tl_constexpr,
block_size_row: tl_constexpr,
@@ -108,18 +109,19 @@ def triton_normalization_backward_kernel_1(
tl.store(grad_input_ptr + offsets, grad_input, mask=mask)

# Parameter grad partial sums
parameter_offsets = tl.program_id(0) * n_cols + cols
grad_weight_partial_ptr = grad_weight_partial_ptr + parameter_offsets
grad_weight_partial = (grad_output * input_normalized).to(weight.dtype)
grad_weight_partial = tl.sum(grad_weight_partial, axis=0)[None, :]
if parameter_grad:
parameter_offsets = tl.program_id(0) * n_cols + cols
grad_weight_partial_ptr = grad_weight_partial_ptr + parameter_offsets
grad_weight_partial = (grad_output * input_normalized).to(weight.dtype)
grad_weight_partial = tl.sum(grad_weight_partial, axis=0)[None, :]

if has_bias:
grad_bias_partial_ptr = grad_bias_partial_ptr + parameter_offsets
grad_bias_partial = tl.sum(grad_output.to(weight.dtype), axis=0)[None, :]
if has_bias:
grad_bias_partial_ptr = grad_bias_partial_ptr + parameter_offsets
grad_bias_partial = tl.sum(grad_output.to(weight.dtype), axis=0)[None, :]

tl.store(grad_weight_partial_ptr, grad_weight_partial, mask=col_mask)
if has_bias:
tl.store(grad_bias_partial_ptr, grad_bias_partial, mask=col_mask) # noqa
tl.store(grad_weight_partial_ptr, grad_weight_partial, mask=col_mask)
if has_bias:
tl.store(grad_bias_partial_ptr, grad_bias_partial, mask=col_mask) # noqa


@triton_jit()
@@ -211,6 +213,11 @@ def triton_normalization_backward(grad_output: torch.Tensor, context: list[typin
context.clear()
has_bias = bias is not None

parameter_grad = weight.requires_grad
assert parameter_grad == hasattr(weight, "grad_buffer")
if has_bias:
assert parameter_grad == bias.requires_grad

grad_output = grad_output.contiguous()

n_rows = grad_output.shape[:-1].numel()
@@ -232,12 +239,17 @@ def triton_normalization_backward(grad_output: torch.Tensor, context: list[typin

grad_input = torch.empty_like(grad_output)

grad_is_zero = param_get_and_unset_is_zero(weight)
grad_weight = weight.grad_buffer
# TODO: Any point in making it full precision?
grad_weight_partial = grad_output.new_empty(num_blocks_row, n_cols)
if parameter_grad:
grad_is_zero = param_get_and_unset_is_zero(weight)
grad_weight = weight.grad_buffer
# TODO: Any point in making it full precision?
grad_weight_partial = grad_output.new_empty(num_blocks_row, n_cols)
else:
grad_is_zero = True
grad_weight = None
grad_weight_partial = None

if has_bias:
if has_bias and parameter_grad:
assert param_get_and_unset_is_zero(bias) == grad_is_zero
grad_bias = bias.grad_buffer
grad_bias_partial = grad_output.new_empty(num_blocks_row, n_cols)
@@ -256,24 +268,26 @@ def triton_normalization_backward(grad_output: torch.Tensor, context: list[typin
n_cols,
n_rows,
has_bias,
parameter_grad,
zero_centered,
block_size,
block_size_row,
num_warps=num_warps,
)
triton_normalization_backward_kernel_2[(triton.cdiv(n_cols, block_size_n),)](
grad_weight_partial,
grad_bias_partial,
grad_weight,
grad_bias,
num_blocks_row,
n_cols,
has_bias,
not grad_is_zero,
block_size_m,
block_size_n,
num_ctas=1,
)
if parameter_grad:
triton_normalization_backward_kernel_2[(triton.cdiv(n_cols, block_size_n),)](
grad_weight_partial,
grad_bias_partial,
grad_weight,
grad_bias,
num_blocks_row,
n_cols,
has_bias,
not grad_is_zero,
block_size_m,
block_size_n,
num_ctas=1,
)
return grad_input


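A quick PyTorch-autograd analogue of the behaviour these kernel changes implement (this is not the Triton code path above, just an illustration): when the normalization parameters are frozen, the backward pass still produces a gradient for the input but skips all weight and bias gradient work.

```python
import torch

# Frozen LayerNorm: autograd computes grad_input but no parameter gradients,
# which is the case the patched Triton backward now handles explicitly.
ln = torch.nn.LayerNorm(16)
ln.weight.requires_grad_(False)
ln.bias.requires_grad_(False)

x = torch.randn(4, 16, requires_grad=True)
ln(x).sum().backward()

assert x.grad is not None       # input gradient is still computed
assert ln.weight.grad is None   # no gradient for the frozen weight
assert ln.bias.grad is None     # nor for the frozen bias
```
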
57 changes: 57 additions & 0 deletions fast_llm/layers/common/config.py
@@ -7,6 +7,7 @@

if typing.TYPE_CHECKING:
from fast_llm.engine.config_utils.tensor_space import TensorDim
from fast_llm.layers.common.linear import LinearBase, LinearLike
from fast_llm.layers.common.normalization import LayerNorm, RMSNorm


@@ -115,3 +116,59 @@ def _from_dict(
cls._handle_renamed_field(default, "normalization_implementation", "implementation")
cls._handle_renamed_field(default, "layer_norm_init_range", "initialization_range")
return super()._from_dict(default, strict, flat)


class PeftType(str, enum.Enum):
# TODO : Use a dynamic config type instead.
none = "none"
lora = "lora"


@config_class()
class PeftArchitectureConfig(BaseModelArchitectureConfig):
_abstract = False


@config_class()
class PeftConfig(PeftArchitectureConfig, BaseModelConfig):
# TODO: Architecture/non-architecture split might not make much sense here.

type: PeftType = Field(
default=PeftType.none,
desc="The type of parameter-efficient fine tuning to use Only LoRA is supported at the moment.",
hint=FieldHint.core,
)
rank: int = Field(
default=8,
desc="The LoRA rank, i.e. the size of the intermediate dimension.",
hint=FieldHint.stability,
)
alpha: float = Field(
default=8.0,
desc="The LoRA scaling parameter.",
hint=FieldHint.stability,
)
dropout: float = Field(
default=0.0,
desc="Dropout rate for LoRA.",
hint=FieldHint.stability,
)

def apply_linear(self, linear: "LinearBase", **kwargs) -> "LinearLike":
if self.type == PeftType.none:
return linear
elif self.type == PeftType.lora:
from fast_llm.layers.common.peft import lora_linear

# TODO: Init method?
return lora_linear(
linear,
linear.weight.param_init_method,
linear.weight.param_init_method,
self.rank,
self.alpha,
self.dropout,
**kwargs,
)
else:
raise NotImplementedError(self.type)
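
The `lora_linear` wrapper imported above (from `fast_llm/layers/common/peft.py`) is not shown in this section. As a rough guide to what `apply_linear` produces for `PeftType.lora`, here is a minimal sketch of a LoRA-wrapped linear layer; the class name and details are illustrative only, not the actual implementation in this repository:

```python
import torch

class LoRALinearSketch(torch.nn.Module):
    """Illustrative LoRA wrapper: y = W x + (alpha / rank) * B A dropout(x),
    with the base weight frozen and only A, B trainable."""

    def __init__(self, linear: torch.nn.Linear, rank: int = 8, alpha: float = 8.0, dropout: float = 0.0):
        super().__init__()
        self.linear = linear
        self.linear.weight.requires_grad_(False)  # base weight stays frozen
        if self.linear.bias is not None:
            self.linear.bias.requires_grad_(False)
        self.lora_a = torch.nn.Linear(linear.in_features, rank, bias=False)
        self.lora_b = torch.nn.Linear(rank, linear.out_features, bias=False)
        torch.nn.init.zeros_(self.lora_b.weight)  # adapter starts as a no-op
        self.scaling = alpha / rank
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x) + self.scaling * self.lora_b(self.lora_a(self.dropout(x)))
```

With the config defaults above (rank=8, alpha=8.0), the adapter term is scaled by alpha / rank = 1.0.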