@@ -14,7 +14,7 @@
 from fast_llm.functional.cross_entropy import cross_entropy_forward_backward
 from fast_llm.functional.linear import output_parallel_linear_backward, output_parallel_linear_forward
 from fast_llm.functional.dpo import compute_simplified_dpo_loss
-from fast_llm.layers.common.auxiliary_loss import z_loss
+from fast_llm.layers.common.auxiliary_loss import AuxiliaryLoss, z_loss
 from fast_llm.layers.language_model.config import (
     LanguageModelBaseConfig,
     LanguageModelDimNames,
@@ -25,7 +25,9 @@
 from fast_llm.layers.transformer.config import TransformerDimNames, TransformerKwargs
 from fast_llm.logging import log_distributed_tensor
 from fast_llm.tensor import ParameterMeta, TensorMeta, init_normal_
-from fast_llm.utils import div
+from fast_llm.utils import Assert, div
+
+OUTPUT_WEIGHTS = "output_weights"


 class LanguageModelHead[ConfigType: LanguageModelBaseConfig](Configurable[LanguageModelBaseConfig], Layer):
@@ -39,6 +41,7 @@ def __init__(
         self,
         config: LanguageModelBaseConfig,
         tensor_space: TensorSpace,
+        prediction_distance: int,
     ):
         super().__init__(config)
         self._debug_transformer = config.transformer.debug_transformer
@@ -57,23 +60,24 @@ def __init__(

         hidden_dim = self._tensor_space.get_tensor_dim(TransformerDimNames.hidden)

+        self._loss_name = LanguageModelLossNames.multi_token_prediction_loss(prediction_distance)
         self.final_norm = config.transformer.normalization.get_layer(hidden_dim)
         self._logits_scale_factor = config.logits_scale_factor
         self._z_loss_factor = config.logit_z_loss

-        # untie embedding weights
-        if not self._tie_word_embeddings:
-            vocab_dim = self._tensor_space.get_tensor_dim(
-                LanguageModelDimNames.vocab_tp if self._parallel_embeddings else LanguageModelDimNames.vocab
-            )
-            self.output_weights = ParameterMeta.from_dims(
-                (vocab_dim, hidden_dim),
-                init_method=init_normal_(
-                    std=config.init_method_std_embed,
-                    min_val=config.init_method_min_embed,
-                    max_val=config.init_method_max_embed,
-                ),
-            )
+        # Distance of the target token prediction
+        # 0: next-token prediction
+        # >0: multi-token prediction (MTP)
+        Assert.geq(prediction_distance, 0)
+        self._prediction_distance = prediction_distance
+        self.is_last_head = self._prediction_distance == config.prediction_heads - 1
+        if self._prediction_distance > 0:
+            assert (
+                not self._sequence_parallel_logits
+            ), "Sequence parallel logits not supported for multi-token prediction."
+            assert not self._cross_entropy_splits, "Cross-entropy splits not supported for multi-token prediction."
+
+        self._init_output_weights(hidden_dim, config)

         self._loss_function_type = config.loss_function_type
         if self._loss_function_type == LossFunctionType.cross_entropy:
@@ -97,6 +101,23 @@ def __init__(
         if hasattr(self, "output_weights"):
             self.output_weights = self._config.transformer.peft.apply_weight(self.output_weights)

+    def _init_output_weights(self, hidden_dim: TensorDim, config) -> None:
+        # Only the first head defines the output weights
+        if self._tie_word_embeddings or self._prediction_distance > 0:
+            return
+        # untie embedding weights
+        vocab_dim = self._tensor_space.get_tensor_dim(
+            LanguageModelDimNames.vocab_tp if self._parallel_embeddings else LanguageModelDimNames.vocab
+        )
+        self.output_weights = ParameterMeta.from_dims(
+            (vocab_dim, hidden_dim),
+            init_method=init_normal_(
+                std=config.init_method_std_embed,
+                min_val=config.init_method_min_embed,
+                max_val=config.init_method_max_embed,
+            ),
+        )
+
     def forward(
         self, input_: torch.Tensor, kwargs: dict, losses: dict | None = None, metrics: dict | None = None
     ) -> torch.Tensor:
@@ -107,33 +128,50 @@ def forward(
                 tensor_name="Loss",
                 reductions=((DistributedDimNames.data, ReduceOp.AVG),),  # noqa
             )
+        if not self.is_last_head:
+            # MTP: split the stacked input
+            shared_hidden, input_ = torch.unbind(input_, dim=0)
         # TODO: Pytorch copies the grads in backward for no reason (not sure if still the case)
         # TODO: Torch compile implementation sometimes break.
         # TODO: Double-check correctness, optimize a bit more.
         # TODO: Drop autograd entirely.
         # TODO: Skip cross-entropy backward if not needed.
         language_model_loss = self._forward(input_, kwargs, losses)
         if language_model_loss is not None:
-            losses[LanguageModelLossNames.language_model_loss].append(language_model_loss)
+            losses[self._loss_name].append(language_model_loss)
         # TODO: Return the model output when needed.
-        return language_model_loss
+        if self.is_last_head:
+            # Last head should return the loss for backward.
+            return language_model_loss
+        else:
+            # Backward hook to compute the gradient of the loss
+            shared_hidden = AuxiliaryLoss.apply(shared_hidden, language_model_loss, 1.0)
+            # MTP: Return shared_hidden to be used by the next head.
+            return shared_hidden

     def _forward_backward(
         self, input_: torch.Tensor, kwargs: dict, losses: dict | None = None
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
-        labels = kwargs[LanguageModelKwargs.labels].flatten() if LanguageModelKwargs.labels in kwargs else None
+        labels = kwargs[LanguageModelKwargs.labels] if LanguageModelKwargs.labels in kwargs else None
+        # MTP: Shift the labels
+        labels = labels[:, self._prediction_distance :].flatten() if labels is not None else None
         if self._sequence_parallel_logits:
             labels = split_op(labels, self._tensor_space.distributed.tensor_group, 0)
         do_grad = labels is not None and self.training
         input_ = input_.detach().requires_grad_(do_grad)
         with torch.enable_grad():
-            ln_output = self.final_norm(input_)
+            # MTP: truncate the input
+            if self._prediction_distance > 0:
+                truncated_input = input_[:, : -self._prediction_distance, :].contiguous()
+            else:
+                truncated_input = input_
+            ln_output = self.final_norm(truncated_input)

         grad_output = kwargs[TransformerKwargs.grad_output] / (
             self._group_size if self._sequence_parallel_logits else 1
         )

-        output_weights = kwargs[WORD_EMBEDDINGS_WEIGHT] if self._tie_word_embeddings else self.output_weights
+        output_weights = self._get_output_weights(kwargs)
         loss, ln_output_grad = self._loss_fcn(
             ln_output.detach(), labels, output_weights, grad_output, kwargs, losses
         )
@@ -176,6 +214,13 @@ def _logits_dpo(



+    def _get_output_weights(self, kwargs: dict) -> torch.Tensor:
+        if self._tie_word_embeddings:
+            return kwargs[WORD_EMBEDDINGS_WEIGHT]
+        if self._prediction_distance > 0:
+            return kwargs[OUTPUT_WEIGHTS]
+        return self.output_weights
+
     def _logits_cross_entropy_forward_backward_split(
         self,
         input_: torch.Tensor,
@@ -195,6 +240,7 @@ def _logits_cross_entropy_forward_backward_split(
                 return None, None
         else:
             loss = None
+            # TODO MTP: allow a _cross_entropy_splits that is not a divisor of the sequence length
             split_size = div(labels.numel(), self._cross_entropy_splits)
             grad_output /= self._cross_entropy_splits
             logit_input = input_.flatten(0, -2)
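A note on the stacked input handled by torch.unbind(input_, dim=0) in forward: the assumption, sketched below with hypothetical names and shapes (not Fast-LLM code), is that the preceding layer stacks the shared trunk hidden state together with this head's own transformer output along a new leading dimension, so a non-final head can peel off its input while passing the shared hidden state on to the next head.

import torch

# Hypothetical layout: (batch, sequence, hidden). The actual tensors may be
# sequence-first; the stack/unbind logic is the same either way.
batch, seq_len, hidden = 2, 8, 16
shared_hidden = torch.randn(batch, seq_len, hidden)  # output of the shared trunk
head_hidden = torch.randn(batch, seq_len, hidden)    # this head's transformer output

# The previous layer is assumed to stack both along a new leading dim ...
stacked = torch.stack((shared_hidden, head_hidden), dim=0)  # (2, batch, seq_len, hidden)

# ... and the head splits them back apart, keeping shared_hidden for the next head.
shared_out, head_in = torch.unbind(stacked, dim=0)
assert torch.equal(shared_out, shared_hidden) and torch.equal(head_in, head_hidden)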
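AuxiliaryLoss.apply(shared_hidden, language_model_loss, 1.0) attaches a non-final head's loss to the shared hidden state, so its gradient is produced when the last head's returned loss is eventually backpropagated. Below is a hedged sketch of that pass-through pattern; the real AuxiliaryLoss in fast_llm.layers.common.auxiliary_loss may differ in its details.

import torch


class AuxiliaryLossSketch(torch.autograd.Function):
    """Pass `hidden` through unchanged; during backward, also feed a constant
    gradient (`scale`) into `loss` so the loss's own graph gets backpropagated."""

    @staticmethod
    def forward(ctx, hidden: torch.Tensor, loss: torch.Tensor, scale: float) -> torch.Tensor:
        ctx.scale = scale
        return hidden

    @staticmethod
    def backward(ctx, grad_hidden: torch.Tensor):
        # Gradient w.r.t. hidden passes through untouched; the loss receives `scale`;
        # the scale argument itself gets no gradient.
        return grad_hidden, grad_hidden.new_tensor(ctx.scale), None


# Usage: aux_loss contributes to hidden.grad even though only main_loss is backpropagated.
hidden = torch.randn(4, 8, requires_grad=True)
aux_loss = hidden.square().mean()
hooked_hidden = AuxiliaryLossSketch.apply(hidden, aux_loss, 1.0)
main_loss = hooked_hidden.sum()
main_loss.backward()
# hidden.grad now includes both d(main_loss)/d(hidden) and d(aux_loss)/d(hidden).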
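Finally, the label shift (labels[:, self._prediction_distance :]) and input truncation (input_[:, : -self._prediction_distance, :]) in _forward_backward keep shapes aligned: for prediction distance k, position t predicts the token k steps beyond the usual next-token target, so the first k label columns and the last k hidden-state positions have no counterpart and are dropped. A minimal standalone sketch, assuming a batch-first layout and that the labels kwarg already holds next-token targets:

import torch

batch, seq_len, hidden, k = 2, 8, 16, 2  # k = prediction_distance of this head

hidden_states = torch.randn(batch, seq_len, hidden)
next_token_labels = torch.randint(0, 100, (batch, seq_len))  # label at index t is token t + 1

# Head k pairs position t with token t + 1 + k: drop the first k labels ...
labels_k = next_token_labels[:, k:]  # (batch, seq_len - k)
# ... and the last k positions, which have no target at that distance.
hidden_k = hidden_states[:, :-k, :].contiguous() if k > 0 else hidden_states

assert hidden_k.shape[1] == labels_k.shape[1] == seq_len - k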