
Commit 5dfd676

disable no packing for legacy sampling and code cleaning
1 parent 3c0199f commit 5dfd676

5 files changed (+12, -22 lines changed)

fast_llm/data/dataset/gpt/memmap.py (+1, -6)

@@ -270,13 +270,8 @@ def write_dataset(cls, prefix: pathlib.Path | str, documents: typing.Iterable[GP
             spans = np.vstack(spans, dtype=np.int32)
         else:
             spans = np.array(spans, dtype=np.int32)
-        # if len(chosen_spans) > 0:
-        #     chosen_spans = np.vstack(chosen_spans, dtype=np.int32)
-        # else:
+
         chosen_spans = np.array(chosen_spans, dtype=np.int32).reshape(-1, 2)
-        # if len(rejected_spans) > 0:
-        #     rejected_spans = np.vstack(rejected_spans, dtype=np.int32)
-        # else:
         rejected_spans = np.array(rejected_spans, dtype=np.int32).reshape(-1, 2)

         # Write the index file (.idx)
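
The reshape above is what makes the commented-out length checks unnecessary: `np.array(..., dtype=np.int32).reshape(-1, 2)` yields a well-formed `(N, 2)` span array for empty and non-empty inputs alike, so the `if len(...) > 0` / `vstack` branches can go. A minimal standalone sketch (the span values are invented, not taken from the repository):

```python
import numpy as np

# With no spans collected, reshape(-1, 2) still gives a valid (0, 2) array.
chosen_spans = []
empty = np.array(chosen_spans, dtype=np.int32).reshape(-1, 2)
print(empty.shape)   # (0, 2)

# With spans present, the result is the expected (N, 2) array of (start, end) pairs.
chosen_spans = [(3, 7), (12, 20)]
filled = np.array(chosen_spans, dtype=np.int32).reshape(-1, 2)
print(filled.shape)  # (2, 2)
```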

fast_llm/data/dataset/gpt/sampled.py (+5, -1)

@@ -117,7 +117,7 @@ def __init__(
         )
         # TODO: Names are confusing

-        # contains document indexes/pointers in order of traversal (shuffled)
+        # contains shuffled document indices
         self._document_shuffling = MemmapArray(base_path.with_name(base_path.name + "_shuffling.npy"))

         # contains cumulative sum of document sizes grouped by TOKEN_CUMSUM_RATE in shuffled order
@@ -521,6 +521,10 @@ def __init__(
         self._indexed_dataset = indexed_dataset
         self._num_samples = sampling.num_samples
         self._sequence_length = sampling.sequence_length
+        if not sampling.config.enable_packing:
+            raise NotImplementedError(
+                "Legacy sampling only supports document packing. Please use the latest dataset format."
+            )
         if not sampling.truncate_documents:
             raise NotImplementedError(
                 "Legacy sampling only supports document truncation. Please use the latest dataset format."

fast_llm/engine/schedule/schedule.py (+3, -4)

@@ -135,7 +135,7 @@ def __init__(
         if self._batch_config.num_inputs < self._distributed.pipeline_parallel:
             warnings.warn("Not enough input to achieve true pipeline parallelism.")

-        # Setup the activation metas. (metadata for sequence parallel)
+        # Setup the activation metas.
         self._preprocessed_meta = self._multi_stage.base_model.preprocess_meta(
             self._batch_config,
             phase=self._phase,
@@ -191,8 +191,8 @@ def get_step(
         return self._step_map[(type_, stage, data_index)]

     def _create_index(self) -> None:
-        self._device_steps: list[list[Step]] = [[] for _ in range(self._distributed.pipeline_parallel)]  # steps for each device
-        self._step_map = {}  # map index (type, stage, data index) => step
+        self._device_steps: list[list[Step]] = [[] for _ in range(self._distributed.pipeline_parallel)]
+        self._step_map = {}
         for i, step in enumerate(self._steps):
             Assert.in_range(step.stage, 0, self._num_stages)
             Assert.in_range(
@@ -204,7 +204,6 @@ def _create_index(self) -> None:
             step.global_index = i
             # TODO: More configurable placement?

-            # perform looping here
             step.pipeline_rank = step.stage % self._distributed.pipeline_parallel
             step.local_index = len(self._device_steps[step.pipeline_rank])
             self._device_steps[step.pipeline_rank].append(step)
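
For context on the placement line kept above, `step.stage % pipeline_parallel` distributes stages over pipeline ranks round-robin, which is presumably the looping the removed `# perform looping here` comment referred to. A small illustration with made-up stage and device counts rather than the Schedule/Step machinery:

```python
# Round-robin placement of pipeline stages onto pipeline-parallel ranks.
num_stages = 8
pipeline_parallel = 4

device_steps: list[list[int]] = [[] for _ in range(pipeline_parallel)]
for stage in range(num_stages):
    rank = stage % pipeline_parallel  # same modulo rule as step.pipeline_rank
    device_steps[rank].append(stage)

print(device_steps)  # [[0, 4], [1, 5], [2, 6], [3, 7]]
```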

fast_llm/functional/dpo.py (-4)

@@ -12,7 +12,6 @@ def compute_logps_for_spans(
     log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

     # gather log probabilities corresponding to the target tokens
-    # selected_log_probs = log_probs[torch.arange(logits.shape[0] - 1), targets]
     selected_log_probs = log_probs[:-1].gather(dim=-1, index=targets.unsqueeze(-1)).squeeze(-1)

     # apply chosen mask
@@ -25,9 +24,6 @@
     rejected_mask[rejected_span[:, 0]: rejected_span[:, 1] + 1] = 1
     rejected_logp = (selected_log_probs * rejected_mask).sum()

-    # chosen_logp = selected_log_probs[chosen_span[:, 0]: chosen_span[:, 1] + 1].sum()
-    # rejected_logp = selected_log_probs[rejected_span[:, 0]: rejected_span[:, 1] + 1].sum()
-
     return chosen_logp, rejected_logp

 def compute_simplified_dpo_loss(
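
For context on the cleaned-up function: it gathers the log-probability of each target token, then sums a contiguous span using a 0/1 mask, which gives the same result as the direct slicing in the removed comments. A self-contained sketch of that pattern with invented shapes and span boundaries (not the fast_llm function itself):

```python
import torch

torch.manual_seed(0)
seq_len, vocab = 6, 11
logits = torch.randn(seq_len, vocab)
targets = torch.randint(vocab, (seq_len - 1,))  # next-token targets for positions 0..seq_len-2

log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
# log_probs[:-1][t, targets[t]] is the log-probability the model assigns to the
# token that actually follows position t.
selected = log_probs[:-1].gather(dim=-1, index=targets.unsqueeze(-1)).squeeze(-1)

span = (1, 3)  # inclusive (start, end), hypothetical
mask = torch.zeros_like(selected)
mask[span[0]: span[1] + 1] = 1
masked_sum = (selected * mask).sum()

# The masked sum matches summing the slice directly, as the removed comments did.
assert torch.allclose(masked_sum, selected[span[0]: span[1] + 1].sum())
```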

fast_llm/models/gpt/model.py (+3, -7)

@@ -120,7 +120,6 @@ def setup(self, distributed: Distributed) -> None:
         self._is_setup = True


-    # perform preprocessing for sequence parallel
     def preprocess_meta(
         self, batch_meta: BatchConfig | torch.Tensor, phase: PhaseType
     ) -> list[tuple[TensorMeta, dict]]:
@@ -166,15 +165,13 @@ def preprocess_meta(
             else sequence_q_dim
         )

-        # determins if batch dim or sequence dim is first
         need_sequence_first = hidden_sequence_q_dim.size != sequence_length
         if self._config.sequence_first is None:
             sequence_first = need_sequence_first
         else:
             sequence_first = self._config.sequence_first
             assert not (need_sequence_first and not sequence_first)

-        # hidden dim is model hidden size
         hidden_dim = self._tensor_space.get_tensor_dim(TransformerDimNames.hidden)
         hidden_dims = (
             (hidden_sequence_q_dim, batch_dim, hidden_dim)
@@ -199,7 +196,6 @@
         sequence_k = sequence_k_past + sequence_q_dim.size
         sequence_k_dim = TensorDim(TransformerDimNames.sequence_k, sequence_k)

-        # sequence_k_past is start and sequence_k is end of sequence
         tokens = TensorMeta.from_dims(
             hidden_dims[:2], tensor_name=f"tokens_{sequence_k_past}_to_{sequence_k-1}", dtype=torch.int64
         )
@@ -294,7 +290,7 @@ def preprocess(
         for i, spans in enumerate(batch.loss_masking_spans):
             if not spans.numel():
                 continue
-            # filter spans within the sequence or partially within the sequence
+            # only keep spans within the sequence or partially within the sequence
             valid_spans = spans[(spans[:, 0] <= sequence_k) & (spans[:, 1] >= sequence_offset)]
             if valid_spans.numel():
                 # if span is partially within the sequence, truncate parts of spans that are outside of the sequence
@@ -310,7 +306,7 @@
         for i, spans in enumerate(batch.chosen_loss_masking_spans):
             if not spans.numel():
                 continue
-            # filter spans within the sequence or partially within the sequence
+            # only keep spans within the sequence or partially within the sequence
             valid_spans = spans[(spans[0] <= sequence_k) & (spans[1] >= sequence_offset)]
             if valid_spans.numel():
                 # if span is partially within the sequence, truncate parts of spans that are outside of the sequence
@@ -322,7 +318,7 @@
         for i, spans in enumerate(batch.rejected_loss_masking_spans):
             if not spans.numel():
                 continue
-            # filter spans within the sequence or partially within the sequence
+            # only keep spans within the sequence or partially within the sequence
             valid_spans = spans[(spans[0] <= sequence_k) & (spans[1] >= sequence_offset)]
             if valid_spans.numel():
                 # if span is partially within the sequence, truncate parts of spans that are outside of the sequence
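
The reworded comments all describe the same step: keep only spans that overlap the current sequence window at least partially, then truncate whatever sticks out past the window. A small sketch of that filter-and-truncate logic with made-up values (`clamp` stands in for the truncation the surrounding code performs):

```python
import torch

sequence_offset, sequence_k = 4, 10  # hypothetical window bounds

spans = torch.tensor([[0, 2],    # entirely before the window -> dropped
                      [3, 6],    # partially inside -> truncated to [4, 6]
                      [5, 8],    # fully inside -> kept as is
                      [9, 14]])  # partially inside -> truncated to [9, 10]

# Keep spans that overlap [sequence_offset, sequence_k] at least partially.
valid_spans = spans[(spans[:, 0] <= sequence_k) & (spans[:, 1] >= sequence_offset)]
# Truncate the parts that fall outside the window.
valid_spans = valid_spans.clamp(min=sequence_offset, max=sequence_k)
print(valid_spans)  # [[4, 6], [5, 8], [9, 10]]
```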
