Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed FSDP mixed precision semantics and added user warning ([#21361](https://github.com/Lightning-AI/pytorch-lightning/pull/21361))


- Fixed `ModelCheckpoint.file_exists` to replace the object broadcast in DDP with a boolean reduction, reducing memory usage when checking for existing checkpoints ([#19674](https://github.com/Lightning-AI/pytorch-lightning/issues/19674))


---

## [2.5.6] - 2025-11-05
Expand Down
6 changes: 4 additions & 2 deletions src/lightning/pytorch/callbacks/model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,8 +999,10 @@ def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
    """Check whether ``filepath`` exists, with an answer that is consistent across all ranks.

    Only global rank 0 queries the filesystem; its boolean result is then combined across ranks
    with an "any"-style reduction, so every rank observes rank 0's decision. This prevents the
    callback's internal state from diverging between ranks.

    Args:
        filepath: Path of the checkpoint file to check.
        trainer: The Trainer, used for rank information and the collective reduction.

    Returns:
        ``True`` on every rank if global rank 0 found the file, ``False`` otherwise.
    """
    # In distributed setups, only global rank 0 touches the filesystem; other ranks report False.
    local_decision = self._fs.exists(filepath) if trainer.is_global_zero else False
    # ``all=False`` is an "any" reduction: since all non-zero ranks contribute False, the result
    # equals rank 0's decision everywhere — cheaper than broadcasting a Python object.
    return trainer.strategy.reduce_boolean_decision(local_decision, all=False)

def _should_remove_checkpoint(self, trainer: "pl.Trainer", previous: str, current: str) -> bool:
"""Checks if the previous checkpoint should be deleted.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,28 @@ def on_train_epoch_end(self):
trainer.fit(model)
if os.getenv("LOCAL_RANK") == "0":
assert save_mock.call_count == expected


@RunIf(min_cuda_gpus=2, standalone=True)
def test_model_checkpoint_ddp_monitor_none(tmp_path):
    """Ensure that ModelCheckpoint with monitor=None works correctly under DDP and exercises the file_exists path."""

    # monitor=None with save_top_k=1 saves the latest checkpoint; saving goes through
    # ModelCheckpoint.file_exists, which must agree across both DDP ranks.
    ckpt_callback = callbacks.ModelCheckpoint(dirpath=tmp_path, monitor=None, save_top_k=1)
    boring_model = BoringModel()

    # Minimal two-GPU DDP run: one epoch, two train batches, no validation.
    trainer = Trainer(
        accelerator="gpu",
        devices=2,
        strategy="ddp",
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
        default_root_dir=tmp_path,
        callbacks=[ckpt_callback],
        enable_progress_bar=False,
        enable_model_summary=False,
    )

    trainer.fit(boring_model)
    # Only rank 0 performs the actual save, so assert there.
    if os.getenv("LOCAL_RANK") == "0":
        assert ckpt_callback.best_model_path
Loading