Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed FSDP mixed precision semantics and added user warning ([#21361](https://github.com/Lightning-AI/pytorch-lightning/pull/21361))


- Fixed `ModelCheckpoint.file_exists` to replace the object broadcast in DDP with a boolean reduction, reducing memory usage when checking for existing checkpoints ([#19674](https://github.com/Lightning-AI/pytorch-lightning/issues/19674))


---

## [2.5.6] - 2025-11-05
Expand Down
6 changes: 4 additions & 2 deletions src/lightning/pytorch/callbacks/model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,8 +999,10 @@ def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
    """Check whether ``filepath`` exists, with an answer that is consistent across all ranks.

    Only global rank 0 queries the filesystem; its boolean result is then combined across ranks
    with an "any"-style reduction, so every rank observes rank 0's decision. This prevents the
    callback's internal state from diverging between ranks.

    Args:
        filepath: Path of the checkpoint file to check.
        trainer: The Trainer, used for rank information and the collective reduction.

    Returns:
        ``True`` on every rank if global rank 0 found the file, ``False`` otherwise.
    """
    # In distributed setups, only global rank 0 touches the filesystem; other ranks report False.
    local_decision = self._fs.exists(filepath) if trainer.is_global_zero else False
    # ``all=False`` is an "any" reduction: since all non-zero ranks contribute False, the result
    # equals rank 0's decision everywhere — cheaper than broadcasting a Python object.
    return trainer.strategy.reduce_boolean_decision(local_decision, all=False)

def _should_remove_checkpoint(self, trainer: "pl.Trainer", previous: str, current: str) -> bool:
"""Checks if the previous checkpoint should be deleted.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,28 @@ def on_train_epoch_end(self):
trainer.fit(model)
if os.getenv("LOCAL_RANK") == "0":
assert save_mock.call_count == expected


@RunIf(min_cuda_gpus=2, standalone=True)
def test_model_checkpoint_ddp_monitor_none(tmp_path):
    """Ensure that ModelCheckpoint with monitor=None works correctly under DDP and exercises the file_exists path."""

    # monitor=None with save_top_k=1 saves the latest checkpoint; saving goes through
    # ModelCheckpoint.file_exists, which must agree across both DDP ranks.
    ckpt_callback = callbacks.ModelCheckpoint(dirpath=tmp_path, monitor=None, save_top_k=1)
    boring_model = BoringModel()

    # Minimal two-GPU DDP run: one epoch, two train batches, no validation.
    trainer = Trainer(
        accelerator="gpu",
        devices=2,
        strategy="ddp",
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
        default_root_dir=tmp_path,
        callbacks=[ckpt_callback],
        enable_progress_bar=False,
        enable_model_summary=False,
    )

    trainer.fit(boring_model)
    # Only rank 0 performs the actual save, so assert there.
    if os.getenv("LOCAL_RANK") == "0":
        assert ckpt_callback.best_model_path
Loading