
Commit d113f5d

fingertap authored and ZwwWayne committed
[Fix] Avoid infinite GPU waiting in dist training (open-mmlab#6501)
* [open-mmlab#6495] fix infinite GPU waiting in dist training
* print log_vars keys in assertion msg
* linting issue
1 parent 2629917 · commit d113f5d

File tree: 1 file changed (+10 -0 lines)


mmdet/models/detectors/base.py

@@ -198,6 +198,16 @@ def _parse_losses(self, losses):
         loss = sum(_value for _key, _value in log_vars.items()
                    if 'loss' in _key)
 
+        # If the loss_vars has different length, GPUs will wait infinitely
+        if dist.is_available() and dist.is_initialized():
+            log_var_length = torch.tensor(len(log_vars), device=loss.device)
+            dist.all_reduce(log_var_length)
+            message = (f'rank {dist.get_rank()}' +
+                       f' len(log_vars): {len(log_vars)}' + ' keys: ' +
+                       ','.join(log_vars.keys()))
+            assert log_var_length == len(log_vars) * dist.get_world_size(), \
+                'loss log variables are different across GPUs!\n' + message
+
         log_vars['loss'] = loss
         for loss_name, loss_value in log_vars.items():
             # reduce loss when distributed training
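
For context, the added guard all-reduces the number of loss log keys so that a mismatch across ranks fails fast with an assertion instead of deadlocking later in the per-key all_reduce loop. Below is a minimal, self-contained sketch of the same idea outside mmdet; the two-process gloo setup, the worker and check_log_vars helpers, the port number, and the extra 'loss_aux' key on rank 1 are illustrative assumptions, not part of the commit.

# Sketch only: reproduces the cross-rank key-count check from the commit
# in a standalone script. Rank 1 gets an extra loss key, so the check
# fails fast with a readable message instead of hanging.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def check_log_vars(log_vars, loss):
    """Same idea as the committed guard: all-reduce the number of keys."""
    if dist.is_available() and dist.is_initialized():
        log_var_length = torch.tensor(len(log_vars), device=loss.device)
        dist.all_reduce(log_var_length)
        message = (f'rank {dist.get_rank()}'
                   f' len(log_vars): {len(log_vars)}'
                   ' keys: ' + ','.join(log_vars.keys()))
        assert log_var_length == len(log_vars) * dist.get_world_size(), \
            'loss log variables are different across GPUs!\n' + message


def worker(rank, world_size):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29501'  # arbitrary free port, assumption
    dist.init_process_group('gloo', rank=rank, world_size=world_size)

    # Hypothetical loss dicts: rank 1 produces one extra key ('loss_aux').
    log_vars = {'loss_cls': torch.tensor(1.0), 'loss_bbox': torch.tensor(2.0)}
    if rank == 1:
        log_vars['loss_aux'] = torch.tensor(0.5)
    loss = sum(v for k, v in log_vars.items() if 'loss' in k)

    try:
        check_log_vars(log_vars, loss)
    except AssertionError as e:
        print(f'rank {rank} caught mismatch:\n{e}')
    dist.destroy_process_group()


if __name__ == '__main__':
    mp.spawn(worker, args=(2,), nprocs=2)

Running this with two processes prints the assertion message on both ranks rather than blocking, since each rank sees an all-reduced key count that does not match its own len(log_vars) times the world size.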
