Add ODS logging to all runners

Francisc Bungiu · facebook-github-bot · commit 32a186ebc229 · 2023-08-07T04:47:37.000-07:00
Summary:
Pull Request resolved: #5050
X-link: facebookresearch/d2go#606
Allow attaching a monitoring service to the training loop.
Reviewed By: miqueljubert
Differential Revision: D47595332
fbshipit-source-id: 49d770207aeea56113c008fcd29ad7b545cec849
diff --git a/detectron2/engine/train_loop.py b/detectron2/engine/train_loop.py
@@ -388,14 +388,16 @@ def write_metrics(
         metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
         metrics_dict["data_time"] = data_time
 
+        storage = get_event_storage()
+        # Keep track of data time per rank
+        storage.put_scalar("rank_data_time", data_time, cur_iter=cur_iter)
+
         # Gather metrics among all workers for logging
         # This assumes we do DDP-style training, which is currently the only
         # supported method in detectron2.
         all_metrics_dict = comm.gather(metrics_dict)
 
         if comm.is_main_process():
-            storage = get_event_storage()
-
             # data_time among workers can have high variance. The actual latency
             # caused by data_time is the maximum among workers.
             data_time = np.max([x.pop("data_time") for x in all_metrics_dict])