Skip to content

Commit 7df7988

Browse files
committed
fix: log block eigenvalue summary events
Signed-off-by: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
1 parent 3dc98de commit 7df7988

2 files changed

Lines changed: 45 additions & 9 deletions

File tree

deepspeed/runtime/engine.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2814,6 +2814,19 @@ def zero_grad(self):
28142814
for param_name, param in self.module.named_parameters():
28152815
param.grad = None
28162816

2817+
def _eigenvalue_summary_events(self):
    """Build the per-block eigenvalue events to hand to the monitor.

    Events are produced only when eigenvalue tracking is enabled and
    ``gas_boundary_ctr`` is an exact multiple of the configured eigenvalue
    gas-boundary resolution. In every other case an empty list is returned,
    so the caller can always ``extend`` its event list with the result.

    Returns:
        list[tuple]: One ``(tag, value, global_samples)`` tuple per entry of
        ``self.block_eigenvalue``, in the mapping's iteration order. ``tag``
        is ``"Train/Eigenvalues/ModelBlockParam_<i>"`` with ``i`` the
        positional index, and ``value`` is element ``0`` of the stored entry.
    """
    if not self.eigenvalue_enabled():
        return []
    # A non-zero remainder means this step is not on a reporting boundary.
    if self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution():
        return []

    # NOTE(review): presumably block_eigenvalue stays None until the first
    # eigenvalue computation has run -- confirm against the engine's
    # initialisation. Treat "nothing computed yet" as "nothing to log"
    # instead of raising AttributeError from a logging helper.
    block_eigenvalue = self.block_eigenvalue
    if not block_eigenvalue:
        return []

    return [(
        f"Train/Eigenvalues/ModelBlockParam_{i}",
        ev_value[0],
        self.global_samples,
    ) for i, ev_value in enumerate(block_eigenvalue.values())]
2829+
28172830
def clip_fp32_gradients(self):
28182831
clip_grad_norm_(parameters=self.module.parameters(), max_norm=self.gradient_clipping(), mpu=self.mpu)
28192832

@@ -2963,15 +2976,7 @@ def step(self, lr_kwargs=None):
29632976
self.global_samples,
29642977
))
29652978

2966-
if (self.eigenvalue_enabled()
2967-
and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution()):
2968-
ev_values = self.block_eigenvalue.values()
2969-
for i in range(len(ev_values)):
2970-
self.summary_events.append((
2971-
f"Train/Eigenvalues/ModelBlockParam_{i}",
2972-
self.ev_values[i][0],
2973-
self.global_samples,
2974-
))
2979+
self.summary_events.extend(self._eigenvalue_summary_events())
29752980
self.monitor.write_events(self.summary_events)
29762981

29772982
# Check flops profiling

tests/unit/runtime/test_engine.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from deepspeed.runtime.engine import DeepSpeedEngine
5+
6+
7+
def test_eigenvalue_summary_events_use_block_eigenvalue_values():
    """On a reporting boundary, one event is emitted per block eigenvalue entry."""
    # Allocate the engine without running __init__; only the attributes the
    # helper reads are populated below.
    engine = object.__new__(DeepSpeedEngine)

    # Eigenvalue tracking is on, and 4 % 2 == 0 puts this step on a boundary.
    engine.eigenvalue_enabled = lambda: True
    engine.eigenvalue_gas_boundary_resolution = lambda: 2
    engine.gas_boundary_ctr = 4
    engine.global_samples = 128

    # Each entry is a tuple whose first element is the value expected in the event.
    engine.block_eigenvalue = {"block_a": (0.25, 0), "block_b": (0.5, 1)}

    expected_events = [
        ("Train/Eigenvalues/ModelBlockParam_0", 0.25, 128),
        ("Train/Eigenvalues/ModelBlockParam_1", 0.5, 128),
    ]
    observed_events = engine._eigenvalue_summary_events()
    assert observed_events == expected_events
22+
23+
24+
def test_eigenvalue_summary_events_skip_non_boundary_steps():
    """Off a reporting boundary, no eigenvalue events are emitted."""
    # Allocate the engine without running __init__; only the attributes the
    # helper reads are populated below.
    engine = object.__new__(DeepSpeedEngine)

    # Eigenvalue tracking is on, but 3 % 2 != 0 so this step is not a boundary.
    engine.eigenvalue_enabled = lambda: True
    engine.eigenvalue_gas_boundary_resolution = lambda: 2
    engine.gas_boundary_ctr = 3
    engine.block_eigenvalue = {"block_a": (0.25, 0)}

    observed_events = engine._eigenvalue_summary_events()
    assert observed_events == []

0 commit comments

Comments
 (0)