Skip to content

Commit 035226e

Browse files
committed
feat: address @acasagrande review comments
1 parent 06e41ae commit 035226e

File tree

8 files changed

+839
-675
lines changed

8 files changed

+839
-675
lines changed

aiperf/common/enums/metric_enums.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,115 @@ def from_python_type(cls, type: type[MetricValueTypeT]) -> "MetricValueType":
475475
return MetricValueType(type_name)
476476

477477

478+
class FrequencyMetricUnitInfo(BaseMetricUnitInfo):
479+
"""Information about a frequency unit for metrics."""
480+
481+
long_name: str
482+
hertz: float
483+
484+
def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
485+
"""Convert a value from this unit to another unit."""
486+
if not isinstance(other_unit, FrequencyMetricUnit | FrequencyMetricUnitInfo):
487+
return super().convert_to(other_unit, value)
488+
489+
return value * (self.hertz / other_unit.hertz)
490+
491+
492+
class FrequencyMetricUnit(BaseMetricUnit):
493+
"""Defines frequency units for metrics."""
494+
495+
HERTZ = FrequencyMetricUnitInfo(
496+
tag="Hz",
497+
long_name="hertz",
498+
hertz=1.0,
499+
)
500+
MEGAHERTZ = FrequencyMetricUnitInfo(
501+
tag="MHz",
502+
long_name="megahertz",
503+
hertz=1_000_000.0,
504+
)
505+
GIGAHERTZ = FrequencyMetricUnitInfo(
506+
tag="GHz",
507+
long_name="gigahertz",
508+
hertz=1_000_000_000.0,
509+
)
510+
511+
@cached_property
512+
def info(self) -> FrequencyMetricUnitInfo:
513+
"""Get the info for the frequency unit."""
514+
return self._info # type: ignore
515+
516+
@cached_property
517+
def hertz(self) -> float:
518+
"""The number of hertz in the frequency unit."""
519+
return self.info.hertz
520+
521+
@cached_property
522+
def long_name(self) -> str:
523+
"""The long name of the frequency unit."""
524+
return self.info.long_name
525+
526+
527+
class TemperatureMetricUnitInfo(BaseMetricUnitInfo):
528+
"""Information about a temperature unit for metrics."""
529+
530+
long_name: str
531+
celsius: float
532+
offset: float = 0.0
533+
534+
def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
535+
"""Convert a value from this unit to another unit."""
536+
if not isinstance(other_unit, TemperatureMetricUnit | TemperatureMetricUnitInfo):
537+
return super().convert_to(other_unit, value)
538+
539+
# Convert to Celsius first, then to target unit
540+
celsius_value = (value + self.offset) * self.celsius
541+
return (celsius_value / other_unit.celsius) - other_unit.offset
542+
543+
544+
class TemperatureMetricUnit(BaseMetricUnit):
545+
"""Defines temperature units for metrics."""
546+
547+
CELSIUS = TemperatureMetricUnitInfo(
548+
tag="°C",
549+
long_name="celsius",
550+
celsius=1.0,
551+
offset=0.0,
552+
)
553+
FAHRENHEIT = TemperatureMetricUnitInfo(
554+
tag="°F",
555+
long_name="fahrenheit",
556+
celsius=5.0/9.0,
557+
offset=-32.0,
558+
)
559+
KELVIN = TemperatureMetricUnitInfo(
560+
tag="K",
561+
long_name="kelvin",
562+
celsius=1.0,
563+
offset=-273.15,
564+
)
565+
566+
@cached_property
567+
def info(self) -> TemperatureMetricUnitInfo:
568+
"""Get the info for the temperature unit."""
569+
return self._info # type: ignore
570+
571+
@cached_property
572+
def celsius(self) -> float:
573+
"""The celsius conversion factor."""
574+
return self.info.celsius
575+
576+
@cached_property
577+
def offset(self) -> float:
578+
"""The offset for temperature conversion."""
579+
return self.info.offset
580+
581+
@cached_property
582+
def long_name(self) -> str:
583+
"""The long name of the temperature unit."""
584+
return self.info.long_name
585+
586+
478587
class MetricFlags(Flag):
479588
"""Defines the possible flags for metrics that are used to determine how they are processed or grouped.
480589
These flags are intended to be an easy way to group metrics, or turn on/off certain features.

aiperf/common/models/telemetry_models.py

Lines changed: 92 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,18 @@ class TelemetryRecord(AIPerfBaseModel):
6363
total_gpu_memory: float | None = Field(
6464
default=None, description="Total GPU memory in GB"
6565
)
66+
sm_clock_frequency: float | None = Field(
67+
default=None, description="SM clock frequency in MHz"
68+
)
69+
memory_clock_frequency: float | None = Field(
70+
default=None, description="Memory clock frequency in MHz"
71+
)
72+
memory_temperature: float | None = Field(
73+
default=None, description="Memory temperature in °C"
74+
)
75+
gpu_temperature: float | None = Field(
76+
default=None, description="GPU temperature in °C"
77+
)
6678

6779

6880
class GpuMetadata(AIPerfBaseModel):
@@ -80,30 +92,65 @@ class GpuMetadata(AIPerfBaseModel):
8092
hostname: str | None = Field(default=None, description="Host machine name")
8193

8294

95+
class GpuTelemetrySnapshot(AIPerfBaseModel):
96+
"""All metrics for a single GPU at one point in time.
97+
98+
Groups all metric values collected during a single collection cycle,
99+
eliminating timestamp duplication across individual metrics.
100+
"""
101+
102+
timestamp_ns: int = Field(description="Collection timestamp for all metrics")
103+
metrics: dict[str, float] = Field(
104+
default_factory=dict,
105+
description="All metric values at this timestamp"
106+
)
107+
108+
83109
class GpuMetricTimeSeries(AIPerfBaseModel):
84-
"""Time series data for a single metric on a single GPU.
110+
"""Time series data for all metrics on a single GPU.
85111
86-
Stores list of (value, timestamp) tuples for data integrity and future time-series visualization.
112+
Uses grouped snapshots instead of individual metric time series to eliminate
113+
timestamp duplication and improve storage efficiency.
87114
"""
88115

89-
data_points: list[tuple[float, int]] = Field(
90-
default_factory=list, description="List of (value, timestamp_ns) pairs"
116+
snapshots: list[GpuTelemetrySnapshot] = Field(
117+
default_factory=list,
118+
description="Chronological snapshots of all metrics"
91119
)
92120

93-
def append(self, value: float, timestamp_ns: int) -> None:
94-
"""Add new data point to time series.
121+
def append_snapshot(self, metrics: dict[str, float], timestamp_ns: int) -> None:
122+
"""Add new snapshot with all metrics at once.
95123
96124
Args:
97-
value: Metric value (e.g., power usage in Watts)
98-
timestamp_ns: Timestamp when measurement was taken
125+
metrics: Dictionary of metric_name -> value for this timestamp
126+
timestamp_ns: Timestamp when measurements were taken
99127
"""
128+
snapshot = GpuTelemetrySnapshot(
129+
timestamp_ns=timestamp_ns,
130+
metrics={k: v for k, v in metrics.items() if v is not None}
131+
)
132+
self.snapshots.append(snapshot)
133+
134+
def get_metric_values(self, metric_name: str) -> list[tuple[float, int]]:
135+
"""Extract time series data for a specific metric.
100136
101-
self.data_points.append((value, timestamp_ns))
137+
Args:
138+
metric_name: Name of the metric to extract
139+
140+
Returns:
141+
List of (value, timestamp_ns) tuples for the specified metric
142+
"""
143+
return [
144+
(snapshot.metrics[metric_name], snapshot.timestamp_ns)
145+
for snapshot in self.snapshots
146+
if metric_name in snapshot.metrics
147+
]
102148

103-
def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
104-
"""Convert time series to MetricResult with statistical summary.
149+
def to_metric_result(self, metric_name: str, tag: str, header: str, unit: str) -> MetricResult:
150+
"""Convert metric time series to MetricResult with statistical summary.
105151
106152
Args:
153+
metric_name: Name of the metric to analyze
107154
tag: Unique identifier for this metric (used by dashboard, exports, API)
108155
header: Human-readable name for display
109156
unit: Unit of measurement (e.g., "W" for Watts, "%" for percentage)
@@ -112,15 +159,16 @@ def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
112159
MetricResult with min/max/avg/percentiles computed from time series
113160
114161
Raises:
115-
NoMetricValue: If no data points are available
162+
NoMetricValue: If no data points are available for the specified metric
116163
"""
164+
data_points = self.get_metric_values(metric_name)
117165

118-
if not self.data_points:
166+
if not data_points:
119167
raise NoMetricValue(
120-
"No telemetry data available for statistical computation"
168+
f"No telemetry data available for metric '{metric_name}'"
121169
)
122170

123-
values = np.array([point[0] for point in self.data_points])
171+
values = np.array([point[0] for point in data_points])
124172
p1, p5, p25, p50, p75, p90, p95, p99 = np.percentile(
125173
values, [1, 5, 25, 50, 75, 90, 95, 99]
126174
)
@@ -146,41 +194,57 @@ def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
146194

147195

148196
class GpuTelemetryData(AIPerfBaseModel):
149-
"""Complete telemetry data for one GPU: metadata + all metric time series.
197+
"""Complete telemetry data for one GPU: metadata + grouped metric time series.
150198
151199
This combines static GPU information with dynamic time-series data,
152-
providing the complete picture for one GPU's telemetry.
200+
providing the complete picture for one GPU's telemetry using efficient grouped snapshots.
153201
"""
154202

155203
metadata: GpuMetadata = Field(description="Static GPU information")
156-
metrics: dict[str, GpuMetricTimeSeries] = Field(
157-
default_factory=dict,
158-
description="Time series for each metric type (power_usage, utilization, etc.)",
204+
time_series: GpuMetricTimeSeries = Field(
205+
default_factory=GpuMetricTimeSeries,
206+
description="Grouped time series for all metrics",
159207
)
160208

161209
def add_record(self, record: TelemetryRecord) -> None:
162-
"""Add telemetry record to appropriate metric time series.
210+
"""Add telemetry record as a grouped snapshot.
163211
164212
Args:
165213
record: New telemetry data point from DCGM collector
166214
167-
Note: Automatically creates new time series for metrics that don't exist yet
215+
Note: Groups all metric values from the record into a single snapshot
168216
"""
169-
170217
metric_mapping = {
171218
"gpu_power_usage": record.gpu_power_usage,
172219
"gpu_power_limit": record.gpu_power_limit,
173220
"energy_consumption": record.energy_consumption,
174221
"gpu_utilization": record.gpu_utilization,
175222
"gpu_memory_used": record.gpu_memory_used,
176223
"total_gpu_memory": record.total_gpu_memory,
224+
"sm_clock_frequency": record.sm_clock_frequency,
225+
"memory_clock_frequency": record.memory_clock_frequency,
226+
"memory_temperature": record.memory_temperature,
227+
"gpu_temperature": record.gpu_temperature,
177228
}
178229

179-
for metric_name, value in metric_mapping.items():
180-
if value is not None:
181-
if metric_name not in self.metrics:
182-
self.metrics[metric_name] = GpuMetricTimeSeries()
183-
self.metrics[metric_name].append(value, record.timestamp_ns)
230+
# Filter out None values and add as single snapshot
231+
valid_metrics = {k: v for k, v in metric_mapping.items() if v is not None}
232+
if valid_metrics:
233+
self.time_series.append_snapshot(valid_metrics, record.timestamp_ns)
234+
235+
def get_metric_result(self, metric_name: str, tag: str, header: str, unit: str) -> MetricResult:
236+
"""Get MetricResult for a specific metric.
237+
238+
Args:
239+
metric_name: Name of the metric to analyze
240+
tag: Unique identifier for this metric
241+
header: Human-readable name for display
242+
unit: Unit of measurement
243+
244+
Returns:
245+
MetricResult with statistical summary for the specified metric
246+
"""
247+
return self.time_series.to_metric_result(metric_name, tag, header, unit)
184248

185249

186250
class TelemetryHierarchy(AIPerfBaseModel):

aiperf/gpu_telemetry/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,8 @@
2626
"DCGM_FI_DEV_GPU_UTIL": "gpu_utilization",
2727
"DCGM_FI_DEV_FB_USED": "gpu_memory_used",
2828
"DCGM_FI_DEV_FB_TOTAL": "total_gpu_memory",
29+
"DCGM_FI_DEV_SM_CLOCK": "sm_clock_frequency",
30+
"DCGM_FI_DEV_MEM_CLOCK": "memory_clock_frequency",
31+
"DCGM_FI_DEV_MEMORY_TEMP": "memory_temperature",
32+
"DCGM_FI_DEV_GPU_TEMP": "gpu_temperature",
2933
}

0 commit comments

Comments
 (0)