@@ -63,6 +63,18 @@ class TelemetryRecord(AIPerfBaseModel):
6363 total_gpu_memory : float | None = Field (
6464 default = None , description = "Total GPU memory in GB"
6565 )
66+ sm_clock_frequency : float | None = Field (
67+ default = None , description = "SM clock frequency in MHz"
68+ )
69+ memory_clock_frequency : float | None = Field (
70+ default = None , description = "Memory clock frequency in MHz"
71+ )
72+ memory_temperature : float | None = Field (
73+ default = None , description = "Memory temperature in °C"
74+ )
75+ gpu_temperature : float | None = Field (
76+ default = None , description = "GPU temperature in °C"
77+ )
6678
6779
6880class GpuMetadata (AIPerfBaseModel ):
@@ -80,30 +92,65 @@ class GpuMetadata(AIPerfBaseModel):
8092 hostname : str | None = Field (default = None , description = "Host machine name" )
8193
8294
95+ class GpuTelemetrySnapshot (AIPerfBaseModel ):
96+ """All metrics for a single GPU at one point in time.
97+
98+ Groups all metric values collected during a single collection cycle,
99+ eliminating timestamp duplication across individual metrics.
100+ """
101+
102+ timestamp_ns : int = Field (description = "Collection timestamp for all metrics" )
103+ metrics : dict [str , float ] = Field (
104+ default_factory = dict ,
105+ description = "All metric values at this timestamp"
106+ )
107+
108+
83109class GpuMetricTimeSeries (AIPerfBaseModel ):
84- """Time series data for a single metric on a single GPU.
110+ """Time series data for all metrics on a single GPU.
85111
86- Stores list of (value, timestamp) tuples for data integrity and future time-series visualization.
112+ Uses grouped snapshots instead of individual metric time series to eliminate
113+ timestamp duplication and improve storage efficiency.
87114 """
88115
89- data_points : list [tuple [float , int ]] = Field (
90- default_factory = list , description = "List of (value, timestamp_ns) pairs"
116+ snapshots : list [GpuTelemetrySnapshot ] = Field (
117+ default_factory = list ,
118+ description = "Chronological snapshots of all metrics"
91119 )
92120
93- def append (self , value : float , timestamp_ns : int ) -> None :
94- """Add new data point to time series .
121+ def append_snapshot (self , metrics : dict [ str , float ] , timestamp_ns : int ) -> None :
122+ """Add new snapshot with all metrics at once .
95123
96124 Args:
97- value: Metric value (e.g., power usage in Watts)
98- timestamp_ns: Timestamp when measurement was taken
125+ metrics: Dictionary of metric_name -> value for this timestamp
126+ timestamp_ns: Timestamp when measurements were taken
99127 """
128+ snapshot = GpuTelemetrySnapshot (
129+ timestamp_ns = timestamp_ns ,
130+ metrics = {k : v for k , v in metrics .items () if v is not None }
131+ )
132+ self .snapshots .append (snapshot )
133+
134+ def get_metric_values (self , metric_name : str ) -> list [tuple [float , int ]]:
135+ """Extract time series data for a specific metric.
100136
101- self .data_points .append ((value , timestamp_ns ))
137+ Args:
138+ metric_name: Name of the metric to extract
139+
140+ Returns:
141+ List of (value, timestamp_ns) tuples for the specified metric
142+ """
143+ return [
144+ (snapshot .metrics [metric_name ], snapshot .timestamp_ns )
145+ for snapshot in self .snapshots
146+ if metric_name in snapshot .metrics
147+ ]
102148
103- def to_metric_result (self , tag : str , header : str , unit : str ) -> MetricResult :
104- """Convert time series to MetricResult with statistical summary.
149+ def to_metric_result (self , metric_name : str , tag : str , header : str , unit : str ) -> MetricResult :
150+ """Convert metric time series to MetricResult with statistical summary.
105151
106152 Args:
153+ metric_name: Name of the metric to analyze
107154 tag: Unique identifier for this metric (used by dashboard, exports, API)
108155 header: Human-readable name for display
109156 unit: Unit of measurement (e.g., "W" for Watts, "%" for percentage)
@@ -112,15 +159,16 @@ def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
112159 MetricResult with min/max/avg/percentiles computed from time series
113160
114161 Raises:
115- NoMetricValue: If no data points are available
162+ NoMetricValue: If no data points are available for the specified metric
116163 """
164+ data_points = self .get_metric_values (metric_name )
117165
118- if not self . data_points :
166+ if not data_points :
119167 raise NoMetricValue (
120- "No telemetry data available for statistical computation "
168+ f "No telemetry data available for metric ' { metric_name } ' "
121169 )
122170
123- values = np .array ([point [0 ] for point in self . data_points ])
171+ values = np .array ([point [0 ] for point in data_points ])
124172 p1 , p5 , p25 , p50 , p75 , p90 , p95 , p99 = np .percentile (
125173 values , [1 , 5 , 25 , 50 , 75 , 90 , 95 , 99 ]
126174 )
@@ -146,41 +194,57 @@ def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
146194
147195
148196class GpuTelemetryData (AIPerfBaseModel ):
149- """Complete telemetry data for one GPU: metadata + all metric time series.
197+ """Complete telemetry data for one GPU: metadata + grouped metric time series.
150198
151199 This combines static GPU information with dynamic time-series data,
152- providing the complete picture for one GPU's telemetry.
200+ providing the complete picture for one GPU's telemetry using efficient grouped snapshots .
153201 """
154202
155203 metadata : GpuMetadata = Field (description = "Static GPU information" )
156- metrics : dict [ str , GpuMetricTimeSeries ] = Field (
157- default_factory = dict ,
158- description = "Time series for each metric type (power_usage, utilization, etc.) " ,
204+ time_series : GpuMetricTimeSeries = Field (
205+ default_factory = GpuMetricTimeSeries ,
206+ description = "Grouped time series for all metrics " ,
159207 )
160208
161209 def add_record (self , record : TelemetryRecord ) -> None :
162- """Add telemetry record to appropriate metric time series .
210+ """Add telemetry record as a grouped snapshot .
163211
164212 Args:
165213 record: New telemetry data point from DCGM collector
166214
167- Note: Automatically creates new time series for metrics that don't exist yet
215+ Note: Groups all metric values from the record into a single snapshot
168216 """
169-
170217 metric_mapping = {
171218 "gpu_power_usage" : record .gpu_power_usage ,
172219 "gpu_power_limit" : record .gpu_power_limit ,
173220 "energy_consumption" : record .energy_consumption ,
174221 "gpu_utilization" : record .gpu_utilization ,
175222 "gpu_memory_used" : record .gpu_memory_used ,
176223 "total_gpu_memory" : record .total_gpu_memory ,
224+ "sm_clock_frequency" : record .sm_clock_frequency ,
225+ "memory_clock_frequency" : record .memory_clock_frequency ,
226+ "memory_temperature" : record .memory_temperature ,
227+ "gpu_temperature" : record .gpu_temperature ,
177228 }
178229
179- for metric_name , value in metric_mapping .items ():
180- if value is not None :
181- if metric_name not in self .metrics :
182- self .metrics [metric_name ] = GpuMetricTimeSeries ()
183- self .metrics [metric_name ].append (value , record .timestamp_ns )
230+ # Filter out None values and add as single snapshot
231+ valid_metrics = {k : v for k , v in metric_mapping .items () if v is not None }
232+ if valid_metrics :
233+ self .time_series .append_snapshot (valid_metrics , record .timestamp_ns )
234+
235+ def get_metric_result (self , metric_name : str , tag : str , header : str , unit : str ) -> MetricResult :
236+ """Get MetricResult for a specific metric.
237+
238+ Args:
239+ metric_name: Name of the metric to analyze
240+ tag: Unique identifier for this metric
241+ header: Human-readable name for display
242+ unit: Unit of measurement
243+
244+ Returns:
245+ MetricResult with statistical summary for the specified metric
246+ """
247+ return self .time_series .to_metric_result (metric_name , tag , header , unit )
184248
185249
186250class TelemetryHierarchy (AIPerfBaseModel ):
0 commit comments