ai-dynamo
diff --git a/aiperf/common/enums/metric_enums.py b/aiperf/common/enums/metric_enums.py
Lines changed: 109 additions & 0 deletions
diff --git a/aiperf/common/models/telemetry_models.py b/aiperf/common/models/telemetry_models.py
Lines changed: 92 additions & 28 deletions
diff --git a/aiperf/gpu_telemetry/constants.py b/aiperf/gpu_telemetry/constants.py
Lines changed: 4 additions & 0 deletions
@@ -475,6 +475,115 @@ def from_python_type(cls, type: type[MetricValueTypeT]) -> "MetricValueType":
         return MetricValueType(type_name)
 
 
+class FrequencyMetricUnitInfo(BaseMetricUnitInfo):
+    """Information about a frequency unit for metrics."""
+
+    long_name: str
+    hertz: float
+
+    def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
+        """Convert a value from this unit to another unit."""
+        if not isinstance(other_unit, FrequencyMetricUnit | FrequencyMetricUnitInfo):
+            return super().convert_to(other_unit, value)
+
+        return value * (self.hertz / other_unit.hertz)
+
+
+class FrequencyMetricUnit(BaseMetricUnit):
+    """Defines frequency units for metrics."""
+
+    HERTZ = FrequencyMetricUnitInfo(
+        tag="Hz",
+        long_name="hertz",
+        hertz=1.0,
+    )
+    MEGAHERTZ = FrequencyMetricUnitInfo(
+        tag="MHz",
+        long_name="megahertz",
+        hertz=1_000_000.0,
+    )
+    GIGAHERTZ = FrequencyMetricUnitInfo(
+        tag="GHz",
+        long_name="gigahertz",
+        hertz=1_000_000_000.0,
+    )
+
+    @cached_property
+    def info(self) -> FrequencyMetricUnitInfo:
+        """Get the info for the frequency unit."""
+        return self._info  # type: ignore
+
+    @cached_property
+    def hertz(self) -> float:
+        """The number of hertz in the frequency unit."""
+        return self.info.hertz
+
+    @cached_property
+    def long_name(self) -> str:
+        """The long name of the frequency unit."""
+        return self.info.long_name
+
+
+class TemperatureMetricUnitInfo(BaseMetricUnitInfo):
+    """Information about a temperature unit for metrics."""
+
+    long_name: str
+    celsius: float
+    offset: float = 0.0
+
+    def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
+        """Convert a value from this unit to another unit."""
+        if not isinstance(other_unit, TemperatureMetricUnit | TemperatureMetricUnitInfo):
+            return super().convert_to(other_unit, value)
+
+        # Convert to Celsius first, then to target unit
+        celsius_value = (value + self.offset) * self.celsius
+        return (celsius_value / other_unit.celsius) - other_unit.offset
+
+
+class TemperatureMetricUnit(BaseMetricUnit):
+    """Defines temperature units for metrics."""
+
+    CELSIUS = TemperatureMetricUnitInfo(
+        tag="°C",
+        long_name="celsius",
+        celsius=1.0,
+        offset=0.0,
+    )
+    FAHRENHEIT = TemperatureMetricUnitInfo(
+        tag="°F",
+        long_name="fahrenheit",
+        celsius=5.0/9.0,
+        offset=-32.0,
+    )
+    KELVIN = TemperatureMetricUnitInfo(
+        tag="K",
+        long_name="kelvin",
+        celsius=1.0,
+        offset=-273.15,
+    )
+
+    @cached_property
+    def info(self) -> TemperatureMetricUnitInfo:
+        """Get the info for the temperature unit."""
+        return self._info  # type: ignore
+
+    @cached_property
+    def celsius(self) -> float:
+        """The celsius conversion factor."""
+        return self.info.celsius
+
+    @cached_property
+    def offset(self) -> float:
+        """The offset for temperature conversion."""
+        return self.info.offset
+
+    @cached_property
+    def long_name(self) -> str:
+        """The long name of the temperature unit."""
+        return self.info.long_name
+
+
 class MetricFlags(Flag):
     """Defines the possible flags for metrics that are used to determine how they are processed or grouped.
     These flags are intended to be an easy way to group metrics, or turn on/off certain features.
 
@@ -63,6 +63,18 @@ class TelemetryRecord(AIPerfBaseModel):
     total_gpu_memory: float | None = Field(
         default=None, description="Total GPU memory in GB"
     )
+    sm_clock_frequency: float | None = Field(
+        default=None, description="SM clock frequency in MHz"
+    )
+    memory_clock_frequency: float | None = Field(
+        default=None, description="Memory clock frequency in MHz"
+    )
+    memory_temperature: float | None = Field(
+        default=None, description="Memory temperature in °C"
+    )
+    gpu_temperature: float | None = Field(
+        default=None, description="GPU temperature in °C"
+    )
 
 
 class GpuMetadata(AIPerfBaseModel):
@@ -80,30 +92,65 @@ class GpuMetadata(AIPerfBaseModel):
     hostname: str | None = Field(default=None, description="Host machine name")
 
 
+class GpuTelemetrySnapshot(AIPerfBaseModel):
+    """All metrics for a single GPU at one point in time.
+
+    Groups all metric values collected during a single collection cycle,
+    eliminating timestamp duplication across individual metrics.
+    """
+
+    timestamp_ns: int = Field(description="Collection timestamp for all metrics")
+    metrics: dict[str, float] = Field(
+        default_factory=dict,
+        description="All metric values at this timestamp"
+    )
+
+
 class GpuMetricTimeSeries(AIPerfBaseModel):
-    """Time series data for a single metric on a single GPU.
+    """Time series data for all metrics on a single GPU.
 
-    Stores list of (value, timestamp) tuples for data integrity and future time-series visualization.
+    Uses grouped snapshots instead of individual metric time series to eliminate
+    timestamp duplication and improve storage efficiency.
     """
 
-    data_points: list[tuple[float, int]] = Field(
-        default_factory=list, description="List of (value, timestamp_ns) pairs"
+    snapshots: list[GpuTelemetrySnapshot] = Field(
+        default_factory=list,
+        description="Chronological snapshots of all metrics"
     )
 
-    def append(self, value: float, timestamp_ns: int) -> None:
-        """Add new data point to time series.
+    def append_snapshot(self, metrics: dict[str, float], timestamp_ns: int) -> None:
+        """Add new snapshot with all metrics at once.
 
         Args:
-            value: Metric value (e.g., power usage in Watts)
-            timestamp_ns: Timestamp when measurement was taken
+            metrics: Dictionary of metric_name -> value for this timestamp
+            timestamp_ns: Timestamp when measurements were taken
         """
+        snapshot = GpuTelemetrySnapshot(
+            timestamp_ns=timestamp_ns,
+            metrics={k: v for k, v in metrics.items() if v is not None}
+        )
+        self.snapshots.append(snapshot)
+
+    def get_metric_values(self, metric_name: str) -> list[tuple[float, int]]:
+        """Extract time series data for a specific metric.
 
-        self.data_points.append((value, timestamp_ns))
+        Args:
+            metric_name: Name of the metric to extract
+
+        Returns:
+            List of (value, timestamp_ns) tuples for the specified metric
+        """
+        return [
+            (snapshot.metrics[metric_name], snapshot.timestamp_ns)
+            for snapshot in self.snapshots
+            if metric_name in snapshot.metrics
+        ]
 
-    def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
-        """Convert time series to MetricResult with statistical summary.
+    def to_metric_result(self, metric_name: str, tag: str, header: str, unit: str) -> MetricResult:
+        """Convert metric time series to MetricResult with statistical summary.
 
         Args:
+            metric_name: Name of the metric to analyze
             tag: Unique identifier for this metric (used by dashboard, exports, API)
             header: Human-readable name for display
             unit: Unit of measurement (e.g., "W" for Watts, "%" for percentage)
@@ -112,15 +159,16 @@ def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
             MetricResult with min/max/avg/percentiles computed from time series
 
         Raises:
-            NoMetricValue: If no data points are available
+            NoMetricValue: If no data points are available for the specified metric
         """
+        data_points = self.get_metric_values(metric_name)
 
-        if not self.data_points:
+        if not data_points:
             raise NoMetricValue(
-                "No telemetry data available for statistical computation"
+                f"No telemetry data available for metric '{metric_name}'"
             )
 
-        values = np.array([point[0] for point in self.data_points])
+        values = np.array([point[0] for point in data_points])
         p1, p5, p25, p50, p75, p90, p95, p99 = np.percentile(
             values, [1, 5, 25, 50, 75, 90, 95, 99]
         )
@@ -146,41 +194,57 @@ def to_metric_result(self, tag: str, header: str, unit: str) -> MetricResult:
 
 
 class GpuTelemetryData(AIPerfBaseModel):
-    """Complete telemetry data for one GPU: metadata + all metric time series.
+    """Complete telemetry data for one GPU: metadata + grouped metric time series.
 
     This combines static GPU information with dynamic time-series data,
-    providing the complete picture for one GPU's telemetry.
+    providing the complete picture for one GPU's telemetry using efficient grouped snapshots.
     """
 
     metadata: GpuMetadata = Field(description="Static GPU information")
-    metrics: dict[str, GpuMetricTimeSeries] = Field(
-        default_factory=dict,
-        description="Time series for each metric type (power_usage, utilization, etc.)",
+    time_series: GpuMetricTimeSeries = Field(
+        default_factory=GpuMetricTimeSeries,
+        description="Grouped time series for all metrics",
     )
 
     def add_record(self, record: TelemetryRecord) -> None:
-        """Add telemetry record to appropriate metric time series.
+        """Add telemetry record as a grouped snapshot.
 
         Args:
             record: New telemetry data point from DCGM collector
 
-        Note: Automatically creates new time series for metrics that don't exist yet
+        Note: Groups all metric values from the record into a single snapshot
         """
-
         metric_mapping = {
             "gpu_power_usage": record.gpu_power_usage,
             "gpu_power_limit": record.gpu_power_limit,
             "energy_consumption": record.energy_consumption,
             "gpu_utilization": record.gpu_utilization,
             "gpu_memory_used": record.gpu_memory_used,
             "total_gpu_memory": record.total_gpu_memory,
+            "sm_clock_frequency": record.sm_clock_frequency,
+            "memory_clock_frequency": record.memory_clock_frequency,
+            "memory_temperature": record.memory_temperature,
+            "gpu_temperature": record.gpu_temperature,
         }
 
-        for metric_name, value in metric_mapping.items():
-            if value is not None:
-                if metric_name not in self.metrics:
-                    self.metrics[metric_name] = GpuMetricTimeSeries()
-                self.metrics[metric_name].append(value, record.timestamp_ns)
+        # Filter out None values and add as single snapshot
+        valid_metrics = {k: v for k, v in metric_mapping.items() if v is not None}
+        if valid_metrics:
+            self.time_series.append_snapshot(valid_metrics, record.timestamp_ns)
+
+    def get_metric_result(self, metric_name: str, tag: str, header: str, unit: str) -> MetricResult:
+        """Get MetricResult for a specific metric.
+
+        Args:
+            metric_name: Name of the metric to analyze
+            tag: Unique identifier for this metric
+            header: Human-readable name for display
+            unit: Unit of measurement
+
+        Returns:
+            MetricResult with statistical summary for the specified metric
+        """
+        return self.time_series.to_metric_result(metric_name, tag, header, unit)
 
 
 class TelemetryHierarchy(AIPerfBaseModel):
 
@@ -26,4 +26,8 @@
     "DCGM_FI_DEV_GPU_UTIL": "gpu_utilization",
     "DCGM_FI_DEV_FB_USED": "gpu_memory_used",
     "DCGM_FI_DEV_FB_TOTAL": "total_gpu_memory",
+    "DCGM_FI_DEV_SM_CLOCK": "sm_clock_frequency",
+    "DCGM_FI_DEV_MEM_CLOCK": "memory_clock_frequency",
+    "DCGM_FI_DEV_MEMORY_TEMP": "memory_temperature",
+    "DCGM_FI_DEV_GPU_TEMP": "gpu_temperature",
 }
Original file line number	Diff line number	Diff line change
`@@ -26,4 +26,8 @@`
`26`	`26`	`"DCGM_FI_DEV_GPU_UTIL": "gpu_utilization",`
`27`	`27`	`"DCGM_FI_DEV_FB_USED": "gpu_memory_used",`
`28`	`28`	`"DCGM_FI_DEV_FB_TOTAL": "total_gpu_memory",`
	`29`	`+ "DCGM_FI_DEV_SM_CLOCK": "sm_clock_frequency",`
	`30`	`+ "DCGM_FI_DEV_MEM_CLOCK": "memory_clock_frequency",`
	`31`	`+ "DCGM_FI_DEV_MEMORY_TEMP": "memory_temperature",`
	`32`	`+ "DCGM_FI_DEV_GPU_TEMP": "gpu_temperature",`
`29`	`33`	`}`