Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
257 changes: 257 additions & 0 deletions pkg/epp/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,104 @@ var (
[]string{"model_name", "target_model_name", "error_code"},
)

// inferenceGauges is a single gauge family shared by several per-request
// values; the "type" label identifies which value a given series carries.
inferenceGauges = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "inference_request_metric",
		Help: metricsutil.HelpMsgWithStability(
			"Consolidated gauge for various inference request metrics including TTFT, TPOT, SLOs, and prediction durations.",
			compbasemetrics.ALPHA,
		),
	},
	[]string{"model_name", "target_model_name", "type"},
)

// requestTTFT is the distribution of TTFT values, in seconds, labelled by
// model and target model.
requestTTFT = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "request_ttft_seconds",
		Help: metricsutil.HelpMsgWithStability(
			"Inference model TTFT distribution in seconds for each model and target model.",
			compbasemetrics.ALPHA,
		),
		Buckets: []float64{
			// 5ms up to 10s.
			0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 4, 5, 6, 8, 10,
			// 15s up to one hour.
			15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600,
		},
	},
	[]string{"model_name", "target_model_name"},
)

// requestPredictedTTFT is the distribution of predicted TTFT values, in
// seconds, labelled by model and target model. Its bucket boundaries are the
// same list used by requestTTFT.
requestPredictedTTFT = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "request_predicted_ttft_seconds",
		Help: metricsutil.HelpMsgWithStability(
			"Inference model Predicted TTFT distribution in seconds for each model and target model.",
			compbasemetrics.ALPHA,
		),
		Buckets: []float64{
			// 5ms up to 10s.
			0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 4, 5, 6, 8, 10,
			// 15s up to one hour.
			15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600,
		},
	},
	[]string{"model_name", "target_model_name"},
)

// requestTTFTPredictionDuration is the distribution of the time spent
// producing a TTFT prediction, in seconds, labelled by model and target model.
requestTTFTPredictionDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "request_ttft_prediction_duration_seconds",
		Help: metricsutil.HelpMsgWithStability(
			"Duration taken to generate TTFT predictions in seconds for each model and target model.",
			compbasemetrics.ALPHA,
		),
		Buckets: []float64{
			// 100µs up to 10ms.
			0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01,
			// 20ms up to 5s.
			0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0,
		},
	},
	[]string{"model_name", "target_model_name"},
)

// requestTPOT is the distribution of TPOT values, in seconds, labelled by
// model and target model.
//
// Each bucket boundary is one tenth of the corresponding requestTTFT boundary.
// The second boundary was previously written as 0.00205, which broke that
// pattern; it is 0.0025 here.
requestTPOT = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "request_tpot_seconds",
		Help: metricsutil.HelpMsgWithStability(
			"Inference model TPOT distribution in seconds for each model and target model.",
			compbasemetrics.ALPHA,
		),
		Buckets: []float64{
			0.0005, 0.0025, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.125, 0.15, 0.2, 0.3,
			0.4, 0.5, 0.6, 0.8, 1, 1.5, 2, 3, 4.5, 6, 12, 18, 24, 30, 36, 48, 60, 90, 120, 180, 270, 360,
		},
	},
	[]string{"model_name", "target_model_name"},
)

// requestPredictedTPOT is the distribution of predicted TPOT values, in
// seconds, labelled by model and target model.
//
// Each bucket boundary is one tenth of the corresponding TTFT histogram
// boundary. The second boundary was previously written as 0.00205, which broke
// that pattern; it is 0.0025 here.
requestPredictedTPOT = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "request_predicted_tpot_seconds",
		Help: metricsutil.HelpMsgWithStability(
			"Inference model Predicted TPOT distribution in seconds for each model and target model.",
			compbasemetrics.ALPHA,
		),
		Buckets: []float64{
			0.0005, 0.0025, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.125, 0.15, 0.2, 0.3,
			0.4, 0.5, 0.6, 0.8, 1, 1.5, 2, 3, 4.5, 6, 12, 18, 24, 30, 36, 48, 60, 90, 120, 180, 270, 360,
		},
	},
	[]string{"model_name", "target_model_name"},
)

// requestTPOTPredictionDuration is the distribution of the time spent
// producing a TPOT prediction, in seconds, labelled by model and target model.
requestTPOTPredictionDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "request_tpot_prediction_duration_seconds",
		Help: metricsutil.HelpMsgWithStability(
			"Duration taken to generate TPOT predictions in seconds for each model and target model.",
			compbasemetrics.ALPHA,
		),
		Buckets: []float64{
			// 100µs up to 10ms.
			0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01,
			// 20ms up to 5s.
			0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0,
		},
	},
	[]string{"model_name", "target_model_name"},
)

// sloViolationCounter counts SLO violations. The "type" label names the kind
// of violation being counted.
sloViolationCounter = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: InferenceObjectiveComponent,
		Name:      "request_slo_violation_total",
		Help: metricsutil.HelpMsgWithStability(
			"Counter of SLO violations for each model, target model, and violation type.",
			compbasemetrics.ALPHA,
		),
	},
	[]string{"model_name", "target_model_name", "type"},
)

requestLatencies = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: InferenceObjectiveComponent,
Expand Down Expand Up @@ -282,6 +380,21 @@ var registerMetrics sync.Once
// Register all metrics.
func Register(customCollectors ...prometheus.Collector) {
registerMetrics.Do(func() {
// Register inference gauges
metrics.Registry.MustRegister(inferenceGauges)

// Register Histograms
metrics.Registry.MustRegister(requestTPOT)
metrics.Registry.MustRegister(requestTTFT)
metrics.Registry.MustRegister(requestPredictedTPOT)
metrics.Registry.MustRegister(requestPredictedTTFT)
metrics.Registry.MustRegister(requestTPOTPredictionDuration)
metrics.Registry.MustRegister(requestTTFTPredictionDuration)

// Register SLO violation counters
metrics.Registry.MustRegister(sloViolationCounter)

// Register other metrics
metrics.Registry.MustRegister(requestCounter)
metrics.Registry.MustRegister(requestErrCounter)
metrics.Registry.MustRegister(requestLatencies)
Expand Down Expand Up @@ -311,6 +424,21 @@ func Register(customCollectors ...prometheus.Collector) {

// Just for integration test
func Reset() {
// Reset inference gauges
inferenceGauges.Reset()

// Reset Histograms
requestTPOT.Reset()
requestTTFT.Reset()
requestPredictedTPOT.Reset()
requestPredictedTTFT.Reset()
requestTPOTPredictionDuration.Reset()
requestTTFTPredictionDuration.Reset()

// Reset SLO violation counter
sloViolationCounter.Reset()

// Reset other metrics
requestCounter.Reset()
requestErrCounter.Reset()
requestLatencies.Reset()
Expand Down Expand Up @@ -363,6 +491,123 @@ func RecordRequestLatencies(ctx context.Context, modelName, targetModelName stri
return true
}

// RecordRequestTPOT publishes a TPOT value for the given model and target
// model: it is observed in the request_tpot_seconds histogram and stored as
// the current value of the "tpot" series of the consolidated gauge.
//
// It returns false, after logging an error and without touching any metric,
// when tpot is negative; otherwise it returns true.
func RecordRequestTPOT(ctx context.Context, modelName, targetModelName string, tpot float64) bool {
	if tpot < 0 {
		logger := log.FromContext(ctx).V(logutil.DEFAULT)
		logger.Error(nil, "TPOT value must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "tpot", tpot)
		return false
	}

	histogram := requestTPOT.WithLabelValues(modelName, targetModelName)
	histogram.Observe(tpot)

	// Label values are positional: model_name, target_model_name, type.
	latest := inferenceGauges.WithLabelValues(modelName, targetModelName, "tpot")
	latest.Set(tpot)
	return true
}

// RecordRequestTPOTWithSLO compares a TPOT value against sloThreshold and
// publishes the outcome. It does not observe the TPOT histogram or update the
// "tpot" gauge series; RecordRequestTPOT does that.
//
// When tpot is strictly greater than sloThreshold, the "tpot_slo_violation"
// gauge series is set to 1, the SLO violation counter with type "tpot" is
// incremented, and the violation is logged. Otherwise the gauge series is set
// to 0.
//
// It returns false, after logging an error and without touching any metric,
// when tpot is negative; otherwise it returns true.
func RecordRequestTPOTWithSLO(ctx context.Context, modelName, targetModelName string, tpot float64, sloThreshold float64) bool {
	if tpot < 0 {
		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TPOT value must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "tpot", tpot)
		return false
	}

	// Label values are positional: model_name, target_model_name, type.
	violationGauge := inferenceGauges.WithLabelValues(modelName, targetModelName, "tpot_slo_violation")

	if tpot > sloThreshold {
		violationGauge.Set(1)
		sloViolationCounter.WithLabelValues(modelName, targetModelName, "tpot").Inc()
		log.FromContext(ctx).V(logutil.DEFAULT).Info("TPOT SLO violation detected",
			"modelName", modelName, "targetModelName", targetModelName, "tpot", tpot, "threshold", sloThreshold)
		return true
	}

	violationGauge.Set(0)
	return true
}

// RecordRequestPredictedTPOT publishes a predicted TPOT value for the given
// model and target model: it is observed in the request_predicted_tpot_seconds
// histogram and stored as the current value of the "predicted_tpot" series of
// the consolidated gauge.
//
// It returns false, after logging an error and without touching any metric,
// when the predicted value is negative; otherwise it returns true.
func RecordRequestPredictedTPOT(ctx context.Context, modelName, targetModelName string, predictedTPOT float64) bool {
	if predictedTPOT < 0 {
		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Predicted TPOT value must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "tpot", predictedTPOT)
		return false
	}

	requestPredictedTPOT.WithLabelValues(modelName, targetModelName).Observe(predictedTPOT)
	// Label values are positional: model_name, target_model_name, type.
	inferenceGauges.WithLabelValues(modelName, targetModelName, "predicted_tpot").Set(predictedTPOT)
	return true
}

// RecordRequestTPOTPredictionDuration publishes how long a TPOT prediction
// took: the duration is observed in the
// request_tpot_prediction_duration_seconds histogram and stored as the current
// value of the "tpot_prediction_duration" series of the consolidated gauge.
//
// It returns false, after logging an error and without touching any metric,
// when duration is negative; otherwise it returns true.
func RecordRequestTPOTPredictionDuration(ctx context.Context, modelName, targetModelName string, duration float64) bool {
	if duration < 0 {
		logger := log.FromContext(ctx).V(logutil.DEFAULT)
		logger.Error(nil, "TPOT prediction duration must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "duration", duration)
		return false
	}

	histogram := requestTPOTPredictionDuration.WithLabelValues(modelName, targetModelName)
	histogram.Observe(duration)

	// Label values are positional: model_name, target_model_name, type.
	latest := inferenceGauges.WithLabelValues(modelName, targetModelName, "tpot_prediction_duration")
	latest.Set(duration)
	return true
}

// RecordRequestTTFT publishes a TTFT value for the given model and target
// model: it is observed in the request_ttft_seconds histogram and stored as
// the current value of the "ttft" series of the consolidated gauge.
//
// It returns false, after logging an error and without touching any metric,
// when ttft is negative; otherwise it returns true.
func RecordRequestTTFT(ctx context.Context, modelName, targetModelName string, ttft float64) bool {
	if ttft < 0 {
		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TTFT value must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "ttft", ttft)
		return false
	}

	requestTTFT.WithLabelValues(modelName, targetModelName).Observe(ttft)
	// Label values are positional: model_name, target_model_name, type.
	inferenceGauges.WithLabelValues(modelName, targetModelName, "ttft").Set(ttft)
	return true
}

// RecordRequestTTFTWithSLO compares a TTFT value against sloThreshold and
// publishes the outcome. It does not observe the TTFT histogram or update the
// "ttft" gauge series; RecordRequestTTFT does that.
//
// When ttft is strictly greater than sloThreshold, the "ttft_slo_violation"
// gauge series is set to 1, the SLO violation counter with type "ttft" is
// incremented, and the violation is logged. Otherwise the gauge series is set
// to 0.
//
// It returns false, after logging an error and without touching any metric,
// when ttft is negative; otherwise it returns true.
func RecordRequestTTFTWithSLO(ctx context.Context, modelName, targetModelName string, ttft float64, sloThreshold float64) bool {
	if ttft < 0 {
		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TTFT value must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "ttft", ttft)
		return false
	}

	// Label values are positional: model_name, target_model_name, type.
	violationGauge := inferenceGauges.WithLabelValues(modelName, targetModelName, "ttft_slo_violation")

	if ttft > sloThreshold {
		violationGauge.Set(1)
		sloViolationCounter.WithLabelValues(modelName, targetModelName, "ttft").Inc()
		log.FromContext(ctx).V(logutil.DEFAULT).Info("TTFT SLO violation detected",
			"modelName", modelName, "targetModelName", targetModelName, "ttft", ttft, "threshold", sloThreshold)
		return true
	}

	violationGauge.Set(0)
	return true
}

// RecordRequestPredictedTTFT publishes a predicted TTFT value for the given
// model and target model: it is observed in the request_predicted_ttft_seconds
// histogram and stored as the current value of the "predicted_ttft" series of
// the consolidated gauge.
//
// It returns false, after logging an error and without touching any metric,
// when the predicted value is negative; otherwise it returns true.
func RecordRequestPredictedTTFT(ctx context.Context, modelName, targetModelName string, predictedTTFT float64) bool {
	if predictedTTFT < 0 {
		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Predicted TTFT value must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "ttft", predictedTTFT)
		return false
	}

	requestPredictedTTFT.WithLabelValues(modelName, targetModelName).Observe(predictedTTFT)
	// Label values are positional: model_name, target_model_name, type.
	inferenceGauges.WithLabelValues(modelName, targetModelName, "predicted_ttft").Set(predictedTTFT)
	return true
}

// RecordRequestTTFTPredictionDuration publishes how long a TTFT prediction
// took: the duration is observed in the
// request_ttft_prediction_duration_seconds histogram and stored as the current
// value of the "ttft_prediction_duration" series of the consolidated gauge.
//
// It returns false, after logging an error and without touching any metric,
// when duration is negative; otherwise it returns true.
func RecordRequestTTFTPredictionDuration(ctx context.Context, modelName, targetModelName string, duration float64) bool {
	if duration < 0 {
		logger := log.FromContext(ctx).V(logutil.DEFAULT)
		logger.Error(nil, "TTFT prediction duration must be non-negative",
			"modelName", modelName, "targetModelName", targetModelName, "duration", duration)
		return false
	}

	histogram := requestTTFTPredictionDuration.WithLabelValues(modelName, targetModelName)
	histogram.Observe(duration)

	// Label values are positional: model_name, target_model_name, type.
	latest := inferenceGauges.WithLabelValues(modelName, targetModelName, "ttft_prediction_duration")
	latest.Set(duration)
	return true
}

// RecordResponseSizes records the response sizes.
func RecordResponseSizes(modelName, targetModelName string, size int) {
responseSizes.WithLabelValues(modelName, targetModelName).Observe(float64(size))
Expand Down Expand Up @@ -480,3 +725,15 @@ func IncFlowControlQueueSize(fairnessID, priority string) {
func DecFlowControlQueueSize(fairnessID, priority string) {
	// Look up the series for this fairness ID / priority pair, then lower it by one.
	queueSize := flowControlQueueSize.WithLabelValues(fairnessID, priority)
	queueSize.Dec()
}

// SetTTFTSLOThreshold publishes the TTFT SLO threshold for a model and target
// model by storing it in the "ttft_slo_threshold" series of the consolidated
// gauge, so the threshold currently in effect can be read from metrics.
func SetTTFTSLOThreshold(modelName, targetModelName string, threshold float64) {
	// Label values are positional: model_name, target_model_name, type.
	thresholdGauge := inferenceGauges.WithLabelValues(modelName, targetModelName, "ttft_slo_threshold")
	thresholdGauge.Set(threshold)
}

// SetTPOTSLOThreshold publishes the TPOT SLO threshold for a model and target
// model by storing it in the "tpot_slo_threshold" series of the consolidated
// gauge, so the threshold currently in effect can be read from metrics.
func SetTPOTSLOThreshold(modelName, targetModelName string, threshold float64) {
	// Label values are positional: model_name, target_model_name, type.
	thresholdGauge := inferenceGauges.WithLabelValues(modelName, targetModelName, "tpot_slo_threshold")
	thresholdGauge.Set(threshold)
}
2 changes: 2 additions & 0 deletions pkg/epp/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ const (
// Metric names under the inference pool component: the component prefix
// followed by the metric's own suffix.
KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
PerPodQueueSizeMetrics = InferencePoolComponent + "_per_pod_queue_size"
// Metric names for the TTFT and TPOT request histograms under the inference
// objective component.
RequestTTFTSecondsMetric = InferenceObjectiveComponent + "_request_ttft_seconds"
RequestTPOTSecondsMetric = InferenceObjectiveComponent + "_request_tpot_seconds"
)

func TestMain(m *testing.M) {
Expand Down
Loading