expose "Normalized Time Per Output Token" (NTPOT) metric #643

Merged · 7 commits · Apr 7, 2025
Changes from 6 commits
1 change: 1 addition & 0 deletions pkg/epp/handlers/server.go
@@ -130,6 +130,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
}
if reqCtx.modelServerStreaming {
logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)
2 changes: 2 additions & 0 deletions pkg/epp/handlers/streamingserver.go
Contributor:
Can you add this to server.go as well?

@@ -184,6 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
reqCtx.ResponseCompleteTimestamp = time.Now()
metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
}

reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
@@ -226,6 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
}
}
}
37 changes: 37 additions & 0 deletions pkg/epp/metrics/metrics.go
@@ -131,6 +131,21 @@ var (
[]string{"model_name"},
)

// NTPOT - Normalized Time Per Output Token
NormalizedTimePerOutputToken = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Subsystem: InferenceModelComponent,
Name: "normalized_time_per_output_token_seconds",
Help: "Inference model latency divided by number of output tokens in seconds for each model and target model.",
// From a few milliseconds per token to multiple seconds per token
Buckets: []float64{
0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"model_name", "target_model_name"},
)

// Inference Pool Metrics
inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
@@ -176,6 +191,7 @@ func Register() {
legacyregistry.MustRegister(inputTokens)
legacyregistry.MustRegister(outputTokens)
legacyregistry.MustRegister(runningRequests)
legacyregistry.MustRegister(NormalizedTimePerOutputToken)

legacyregistry.MustRegister(inferencePoolAvgKVCache)
legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -231,6 +247,27 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
}
}

// RecordNormalizedTimePerOutputToken records the normalized time per output token (NTPOT): request latency divided by the number of output tokens.
func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
if !complete.After(received) {
log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation",
"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
return false
}

if outputTokenCount <= 0 {
log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation",
"modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount)
return false
}

elapsedSeconds := complete.Sub(received).Seconds()
secondsPerToken := elapsedSeconds / float64(outputTokenCount)

NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
return true
}

// IncRunningRequests increases the current running requests.
func IncRunningRequests(modelName string) {
if modelName != "" {
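For intuition, here is a minimal standalone sketch (not part of this diff) of the arithmetic RecordNormalizedTimePerOutputToken performs, using the 1.6 s / 80-token case that also appears in the tests below:

package main

import (
	"fmt"
	"time"
)

func main() {
	received := time.Now()
	complete := received.Add(1600 * time.Millisecond)
	outputTokens := 80

	// Same computation as RecordNormalizedTimePerOutputToken:
	// elapsed wall-clock seconds divided by completion tokens.
	secondsPerToken := complete.Sub(received).Seconds() / float64(outputTokens)

	fmt.Printf("NTPOT = %.3f s/token\n", secondsPerToken) // prints 0.020
}

The resulting 0.02 s/token observation falls exactly on the le="0.02" bucket boundary, so it is counted in that bucket and all larger ones.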
122 changes: 112 additions & 10 deletions pkg/epp/metrics/metrics_test.go
@@ -29,16 +29,17 @@ import (
)

const (
RequestTotalMetric = InferenceModelComponent + "_request_total"
RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total"
RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
RequestSizesMetric = InferenceModelComponent + "_request_sizes"
ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
InputTokensMetric = InferenceModelComponent + "_input_tokens"
OutputTokensMetric = InferenceModelComponent + "_output_tokens"
RunningRequestsMetric = InferenceModelComponent + "_running_requests"
KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
RequestTotalMetric = InferenceModelComponent + "_request_total"
RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total"
RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
RequestSizesMetric = InferenceModelComponent + "_request_sizes"
ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
InputTokensMetric = InferenceModelComponent + "_input_tokens"
OutputTokensMetric = InferenceModelComponent + "_output_tokens"
NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds"
RunningRequestsMetric = InferenceModelComponent + "_running_requests"
KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
)

func TestRecordRequestCounterandSizes(t *testing.T) {
@@ -252,6 +253,107 @@ func TestRecordRequestLatencies(t *testing.T) {
}
}

func TestRecordNormalizedTimePerOutputToken(t *testing.T) {
ctx := logutil.NewTestLoggerIntoContext(context.Background())
timeBaseline := time.Now()
type tokenRequests struct {
modelName string
targetModelName string
receivedTime time.Time
completeTime time.Time
outputTokens int
}
scenarios := []struct {
name string
reqs []tokenRequests
invalid bool
}{
{
name: "multiple requests",
reqs: []tokenRequests{
{
modelName: "m10",
targetModelName: "t10",
receivedTime: timeBaseline,
completeTime: timeBaseline.Add(time.Millisecond * 1000),
outputTokens: 100, // 10ms per token
},
{
modelName: "m10",
targetModelName: "t10",
receivedTime: timeBaseline,
completeTime: timeBaseline.Add(time.Millisecond * 1600),
outputTokens: 80, // 20ms per token
},
{
modelName: "m10",
targetModelName: "t11",
receivedTime: timeBaseline,
completeTime: timeBaseline.Add(time.Millisecond * 6000),
outputTokens: 300, // 20ms per token
},
{
modelName: "m20",
targetModelName: "t20",
receivedTime: timeBaseline,
completeTime: timeBaseline.Add(time.Millisecond * 2400),
outputTokens: 400, // 6ms per token
},
},
},
{
name: "invalid elapsed time",
reqs: []tokenRequests{
{
modelName: "m10",
targetModelName: "t10",
receivedTime: timeBaseline.Add(time.Millisecond * 10),
completeTime: timeBaseline,
outputTokens: 100,
},
},
invalid: true,
},
{
name: "invalid token count",
reqs: []tokenRequests{
{
modelName: "m10",
targetModelName: "t10",
receivedTime: timeBaseline,
completeTime: timeBaseline.Add(time.Millisecond * 1000),
outputTokens: 0, // Invalid: zero tokens
},
},
invalid: true,
},
}
Register()
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
for _, req := range scenario.reqs {
success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
if success == scenario.invalid {
t.Errorf("RecordNormalizedTimePerOutputToken returned %v, want %v", success, !scenario.invalid)
}
}

wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric")
if err != nil {
t.Fatal(err)
}
defer func() {
if err := wantLatencyPerToken.Close(); err != nil {
t.Error(err)
}
}()
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil {
t.Error(err)
}
})
}
}

func TestRecordResponseMetrics(t *testing.T) {
type responses struct {
modelName string
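The golden-file assertion above relies on testutil.GatherAndCompare from prometheus/client_golang, which gathers the registry and diffs it against expected text-exposition output. A rough self-contained sketch of the same pattern, with a hypothetical demo_total counter and assumed imports (not part of this change):

package metrics_example

import (
	"strings"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestGoldenCompare(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := prometheus.NewCounter(prometheus.CounterOpts{Name: "demo_total", Help: "demo"})
	reg.MustRegister(c)
	c.Inc()

	want := `# HELP demo_total demo
# TYPE demo_total counter
demo_total 1
`
	// GatherAndCompare fails if the gathered metrics differ from want.
	if err := testutil.GatherAndCompare(reg, strings.NewReader(want), "demo_total"); err != nil {
		t.Error(err)
	}
}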
50 changes: 50 additions & 0 deletions pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
@@ -0,0 +1,50 @@
# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
# TYPE inference_model_normalized_time_per_output_token_seconds histogram
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1
1 change: 1 addition & 0 deletions site-src/guides/metrics.md
@@ -26,6 +26,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
| inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
| inference_model_normalized_time_per_output_token_seconds | Distribution | Distribution of normalized time per output token (NTPOT), i.e. response latency per output token. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
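As with the other histograms, a windowed mean NTPOT can be derived from this metric as rate(inference_model_normalized_time_per_output_token_seconds_sum[5m]) / rate(inference_model_normalized_time_per_output_token_seconds_count[5m]) (an illustrative PromQL query, not part of this change).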