
Commit 6058b09

expose "Normalized Time Per Output Token" (NTPOT) metric (#643)
* add tpot to inference gateway exposed metrics
* add tpot to inference gateway exposed metrics
* update logging and add ntpot logging to server.go
* update logging and add ntpot logging to server.go
* fix lint error
* change metric name from ntpot to normalized time per output token
* update metrics.md
1 parent 66808d4 commit 6058b09
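For orientation, NTPOT here is the end-to-end request latency divided by the number of output tokens. A minimal sketch of the computation (hypothetical values, mirroring one of the new test cases):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical request: received at t0, response complete 1.6s later, 80 output tokens.
	received := time.Now()
	complete := received.Add(1600 * time.Millisecond)
	outputTokens := 80

	elapsed := complete.Sub(received).Seconds() // 1.6 seconds end-to-end latency
	ntpot := elapsed / float64(outputTokens)    // 1.6 / 80 = 0.02 seconds per output token

	fmt.Printf("NTPOT: %.3f s/token\n", ntpot) // prints: NTPOT: 0.020 s/token
}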

6 files changed: +203, -10 lines

pkg/epp/handlers/server.go (+1)

@@ -129,6 +129,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 			metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 			metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 			metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+			metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 		}
 		if reqCtx.modelServerStreaming {
 			logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)

pkg/epp/handlers/streamingserver.go (+2)

@@ -184,6 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 			reqCtx.ResponseCompleteTimestamp = time.Now()
 			metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
 			metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+			metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 		}

 		reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
@@ -226,6 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 				metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 				metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 				metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+				metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 			}
 		}
 	}

pkg/epp/metrics/metrics.go (+37)

@@ -131,6 +131,21 @@ var (
 		[]string{"model_name"},
 	)

+	// NTPOT - Normalized Time Per Output Token
+	NormalizedTimePerOutputToken = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "normalized_time_per_output_token_seconds",
+			Help:      "Inference model latency divided by number of output tokens in seconds for each model and target model.",
+			// From few milliseconds per token to multiple seconds per token
+			Buckets: []float64{
+				0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
+			},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
+
 	// Inference Pool Metrics
 	inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec(
 		&compbasemetrics.GaugeOpts{
@@ -176,6 +191,7 @@ func Register() {
 	legacyregistry.MustRegister(inputTokens)
 	legacyregistry.MustRegister(outputTokens)
 	legacyregistry.MustRegister(runningRequests)
+	legacyregistry.MustRegister(NormalizedTimePerOutputToken)

 	legacyregistry.MustRegister(inferencePoolAvgKVCache)
 	legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -231,6 +247,27 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
 	}
 }

+// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
+func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
+	if !complete.After(received) {
+		log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation",
+			"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
+		return false
+	}
+
+	if outputTokenCount <= 0 {
+		log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation",
+			"modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount)
+		return false
+	}
+
+	elapsedSeconds := complete.Sub(received).Seconds()
+	secondsPerToken := elapsedSeconds / float64(outputTokenCount)
+
+	NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
+	return true
+}
+
 // IncRunningRequests increases the current running requests.
 func IncRunningRequests(modelName string) {
 	if modelName != "" {

pkg/epp/metrics/metrics_test.go (+112, -10)

@@ -29,16 +29,17 @@ import (
 )

 const (
-	RequestTotalMetric      = InferenceModelComponent + "_request_total"
-	RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total"
-	RequestLatenciesMetric  = InferenceModelComponent + "_request_duration_seconds"
-	RequestSizesMetric      = InferenceModelComponent + "_request_sizes"
-	ResponseSizesMetric     = InferenceModelComponent + "_response_sizes"
-	InputTokensMetric       = InferenceModelComponent + "_input_tokens"
-	OutputTokensMetric      = InferenceModelComponent + "_output_tokens"
-	RunningRequestsMetric   = InferenceModelComponent + "_running_requests"
-	KVCacheAvgUsageMetric   = InferencePoolComponent + "_average_kv_cache_utilization"
-	QueueAvgSizeMetric      = InferencePoolComponent + "_average_queue_size"
+	RequestTotalMetric                 = InferenceModelComponent + "_request_total"
+	RequestErrorTotalMetric            = InferenceModelComponent + "_request_error_total"
+	RequestLatenciesMetric             = InferenceModelComponent + "_request_duration_seconds"
+	RequestSizesMetric                 = InferenceModelComponent + "_request_sizes"
+	ResponseSizesMetric                = InferenceModelComponent + "_response_sizes"
+	InputTokensMetric                  = InferenceModelComponent + "_input_tokens"
+	OutputTokensMetric                 = InferenceModelComponent + "_output_tokens"
+	NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds"
+	RunningRequestsMetric              = InferenceModelComponent + "_running_requests"
+	KVCacheAvgUsageMetric              = InferencePoolComponent + "_average_kv_cache_utilization"
+	QueueAvgSizeMetric                 = InferencePoolComponent + "_average_queue_size"
 )

 func TestRecordRequestCounterandSizes(t *testing.T) {
@@ -252,6 +253,107 @@ func TestRecordRequestLatencies(t *testing.T) {
 	}
 }

+func TestRecordNormalizedTimePerOutputToken(t *testing.T) {
+	ctx := logutil.NewTestLoggerIntoContext(context.Background())
+	timeBaseline := time.Now()
+	type tokenRequests struct {
+		modelName       string
+		targetModelName string
+		receivedTime    time.Time
+		completeTime    time.Time
+		outputTokens    int
+	}
+	scenarios := []struct {
+		name    string
+		reqs    []tokenRequests
+		invalid bool
+	}{
+		{
+			name: "multiple requests",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1000),
+					outputTokens:    100, // 10ms per token
+				},
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1600),
+					outputTokens:    80, // 20ms per token
+				},
+				{
+					modelName:       "m10",
+					targetModelName: "t11",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 6000),
+					outputTokens:    300, // 20ms per token
+				},
+				{
+					modelName:       "m20",
+					targetModelName: "t20",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 2400),
+					outputTokens:    400, // 6ms per token
+				},
+			},
+		},
+		{
+			name: "invalid elapsed time",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline.Add(time.Millisecond * 10),
+					completeTime:    timeBaseline,
+					outputTokens:    100,
+				},
+			},
+			invalid: true,
+		},
+		{
+			name: "invalid token count",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1000),
+					outputTokens:    0, // Invalid: zero tokens
+				},
+			},
+			invalid: true,
+		},
+	}
+	Register()
+	for _, scenario := range scenarios {
+		t.Run(scenario.name, func(t *testing.T) {
+			for _, req := range scenario.reqs {
+				success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
+				if success == scenario.invalid {
+					t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid)
+				}
+			}
+
+			wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric")
+			defer func() {
+				if err := wantLatencyPerToken.Close(); err != nil {
+					t.Error(err)
+				}
+			}()
+			if err != nil {
+				t.Fatal(err)
+			}
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil {
+				t.Error(err)
+			}
+		})
+	}
+}
+
 func TestRecordResponseMetrics(t *testing.T) {
 	type responses struct {
 		modelName string
pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric (+50)

@@ -0,0 +1,50 @@
+# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
+# TYPE inference_model_normalized_time_per_output_token_seconds histogram
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1
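The sums and counts in this golden file follow from the valid test scenario above (the two invalid scenarios record nothing):

m10/t10: 1.0 s / 100 tokens = 0.010 s/token and 1.6 s / 80 tokens = 0.020 s/token → count 2, sum 0.03
m10/t11: 6.0 s / 300 tokens = 0.020 s/token → count 1, sum 0.02
m20/t20: 2.4 s / 400 tokens = 0.006 s/token → count 1, sum 0.006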

site-src/guides/metrics.md (+1)

@@ -26,6 +26,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
 | inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
 | inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
 | inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
+| inference_model_normalized_time_per_output_token_seconds | Distribution | Distribution of normalized time per output token (NTPOT), i.e. response latency per output token. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
 | inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
 | inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
 | inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
