From dc8c0d7c72e9254beda0747afdc080f02c8a6d9a Mon Sep 17 00:00:00 2001
From: kaushikmitr <kaushikmitra.umd@gmail.com>
Date: Wed, 2 Apr 2025 18:19:40 +0000
Subject: [PATCH 1/7] add ntpot to inference gateway exposed metrics

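Add a new ALPHA histogram, inference_model_ntpot_seconds, that records
normalized time per output token (NTPOT): end-to-end request latency
divided by the number of completion tokens. For example, a request that
completes in 1.6s and produces 80 output tokens observes
1.6 / 80 = 0.02 s/token. Requests whose completion timestamp is not
after the received timestamp, or whose output token count is not
positive, are logged as errors and skipped.

As a sketch of how the histogram could be consumed (assuming a standard
Prometheus scrape of the EPP metrics endpoint), per-model p90 NTPOT
might look like:

    histogram_quantile(0.9,
      sum by (le, model_name) (rate(inference_model_ntpot_seconds_bucket[5m])))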
---
 pkg/epp/handlers/streamingserver.go           |   1 +
 pkg/epp/metrics/metrics.go                    |  37 ++++++
 pkg/epp/metrics/metrics_test.go               | 122 ++++++++++++++++--
 pkg/epp/metrics/testdata/ntpot_seconds_metric |  50 +++++++
 site-src/guides/metrics.md                    |   1 +
 5 files changed, 201 insertions(+), 10 deletions(-)
 create mode 100644 pkg/epp/metrics/testdata/ntpot_seconds_metric

diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go
index 874dd734..fc867674 100644
--- a/pkg/epp/handlers/streamingserver.go
+++ b/pkg/epp/handlers/streamingserver.go
@@ -184,6 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 					reqCtx.ResponseCompleteTimestamp = time.Now()
 					metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
 					metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+					metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 				}
 
 				reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go
index 434b8381..209954db 100644
--- a/pkg/epp/metrics/metrics.go
+++ b/pkg/epp/metrics/metrics.go
@@ -131,6 +131,21 @@ var (
 		[]string{"model_name"},
 	)
 
+	// NTPOT - Normalized Time Per Output Token
+	latencyPerOutputToken = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "ntpot_seconds",
+			Help:      "Inference model latency divided by number of output tokens in seconds for each model and target model.",
+			// From a few milliseconds per token to multiple seconds per token
+			Buckets: []float64{
+				0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
+			},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
+
 	// Inference Pool Metrics
 	inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec(
 		&compbasemetrics.GaugeOpts{
@@ -176,6 +191,7 @@ func Register() {
 		legacyregistry.MustRegister(inputTokens)
 		legacyregistry.MustRegister(outputTokens)
 		legacyregistry.MustRegister(runningRequests)
+		legacyregistry.MustRegister(latencyPerOutputToken)
 
 		legacyregistry.MustRegister(inferencePoolAvgKVCache)
 		legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -231,6 +247,27 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
 	}
 }
 
+// RecordLatencyPerOutputToken (NTPOT) records the normalized time per output token.
+func RecordLatencyPerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
+	if !complete.After(received) {
+		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Request latency values are invalid for NTPOT calculation",
+			"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
+		return false
+	}
+	
+	if outputTokenCount <= 0 {
+		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Output token count must be positive for NTPOT calculation",
+			"modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount)
+		return false
+	}
+	
+	elapsedSeconds := complete.Sub(received).Seconds()
+	secondsPerToken := elapsedSeconds / float64(outputTokenCount)
+	
+	latencyPerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
+	return true
+}
+
 // IncRunningRequests increases the current running requests.
 func IncRunningRequests(modelName string) {
 	if modelName != "" {
diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go
index dc4c7044..419de9ed 100644
--- a/pkg/epp/metrics/metrics_test.go
+++ b/pkg/epp/metrics/metrics_test.go
@@ -29,16 +29,17 @@ import (
 )
 
 const (
-	RequestTotalMetric      = InferenceModelComponent + "_request_total"
-	RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total"
-	RequestLatenciesMetric  = InferenceModelComponent + "_request_duration_seconds"
-	RequestSizesMetric      = InferenceModelComponent + "_request_sizes"
-	ResponseSizesMetric     = InferenceModelComponent + "_response_sizes"
-	InputTokensMetric       = InferenceModelComponent + "_input_tokens"
-	OutputTokensMetric      = InferenceModelComponent + "_output_tokens"
-	RunningRequestsMetric   = InferenceModelComponent + "_running_requests"
-	KVCacheAvgUsageMetric   = InferencePoolComponent + "_average_kv_cache_utilization"
-	QueueAvgSizeMetric      = InferencePoolComponent + "_average_queue_size"
+	RequestTotalMetric         = InferenceModelComponent + "_request_total"
+	RequestErrorTotalMetric    = InferenceModelComponent + "_request_error_total"
+	RequestLatenciesMetric     = InferenceModelComponent + "_request_duration_seconds"
+	RequestSizesMetric         = InferenceModelComponent + "_request_sizes"
+	ResponseSizesMetric        = InferenceModelComponent + "_response_sizes"
+	InputTokensMetric          = InferenceModelComponent + "_input_tokens"
+	OutputTokensMetric         = InferenceModelComponent + "_output_tokens"
+	LatencyPerOutputTokenMetric = InferenceModelComponent + "_ntpot_seconds"
+	RunningRequestsMetric      = InferenceModelComponent + "_running_requests"
+	KVCacheAvgUsageMetric      = InferencePoolComponent + "_average_kv_cache_utilization"
+	QueueAvgSizeMetric         = InferencePoolComponent + "_average_queue_size"
 )
 
 func TestRecordRequestCounterandSizes(t *testing.T) {
@@ -252,6 +253,107 @@ func TestRecordRequestLatencies(t *testing.T) {
 	}
 }
 
+func TestRecordLatencyPerOutputToken(t *testing.T) {
+	ctx := logutil.NewTestLoggerIntoContext(context.Background())
+	timeBaseline := time.Now()
+	type tokenRequests struct {
+		modelName       string
+		targetModelName string
+		receivedTime    time.Time
+		completeTime    time.Time
+		outputTokens    int
+	}
+	scenarios := []struct {
+		name    string
+		reqs    []tokenRequests
+		invalid bool
+	}{
+		{
+			name: "multiple requests",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1000),
+					outputTokens:    100, // 10ms per token
+				},
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1600),
+					outputTokens:    80, // 20ms per token
+				},
+				{
+					modelName:       "m10",
+					targetModelName: "t11",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 6000),
+					outputTokens:    300, // 20ms per token
+				},
+				{
+					modelName:       "m20",
+					targetModelName: "t20",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 2400),
+					outputTokens:    400, // 6ms per token
+				},
+			},
+		},
+		{
+			name: "invalid elapsed time",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline.Add(time.Millisecond * 10),
+					completeTime:    timeBaseline,
+					outputTokens:    100,
+				},
+			},
+			invalid: true,
+		},
+		{
+			name: "invalid token count",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1000),
+					outputTokens:    0, // Invalid: zero tokens
+				},
+			},
+			invalid: true,
+		},
+	}
+	Register()
+	for _, scenario := range scenarios {
+		t.Run(scenario.name, func(t *testing.T) {
+			for _, req := range scenario.reqs {
+				success := RecordLatencyPerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
+				if success == scenario.invalid {
+					t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid)
+				}
+			}
+
+			wantLatencyPerToken, err := os.Open("testdata/ntpot_seconds_metric")
+			defer func() {
+				if err := wantLatencyPerToken.Close(); err != nil {
+					t.Error(err)
+				}
+			}()
+			if err != nil {
+				t.Fatal(err)
+			}
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, LatencyPerOutputTokenMetric); err != nil {
+				t.Error(err)
+			}
+		})
+	}
+}
+
 func TestRecordResponseMetrics(t *testing.T) {
 	type responses struct {
 		modelName       string
diff --git a/pkg/epp/metrics/testdata/ntpot_seconds_metric b/pkg/epp/metrics/testdata/ntpot_seconds_metric
new file mode 100644
index 00000000..a9101972
--- /dev/null
+++ b/pkg/epp/metrics/testdata/ntpot_seconds_metric
@@ -0,0 +1,50 @@
+# HELP inference_model_ntpot_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
+# TYPE inference_model_ntpot_seconds histogram
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
+inference_model_ntpot_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
+inference_model_ntpot_seconds_count{model_name="m10", target_model_name="t10"} 2
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
+inference_model_ntpot_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
+inference_model_ntpot_seconds_count{model_name="m10", target_model_name="t11"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
+inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
+inference_model_ntpot_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
+inference_model_ntpot_seconds_count{model_name="m20", target_model_name="t20"} 1
diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
index a781f721..dde10fb3 100644
--- a/site-src/guides/metrics.md
+++ b/site-src/guides/metrics.md
@@ -26,6 +26,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
 | inference_model_request_total                | Counter          | The counter of requests broken out for each model.                | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_request_error_total          | Counter          | The counter of requests errors broken out for each model.         | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_request_duration_seconds     | Distribution     | Distribution of response latency.                                 | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
+| ntpot_seconds     | Distribution     | Distribution of ntpot (response latency per output token)                                 | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_request_sizes                | Distribution     | Distribution of request size in bytes.                            | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_response_sizes               | Distribution     | Distribution of response size in bytes.                           | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_input_tokens                 | Distribution     | Distribution of input token count.                                | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |

From 6eeab72c7f80932df88a8df6f36fd723eda444bb Mon Sep 17 00:00:00 2001
From: kaushikmitr <kaushikmitra.umd@gmail.com>
Date: Wed, 2 Apr 2025 18:19:40 +0000
Subject: [PATCH 2/7] add ntpot to inference gateway exposed metrics

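Record NTPOT at the second response-body site in streamingserver.go as
well, alongside the existing input and output token metrics.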
---
 pkg/epp/handlers/streamingserver.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go
index fc867674..01cb605a 100644
--- a/pkg/epp/handlers/streamingserver.go
+++ b/pkg/epp/handlers/streamingserver.go
@@ -227,6 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 						metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 						metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 						metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+						metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 					}
 				}
 			}

From 5121c51d33e6a29d48377a84f253cb6dbf92da08 Mon Sep 17 00:00:00 2001
From: kaushikmitr <kaushikmitra.umd@gmail.com>
Date: Wed, 2 Apr 2025 22:27:16 +0000
Subject: [PATCH 3/7] update logging and add ntpot logging to server.go

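Add NTPOT recording to the response path in pkg/epp/handlers/server.go,
alongside the other per-response metrics.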
---
 pkg/epp/handlers/server.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index a92f091c..b8f29542 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -130,6 +130,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 				metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 				metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 				metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+				metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 			}
 			if reqCtx.modelServerStreaming {
 				logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)

From f887ef6ea46299033fe95f3b99f8c5728f05f4dc Mon Sep 17 00:00:00 2001
From: kaushikmitr <kaushikmitra.umd@gmail.com>
Date: Wed, 2 Apr 2025 22:27:16 +0000
Subject: [PATCH 4/7] update logging and add ntpot logging to server.go

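Drop the V(logutil.DEFAULT) wrapper on the error logs for invalid NTPOT
inputs in RecordLatencyPerOutputToken.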
---
 pkg/epp/metrics/metrics.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go
index 209954db..b849d0a1 100644
--- a/pkg/epp/metrics/metrics.go
+++ b/pkg/epp/metrics/metrics.go
@@ -250,13 +250,13 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
 // RecordLatencyPerOutputToken (NTPOT) records the normalized time per output token.
 func RecordLatencyPerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
 	if !complete.After(received) {
-		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Request latency values are invalid for NTPOT calculation",
+		log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation",
 			"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
 		return false
 	}
 	
 	if outputTokenCount <= 0 {
-		log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Output token count must be positive for NTPOT calculation",
+		log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation",
 			"modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount)
 		return false
 	}

From 9857871238d7c9f18795fa771c99d43485edb06c Mon Sep 17 00:00:00 2001
From: kaushikmitr <kaushikmitra.umd@gmail.com>
Date: Thu, 3 Apr 2025 05:15:41 +0000
Subject: [PATCH 5/7] fix lint error

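Realign the metric-name const block in metrics_test.go and strip
trailing whitespace in metrics.go. No functional change.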
---
 pkg/epp/metrics/metrics.go      |  6 +++---
 pkg/epp/metrics/metrics_test.go | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go
index b849d0a1..7164396f 100644
--- a/pkg/epp/metrics/metrics.go
+++ b/pkg/epp/metrics/metrics.go
@@ -254,16 +254,16 @@ func RecordLatencyPerOutputToken(ctx context.Context, modelName, targetModelName
 			"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
 		return false
 	}
-	
+
 	if outputTokenCount <= 0 {
 		log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation",
 			"modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount)
 		return false
 	}
-	
+
 	elapsedSeconds := complete.Sub(received).Seconds()
 	secondsPerToken := elapsedSeconds / float64(outputTokenCount)
-	
+
 	latencyPerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
 	return true
 }
diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go
index 419de9ed..e92cd411 100644
--- a/pkg/epp/metrics/metrics_test.go
+++ b/pkg/epp/metrics/metrics_test.go
@@ -29,17 +29,17 @@ import (
 )
 
 const (
-	RequestTotalMetric         = InferenceModelComponent + "_request_total"
-	RequestErrorTotalMetric    = InferenceModelComponent + "_request_error_total"
-	RequestLatenciesMetric     = InferenceModelComponent + "_request_duration_seconds"
-	RequestSizesMetric         = InferenceModelComponent + "_request_sizes"
-	ResponseSizesMetric        = InferenceModelComponent + "_response_sizes"
-	InputTokensMetric          = InferenceModelComponent + "_input_tokens"
-	OutputTokensMetric         = InferenceModelComponent + "_output_tokens"
+	RequestTotalMetric          = InferenceModelComponent + "_request_total"
+	RequestErrorTotalMetric     = InferenceModelComponent + "_request_error_total"
+	RequestLatenciesMetric      = InferenceModelComponent + "_request_duration_seconds"
+	RequestSizesMetric          = InferenceModelComponent + "_request_sizes"
+	ResponseSizesMetric         = InferenceModelComponent + "_response_sizes"
+	InputTokensMetric           = InferenceModelComponent + "_input_tokens"
+	OutputTokensMetric          = InferenceModelComponent + "_output_tokens"
 	LatencyPerOutputTokenMetric = InferenceModelComponent + "_ntpot_seconds"
-	RunningRequestsMetric      = InferenceModelComponent + "_running_requests"
-	KVCacheAvgUsageMetric      = InferencePoolComponent + "_average_kv_cache_utilization"
-	QueueAvgSizeMetric         = InferencePoolComponent + "_average_queue_size"
+	RunningRequestsMetric       = InferenceModelComponent + "_running_requests"
+	KVCacheAvgUsageMetric       = InferencePoolComponent + "_average_kv_cache_utilization"
+	QueueAvgSizeMetric          = InferencePoolComponent + "_average_queue_size"
 )
 
 func TestRecordRequestCounterandSizes(t *testing.T) {

From ba36b6e2f3a0ae5f11764ba26eeac65a35914f90 Mon Sep 17 00:00:00 2001
From: kaushikmitr <kaushikmitra.umd@gmail.com>
Date: Thu, 3 Apr 2025 20:20:49 +0000
Subject: [PATCH 6/7] change metric name from ntpot to normalized time per
 output token

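Spell the metric name out: rename inference_model_ntpot_seconds to
inference_model_normalized_time_per_output_token_seconds, and rename
the recording helper, histogram variable, test, and testdata fixture to
match. With the new name, mean NTPOT over a window could be computed
with a ratio query along the lines of:

    rate(inference_model_normalized_time_per_output_token_seconds_sum[5m])
      / rate(inference_model_normalized_time_per_output_token_seconds_count[5m])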
---
 pkg/epp/handlers/server.go                    |  2 +-
 pkg/epp/handlers/streamingserver.go           |  4 +-
 pkg/epp/metrics/metrics.go                    | 12 ++---
 pkg/epp/metrics/metrics_test.go               | 30 +++++------
 ...lized_time_per_output_token_seconds_metric | 50 +++++++++++++++++++
 pkg/epp/metrics/testdata/ntpot_seconds_metric | 50 -------------------
 6 files changed, 74 insertions(+), 74 deletions(-)
 create mode 100644 pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
 delete mode 100644 pkg/epp/metrics/testdata/ntpot_seconds_metric

diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index b8f29542..8aca4b20 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -130,7 +130,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 				metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 				metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 				metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
-				metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
+				metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 			}
 			if reqCtx.modelServerStreaming {
 				logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)
diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go
index 01cb605a..37392167 100644
--- a/pkg/epp/handlers/streamingserver.go
+++ b/pkg/epp/handlers/streamingserver.go
@@ -184,7 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 					reqCtx.ResponseCompleteTimestamp = time.Now()
 					metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
 					metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
-					metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
+					metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 				}
 
 				reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
@@ -227,7 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 						metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 						metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 						metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
-						metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
+						metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 					}
 				}
 			}
diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go
index 7164396f..b474df36 100644
--- a/pkg/epp/metrics/metrics.go
+++ b/pkg/epp/metrics/metrics.go
@@ -132,10 +132,10 @@ var (
 	)
 
 	// NTPOT - Normalized Time Per Output Token
-	latencyPerOutputToken = compbasemetrics.NewHistogramVec(
+	NormalizedTimePerOutputToken = compbasemetrics.NewHistogramVec(
 		&compbasemetrics.HistogramOpts{
 			Subsystem: InferenceModelComponent,
-			Name:      "ntpot_seconds",
+			Name:      "normalized_time_per_output_token_seconds",
 			Help:      "Inference model latency divided by number of output tokens in seconds for each model and target model.",
 			// From a few milliseconds per token to multiple seconds per token
 			Buckets: []float64{
@@ -191,7 +191,7 @@ func Register() {
 		legacyregistry.MustRegister(inputTokens)
 		legacyregistry.MustRegister(outputTokens)
 		legacyregistry.MustRegister(runningRequests)
-		legacyregistry.MustRegister(latencyPerOutputToken)
+		legacyregistry.MustRegister(NormalizedTimePerOutputToken)
 
 		legacyregistry.MustRegister(inferencePoolAvgKVCache)
 		legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -247,8 +247,8 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
 	}
 }
 
-// RecordLatencyPerOutputToken (NTPOT) records the normalized time per output token.
-func RecordLatencyPerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
+// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
+func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
 	if !complete.After(received) {
 		log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation",
 			"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
@@ -264,7 +264,7 @@ func RecordLatencyPerOutputToken(ctx context.Context, modelName, targetModelName
 	elapsedSeconds := complete.Sub(received).Seconds()
 	secondsPerToken := elapsedSeconds / float64(outputTokenCount)
 
-	latencyPerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
+	NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
 	return true
 }
 
diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go
index e92cd411..b5f19e6d 100644
--- a/pkg/epp/metrics/metrics_test.go
+++ b/pkg/epp/metrics/metrics_test.go
@@ -29,17 +29,17 @@ import (
 )
 
 const (
-	RequestTotalMetric          = InferenceModelComponent + "_request_total"
-	RequestErrorTotalMetric     = InferenceModelComponent + "_request_error_total"
-	RequestLatenciesMetric      = InferenceModelComponent + "_request_duration_seconds"
-	RequestSizesMetric          = InferenceModelComponent + "_request_sizes"
-	ResponseSizesMetric         = InferenceModelComponent + "_response_sizes"
-	InputTokensMetric           = InferenceModelComponent + "_input_tokens"
-	OutputTokensMetric          = InferenceModelComponent + "_output_tokens"
-	LatencyPerOutputTokenMetric = InferenceModelComponent + "_ntpot_seconds"
-	RunningRequestsMetric       = InferenceModelComponent + "_running_requests"
-	KVCacheAvgUsageMetric       = InferencePoolComponent + "_average_kv_cache_utilization"
-	QueueAvgSizeMetric          = InferencePoolComponent + "_average_queue_size"
+	RequestTotalMetric                 = InferenceModelComponent + "_request_total"
+	RequestErrorTotalMetric            = InferenceModelComponent + "_request_error_total"
+	RequestLatenciesMetric             = InferenceModelComponent + "_request_duration_seconds"
+	RequestSizesMetric                 = InferenceModelComponent + "_request_sizes"
+	ResponseSizesMetric                = InferenceModelComponent + "_response_sizes"
+	InputTokensMetric                  = InferenceModelComponent + "_input_tokens"
+	OutputTokensMetric                 = InferenceModelComponent + "_output_tokens"
+	NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds"
+	RunningRequestsMetric              = InferenceModelComponent + "_running_requests"
+	KVCacheAvgUsageMetric              = InferencePoolComponent + "_average_kv_cache_utilization"
+	QueueAvgSizeMetric                 = InferencePoolComponent + "_average_queue_size"
 )
 
 func TestRecordRequestCounterandSizes(t *testing.T) {
@@ -253,7 +253,7 @@ func TestRecordRequestLatencies(t *testing.T) {
 	}
 }
 
-func TestRecordLatencyPerOutputToken(t *testing.T) {
+func TestRecordNormalizedTimePerOutputToken(t *testing.T) {
 	ctx := logutil.NewTestLoggerIntoContext(context.Background())
 	timeBaseline := time.Now()
 	type tokenRequests struct {
@@ -332,13 +332,13 @@ func TestRecordLatencyPerOutputToken(t *testing.T) {
 	for _, scenario := range scenarios {
 		t.Run(scenario.name, func(t *testing.T) {
 			for _, req := range scenario.reqs {
-				success := RecordLatencyPerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
+				success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
 				if success == scenario.invalid {
 					t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid)
 				}
 			}
 
-			wantLatencyPerToken, err := os.Open("testdata/ntpot_seconds_metric")
+			wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric")
 			defer func() {
 				if err := wantLatencyPerToken.Close(); err != nil {
 					t.Error(err)
@@ -347,7 +347,7 @@ func TestRecordLatencyPerOutputToken(t *testing.T) {
 			if err != nil {
 				t.Fatal(err)
 			}
-			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, LatencyPerOutputTokenMetric); err != nil {
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil {
 				t.Error(err)
 			}
 		})
diff --git a/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
new file mode 100644
index 00000000..bb6e9373
--- /dev/null
+++ b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
@@ -0,0 +1,50 @@
+# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
+# TYPE inference_model_normalized_time_per_output_token_seconds histogram
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1
diff --git a/pkg/epp/metrics/testdata/ntpot_seconds_metric b/pkg/epp/metrics/testdata/ntpot_seconds_metric
deleted file mode 100644
index a9101972..00000000
--- a/pkg/epp/metrics/testdata/ntpot_seconds_metric
+++ /dev/null
@@ -1,50 +0,0 @@
-# HELP inference_model_ntpot_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
-# TYPE inference_model_ntpot_seconds histogram
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
-inference_model_ntpot_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
-inference_model_ntpot_seconds_count{model_name="m10", target_model_name="t10"} 2
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
-inference_model_ntpot_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
-inference_model_ntpot_seconds_count{model_name="m10", target_model_name="t11"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
-inference_model_ntpot_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
-inference_model_ntpot_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
-inference_model_ntpot_seconds_count{model_name="m20", target_model_name="t20"} 1

From c51ada25ab0c425a614c04f5e25f5e76a4f1f0cd Mon Sep 17 00:00:00 2001
From: kaushikmitr <kaushikmitra.umd@gmail.com>
Date: Mon, 7 Apr 2025 00:04:11 +0000
Subject: [PATCH 7/7] update metrics.md

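Point the metrics guide at the renamed metric, using its full
inference_model_-prefixed name, and tighten the row's description.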
---
 site-src/guides/metrics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
index dde10fb3..d16c7d47 100644
--- a/site-src/guides/metrics.md
+++ b/site-src/guides/metrics.md
@@ -26,7 +26,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
 | inference_model_request_total                | Counter          | The counter of requests broken out for each model.                | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_request_error_total          | Counter          | The counter of requests errors broken out for each model.         | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_request_duration_seconds     | Distribution     | Distribution of response latency.                                 | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
-| ntpot_seconds     | Distribution     | Distribution of ntpot (response latency per output token)                                 | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
+| inference_model_normalized_time_per_output_token_seconds | Distribution     | Distribution of normalized time per output token (NTPOT), i.e. response latency per output token. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_request_sizes                | Distribution     | Distribution of request size in bytes.                            | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_response_sizes               | Distribution     | Distribution of response size in bytes.                           | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |
 | inference_model_input_tokens                 | Distribution     | Distribution of input token count.                                | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA       |