From 4fd6ab6ceef0c48a123a51c193a2dc9597031dda Mon Sep 17 00:00:00 2001
From: Cong Liu
Date: Wed, 19 Mar 2025 10:39:12 -0700
Subject: [PATCH 1/2] Document model server compatibility and config options

---
 config/charts/inferencepool/README.md      | 14 ++++++-
 .../templates/epp-deployment.yaml          |  9 ++++-
 config/charts/inferencepool/values.yaml    |  1 +
 mkdocs.yml                                 |  4 +-
 .../gateways.md}                           |  2 +-
 site-src/implementations/model-servers.md  | 38 +++++++++++++++++++
 6 files changed, 64 insertions(+), 4 deletions(-)
 rename site-src/{implementations.md => implementations/gateways.md} (99%)
 create mode 100644 site-src/implementations/model-servers.md

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index e5468cd4..42230e08 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -2,7 +2,6 @@
 
 A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.
 
-
 ## Install
 
 To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
@@ -23,6 +22,18 @@ $ helm install vllm-llama3-8b-instruct \
   --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
 Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
+### Install for Triton TensorRT-LLM
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
+
+```txt
+$ helm install triton-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \
+  --set inferencePool.modelServerType=triton-tensorrt-llm \
+  --set provider.name=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
 ## Uninstall
 Run the following command to uninstall the chart:
@@ -38,6 +49,7 @@ The following table list the configurable parameters of the chart.
 | **Parameter Name** | **Description** |
 |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
 | `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
+| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm |
 | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
 | `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
 | `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |
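Note: the `--set` flags in the Triton install example above can also be collected into a Helm values file. A minimal sketch, assuming only the chart keys listed in the configuration table; the file name, release label, and provider choice are placeholders to adapt, not values taken from the chart:

```yaml
# values-triton.yaml -- hypothetical override file mirroring the --set flags above
inferencePool:
  modelServerType: triton-tensorrt-llm   # chart default is vllm
  modelServers:
    matchLabels:
      app: triton-llama3-8b-instruct     # placeholder label; must match your model server pods
provider:
  name: none                             # or gke, per the chart's provider options
```

It would be passed with `helm install triton-llama3-8b-instruct -f values-triton.yaml` against the same OCI chart reference and version shown above, in place of the individual `--set` arguments.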
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
index 0b9fa0bd..fc490210 100644
--- a/config/charts/inferencepool/templates/epp-deployment.yaml
+++ b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -35,6 +35,14 @@ spec:
         - "9003"
         - -metricsPort
         - "9090"
+        {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
+        - -totalQueuedRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=waiting}"
+        - -kvCacheUsagePercentageMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+        - -loraInfoMetric
+        - "" # Set an empty metric to disable LoRA metric scraping, as LoRA metrics are not supported by Triton yet.
+        {{- end }}
         ports:
         - name: grpc
           containerPort: 9002
@@ -54,4 +62,3 @@ spec:
             service: inference-extension
         initialDelaySeconds: 5
         periodSeconds: 10
-
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
index 766ee087..bd48f37e 100644
--- a/config/charts/inferencepool/values.yaml
+++ b/config/charts/inferencepool/values.yaml
@@ -9,6 +9,7 @@ inferenceExtension:
 
 inferencePool:
   targetPortNumber: 8000
+  modelServerType: vllm # vllm, triton-tensorrt-llm
   # modelServers: # REQUIRED
   #  matchLabels:
   #    app: vllm-llama3-8b-instruct
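Note: to illustrate the effect of the template conditional above, with `inferencePool.modelServerType` left at its `vllm` default the EPP args are rendered unchanged, while setting it to `triton-tensorrt-llm` appends the three metric flag pairs. A sketch of the tail of the rendered container args in the Triton case; the preceding flags come from the unconditional part of the template and are elided here:

```yaml
# Sketch of the rendered EPP container args when modelServerType is triton-tensorrt-llm
args:
  # ...flags rendered unconditionally by the template (gRPC port, health port, metrics port, etc.)...
  - -totalQueuedRequestsMetric
  - "nv_trt_llm_request_metrics{request_type=waiting}"
  - -kvCacheUsagePercentageMetric
  - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
  - -loraInfoMetric
  - ""   # empty metric name disables LoRA metric scraping; Triton does not expose LoRA metrics yet
```

Running `helm template` with `--set inferencePool.modelServerType=triton-tensorrt-llm` is a quick way to confirm the rendered output.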
diff --git a/mkdocs.yml b/mkdocs.yml
index b67cf8b4..bdfffe05 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -54,7 +54,9 @@ nav:
     API Overview: concepts/api-overview.md
     Conformance: concepts/conformance.md
     Roles and Personas: concepts/roles-and-personas.md
-  - Implementations: implementations.md
+  - Implementations:
+    - Gateways: implementations/gateways.md
+    - Model Servers: implementations/model-servers.md
   - FAQ: faq.md
   - Guides:
     - User Guides:
diff --git a/site-src/implementations.md b/site-src/implementations/gateways.md
similarity index 99%
rename from site-src/implementations.md
rename to site-src/implementations/gateways.md
index dc15b297..d4e919be 100644
--- a/site-src/implementations.md
+++ b/site-src/implementations/gateways.md
@@ -1,4 +1,4 @@
-# Implementations
+# Gateway Implementations
 
 This project has several implementations that are planned or in progress:
 
diff --git a/site-src/implementations/model-servers.md b/site-src/implementations/model-servers.md
new file mode 100644
index 00000000..3d475aaa
--- /dev/null
+++ b/site-src/implementations/model-servers.md
@@ -0,0 +1,38 @@
+
+
+# Supported Model Servers
+
+Any model server that conforms to the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol) is supported by the inference extension.
+
+## Compatible Model Server Versions
+
+| Model Server | Version | Commit | Notes |
+| -------------------- | ---------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| vLLM V0 | v0.6.4 and above | [commit 0ad216f](https://github.com/vllm-project/vllm/commit/0ad216f5750742115c686723bf38698372d483fd) | |
+| vLLM V1 | v0.8.0 and above | [commit bc32bc7](https://github.com/vllm-project/vllm/commit/bc32bc73aad076849ac88565cff745b01b17d89c) | |
+| Triton (TensorRT-LLM) | [25.03](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-25-03.html#rel-25-03) and above | [commit 15cb989](https://github.com/triton-inference-server/tensorrtllm_backend/commit/15cb989b00523d8e92dce5165b9b9846c047a70d) | The LoRA affinity feature is not available, as the required LoRA metrics haven't been implemented in Triton yet. |
+
+## vLLM
+
+vLLM is configured as the default in the [endpoint picker extension](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp). No further configuration is required.
+
+## Triton with TensorRT-LLM Backend
+
+Triton-specific metric names need to be specified when starting the EPP.
+
+### Option 1: Use Helm
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the [`inferencepool` via helm](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool). See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool/README.md) for more details.
+
+### Option 2: Edit EPP deployment yaml
+
+Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32):
+
+```
+- -totalQueuedRequestsMetric
+- "nv_trt_llm_request_metrics{request_type=waiting}"
+- -kvCacheUsagePercentageMetric
+- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+- -loraInfoMetric
+- "" # Set an empty metric to disable LoRA metric scraping, as LoRA metrics are not supported by Triton yet.
+```
\ No newline at end of file
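Note: the Option 2 flags in `model-servers.md` above override the EPP's defaults, which are tuned for vLLM. For contrast, a sketch of what the equivalent explicit flags might look like for a vLLM pool; the metric names here are assumptions based on vLLM's exported Prometheus metrics and may differ across versions, so they are illustrative only:

```yaml
# Hypothetical explicit form of the EPP defaults for vLLM (normally not needed)
args:
  - -totalQueuedRequestsMetric
  - "vllm:num_requests_waiting"     # assumed vLLM queue-depth gauge
  - -kvCacheUsagePercentageMetric
  - "vllm:gpu_cache_usage_perc"     # assumed vLLM KV-cache utilization gauge
  - -loraInfoMetric
  - "vllm:lora_requests_info"       # assumed vLLM LoRA info metric; Triton pools set this to "" instead
```

This is why the vLLM section above needs no further configuration: the defaults already point at vLLM's metric families.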
From fc65688082585b77e369610b03ab4e838d79b09c Mon Sep 17 00:00:00 2001
From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com>
Date: Fri, 11 Apr 2025 10:18:59 -0700
Subject: [PATCH 2/2] Update config/charts/inferencepool/README.md

---
 config/charts/inferencepool/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 42230e08..301e3d9c 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -49,7 +49,7 @@ The following table list the configurable parameters of the chart.
 | **Parameter Name** | **Description** |
 |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
 | `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
-| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm |
+| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. |
 | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
 | `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
 | `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |