From 4fd6ab6ceef0c48a123a51c193a2dc9597031dda Mon Sep 17 00:00:00 2001
From: Cong Liu
Date: Wed, 19 Mar 2025 10:39:12 -0700
Subject: [PATCH 1/2] Document model server compatibility and config options

---
 config/charts/inferencepool/README.md      | 14 ++++++-
 .../templates/epp-deployment.yaml          |  9 ++++-
 config/charts/inferencepool/values.yaml    |  1 +
 mkdocs.yml                                 |  4 +-
 .../gateways.md}                           |  2 +-
 site-src/implementations/model-servers.md  | 38 +++++++++++++++++++
 6 files changed, 64 insertions(+), 4 deletions(-)
 rename site-src/{implementations.md => implementations/gateways.md} (99%)
 create mode 100644 site-src/implementations/model-servers.md

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index e5468cd4..42230e08 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -2,7 +2,6 @@
 
 A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.
 
-
 ## Install
 
 To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
@@ -23,6 +22,18 @@ $ helm install vllm-llama3-8b-instruct \
   --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
 Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
+### Install for Triton TensorRT-LLM
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
+
+```txt
+$ helm install triton-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \
+  --set inferencePool.modelServerType=triton-tensorrt-llm \
+  --set provider.name=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
 ## Uninstall
 Run the following command to uninstall the chart:
@@ -38,6 +49,7 @@ The following table list the configurable parameters of the chart.
 | **Parameter Name** | **Description** |
 |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
 | `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
+| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm |
 | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
 | `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
 | `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |
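Note: the `--set` flags in the Triton install example above can also be collected into a Helm values file. A minimal sketch, assuming only the chart keys listed in the configuration table; the file name, release label, and provider choice are placeholders to adapt, not values taken from the chart:

```yaml
# values-triton.yaml -- hypothetical override file mirroring the --set flags above
inferencePool:
  modelServerType: triton-tensorrt-llm   # chart default is vllm
  modelServers:
    matchLabels:
      app: triton-llama3-8b-instruct     # placeholder label; must match your model server pods
provider:
  name: none                             # or gke, per the chart's provider options
```

It would be passed with `helm install triton-llama3-8b-instruct -f values-triton.yaml` against the same OCI chart reference and version shown above, in place of the individual `--set` arguments.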
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
index 0b9fa0bd..fc490210 100644
--- a/config/charts/inferencepool/templates/epp-deployment.yaml
+++ b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -35,6 +35,14 @@ spec:
         - "9003"
         - -metricsPort
         - "9090"
+        {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
+        - -totalQueuedRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=waiting}"
+        - -kvCacheUsagePercentageMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+        - -loraInfoMetric
+        - "" # Set an empty metric to disable LoRA metric scraping, as LoRA metrics are not supported by Triton yet.
+        {{- end }}
         ports:
         - name: grpc
           containerPort: 9002
@@ -54,4 +62,3 @@ spec:
             service: inference-extension
         initialDelaySeconds: 5
         periodSeconds: 10
-
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
index 766ee087..bd48f37e 100644
--- a/config/charts/inferencepool/values.yaml
+++ b/config/charts/inferencepool/values.yaml
@@ -9,6 +9,7 @@ inferenceExtension:
 
 inferencePool:
   targetPortNumber: 8000
+  modelServerType: vllm # vllm, triton-tensorrt-llm
   # modelServers: # REQUIRED
   #  matchLabels:
   #    app: vllm-llama3-8b-instruct
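Note: to illustrate the effect of the template conditional above, with `inferencePool.modelServerType` left at its `vllm` default the EPP args are rendered unchanged, while setting it to `triton-tensorrt-llm` appends the three metric flag pairs. A sketch of the tail of the rendered container args in the Triton case; the preceding flags come from the unconditional part of the template and are elided here:

```yaml
# Sketch of the rendered EPP container args when modelServerType is triton-tensorrt-llm
args:
  # ...flags rendered unconditionally by the template (gRPC port, health port, metrics port, etc.)...
  - -totalQueuedRequestsMetric
  - "nv_trt_llm_request_metrics{request_type=waiting}"
  - -kvCacheUsagePercentageMetric
  - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
  - -loraInfoMetric
  - ""   # empty metric name disables LoRA metric scraping; Triton does not expose LoRA metrics yet
```

Running `helm template` with `--set inferencePool.modelServerType=triton-tensorrt-llm` is a quick way to confirm the rendered output.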
diff --git a/mkdocs.yml b/mkdocs.yml
index b67cf8b4..bdfffe05 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -54,7 +54,9 @@ nav:
     API Overview: concepts/api-overview.md
     Conformance: concepts/conformance.md
     Roles and Personas: concepts/roles-and-personas.md
-  - Implementations: implementations.md
+  - Implementations:
+    - Gateways: implementations/gateways.md
+    - Model Servers: implementations/model-servers.md
   - FAQ: faq.md
   - Guides:
     - User Guides:
diff --git a/site-src/implementations.md b/site-src/implementations/gateways.md
similarity index 99%
rename from site-src/implementations.md
rename to site-src/implementations/gateways.md
index dc15b297..d4e919be 100644
--- a/site-src/implementations.md
+++ b/site-src/implementations/gateways.md
@@ -1,4 +1,4 @@
-# Implementations
+# Gateway Implementations
 
 This project has several implementations that are planned or in progress:
 
diff --git a/site-src/implementations/model-servers.md b/site-src/implementations/model-servers.md
new file mode 100644
index 00000000..3d475aaa
--- /dev/null
+++ b/site-src/implementations/model-servers.md
@@ -0,0 +1,38 @@
+
+
+# Supported Model Servers
+
+Any model server that conforms to the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol) is supported by the inference extension.
+
+## Compatible Model Server Versions
+
+| Model Server | Version | Commit | Notes |
+| -------------------- | ---------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| vLLM V0 | v0.6.4 and above | [commit 0ad216f](https://github.com/vllm-project/vllm/commit/0ad216f5750742115c686723bf38698372d483fd) | |
+| vLLM V1 | v0.8.0 and above | [commit bc32bc7](https://github.com/vllm-project/vllm/commit/bc32bc73aad076849ac88565cff745b01b17d89c) | |
+| Triton (TensorRT-LLM) | [25.03](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-25-03.html#rel-25-03) and above | [commit 15cb989](https://github.com/triton-inference-server/tensorrtllm_backend/commit/15cb989b00523d8e92dce5165b9b9846c047a70d) | The LoRA affinity feature is not available, as the required LoRA metrics haven't been implemented in Triton yet. |
+
+## vLLM
+
+vLLM is configured as the default in the [endpoint picker extension](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp). No further configuration is required.
+
+## Triton with TensorRT-LLM Backend
+
+Triton-specific metric names need to be specified when starting the EPP.
+
+### Option 1: Use Helm
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the [`inferencepool` via helm](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool). See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool/README.md) for more details.
+
+### Option 2: Edit EPP deployment yaml
+
+Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32):
+
+```
+- -totalQueuedRequestsMetric
+- "nv_trt_llm_request_metrics{request_type=waiting}"
+- -kvCacheUsagePercentageMetric
+- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+- -loraInfoMetric
+- "" # Set an empty metric to disable LoRA metric scraping, as LoRA metrics are not supported by Triton yet.
+```
\ No newline at end of file
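Note: the Option 2 flags in `model-servers.md` above override the EPP's defaults, which are tuned for vLLM. For contrast, a sketch of what the equivalent explicit flags might look like for a vLLM pool; the metric names here are assumptions based on vLLM's exported Prometheus metrics and may differ across versions, so they are illustrative only:

```yaml
# Hypothetical explicit form of the EPP defaults for vLLM (normally not needed)
args:
  - -totalQueuedRequestsMetric
  - "vllm:num_requests_waiting"     # assumed vLLM queue-depth gauge
  - -kvCacheUsagePercentageMetric
  - "vllm:gpu_cache_usage_perc"     # assumed vLLM KV-cache utilization gauge
  - -loraInfoMetric
  - "vllm:lora_requests_info"       # assumed vLLM LoRA info metric; Triton pools set this to "" instead
```

This is why the vLLM section above needs no further configuration: the defaults already point at vLLM's metric families.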
From fc65688082585b77e369610b03ab4e838d79b09c Mon Sep 17 00:00:00 2001
From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com>
Date: Fri, 11 Apr 2025 10:18:59 -0700
Subject: [PATCH 2/2] Update config/charts/inferencepool/README.md

---
 config/charts/inferencepool/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 42230e08..301e3d9c 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -49,7 +49,7 @@ The following table list the configurable parameters of the chart.
 | **Parameter Name** | **Description** |
 |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
 | `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
-| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm |
+| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. |
 | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
 | `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
 | `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |