
Commit cbf4f41

Authored by mc-nv, BenjaminBraunDev, nv-kmcgill53, ziqif-nv, and yinggeh
TPRD-1200: Update default branch post 25.03 (#8130)
Co-authored-by: BenjaminBraunDev <[email protected]>
Co-authored-by: Kyle McGill <[email protected]>
Co-authored-by: Ziqi Fan <[email protected]>
Co-authored-by: Yingge He <[email protected]>
Co-authored-by: Kris Hung <[email protected]>
Co-authored-by: richardhuo-nv <[email protected]>
Co-authored-by: Tanmay Verma <[email protected]>
Co-authored-by: Olga Andreeva <[email protected]>
Co-authored-by: Indrajit Bhosale <[email protected]>
1 parent 42811e0 commit cbf4f41

291 files changed, +1526 -9712 lines


Dockerfile.QA

+19 -12

@@ -61,6 +61,7 @@ RUN apt-get update && \
        python3-pip \
        python3-wheel \
        python3-setuptools \
+        python3-venv \
        rapidjson-dev \
        software-properties-common && \
    rm -rf /var/lib/apt/lists/*
@@ -74,12 +75,19 @@ RUN apt update -q=2 \
    && apt-get install -y --no-install-recommends cmake=3.28.3* cmake-data=3.28.3*

# Add inception_graphdef model to example repo
+# FIXME: This should be changed to using the fetch_models.sh script
+# in order to ensure the public facing docs are up-to-date.
WORKDIR /workspace/docs/examples/model_repository
-RUN mkdir -p inception_graphdef/1 && \
-    wget -O ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb.tar.gz \
-        https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz && \
-    (cd ${TRITONTMP_DIR} && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz) && \
-    mv ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb inception_graphdef/1/model.graphdef
+RUN mkdir -p model_repository/inception_onnx/1 && \
+    wget -O /tmp/inception_v3_2016_08_28_frozen.pb.tar.gz \
+        https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz && \
+    (cd /tmp && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz) && \
+    python3 -m venv tf2onnx && \
+    source ./tf2onnx/bin/activate && \
+    pip3 install "numpy<2" tensorflow tf2onnx && \
+    python3 -m tf2onnx.convert --graphdef /tmp/inception_v3_2016_08_28_frozen.pb --output inception_v3_onnx.model.onnx --inputs input:0 --outputs InceptionV3/Predictions/Softmax:0 && \
+    deactivate && \
+    mv inception_v3_onnx.model.onnx model_repository/inception_onnx/1/model.onnx

# Update the qa/ directory with test executables, models, etc.
WORKDIR /workspace
@@ -109,7 +117,7 @@ RUN mkdir -p qa/common && \
    cp -r docs/examples/model_repository/simple_identity qa/L0_grpc/models && \
    cp -r docs/examples/model_repository/simple_sequence qa/L0_grpc/models && \
    cp -r docs/examples/model_repository/simple_string qa/L0_grpc/models && \
-    cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \
+    cp -r docs/examples/model_repository/inception_onnx qa/L0_grpc/models && \
    mkdir qa/L0_grpc_state_cleanup/models && \
    cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \
    mkdir qa/L0_http/models && \
@@ -118,7 +126,7 @@ RUN mkdir -p qa/common && \
    cp -r docs/examples/model_repository/simple_identity qa/L0_http/models && \
    cp -r docs/examples/model_repository/simple_sequence qa/L0_http/models && \
    cp -r docs/examples/model_repository/simple_string qa/L0_http/models && \
-    cp -r docs/examples/model_repository/inception_graphdef qa/L0_http/models && \
+    cp -r docs/examples/model_repository/inception_onnx qa/L0_grpc/models && \
    mkdir qa/L0_https/models && \
    cp -r docs/examples/model_repository/simple qa/L0_https/models/. && \
    mkdir qa/L0_secure_grpc/models && \
@@ -149,21 +157,20 @@ RUN mkdir -p qa/common && \
    cp bin/triton_json_test qa/L0_json/. && \
    cp bin/backend_output_detail_test qa/L0_backend_output_detail/. && \
    cp -r deploy/mlflow-triton-plugin qa/L0_mlflow/. && \
-    cp bin/input_byte_size_test qa/L0_input_validation/. && \
-    cp -r docs/examples/model_repository/simple_identity qa/L0_input_validation/models
+    cp bin/input_byte_size_test qa/L0_input_validation/.

RUN mkdir -p qa/pkgs && \
    cp python/triton*.whl qa/pkgs/. && \
    cp -rf python/test/. qa/L0_python_api/.

RUN mkdir -p qa/L0_simple_ensemble/models/simple/1 && \
-    cp docs/examples/model_repository/simple/1/model.graphdef \
+    cp docs/examples/model_repository/simple/1/model.onnx \
        qa/L0_simple_ensemble/models/simple/1/. && \
    mkdir -p qa/L0_simple_ensemble/models/simple/2 && \
-    cp docs/examples/model_repository/simple/1/model.graphdef \
+    cp docs/examples/model_repository/simple/1/model.onnx \
        qa/L0_simple_ensemble/models/simple/2/. && \
    mkdir -p qa/L0_socket/models/simple/1 && \
-    cp docs/examples/model_repository/simple/1/model.graphdef \
+    cp docs/examples/model_repository/simple/1/model.onnx \
        qa/L0_socket/models/simple/1/.

RUN mkdir -p qa/L0_backend_identity/models && \
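
For reference, a quick way to confirm what the converted model exposes is to open it with onnxruntime and list its tensors. This is only an illustrative sketch, not part of the commit; it assumes onnxruntime is installed and that tf2onnx kept the TensorFlow tensor names passed to the convert command above.

```
# Illustrative check of the converted model (assumes `pip install onnxruntime`).
import onnxruntime as ort

sess = ort.InferenceSession("model_repository/inception_onnx/1/model.onnx")
# tf2onnx normally preserves the TF names, so these should echo the --inputs/--outputs
# values used above (e.g. "input:0" and "InceptionV3/Predictions/Softmax:0");
# verify rather than assume.
print([(i.name, i.shape, i.type) for i in sess.get_inputs()])
print([(o.name, o.shape, o.type) for o in sess.get_outputs()])
```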

Dockerfile.sdk

+1 -1

@@ -29,7 +29,7 @@
#

# Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.02-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.03-py3-min

ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo

TRITON_VERSION

+1 -1

@@ -1 +1 @@
-2.56.0dev
+2.57.0dev

build.py

+14 -46

@@ -71,14 +71,14 @@
#

DEFAULT_TRITON_VERSION_MAP = {
-    "release_version": "2.56.0dev",
-    "triton_container_version": "25.03dev",
-    "upstream_container_version": "25.02",
-    "ort_version": "1.20.1",
+    "release_version": "2.57.0dev",
+    "triton_container_version": "25.04dev",
+    "upstream_container_version": "25.03",
+    "ort_version": "1.21.0",
    "ort_openvino_version": "2025.0.0",
    "standalone_openvino_version": "2025.0.0",
    "dcgm_version": "3.3.6",
-    "vllm_version": "0.7.0",
+    "vllm_version": "0.7.3",
    "rhel_py_version": "3.12.3",
}

@@ -562,8 +562,6 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
        args = onnxruntime_cmake_args(images, library_paths)
    elif be == "openvino":
        args = openvino_cmake_args()
-    elif be == "tensorflow":
-        args = tensorflow_cmake_args(images, library_paths)
    elif be == "python":
        args = python_cmake_args()
    elif be == "dali":
@@ -795,23 +793,6 @@ def tensorrt_cmake_args():
    return cargs


-def tensorflow_cmake_args(images, library_paths):
-    backend_name = "tensorflow"
-    extra_args = []
-
-    # If a specific TF image is specified use it, otherwise pull from NGC.
-    if backend_name in images:
-        image = images[backend_name]
-    else:
-        image = "nvcr.io/nvidia/tensorflow:{}-tf2-py3".format(
-            FLAGS.upstream_container_version
-        )
-    extra_args = [
-        cmake_backend_arg(backend_name, "TRITON_TENSORFLOW_DOCKER_IMAGE", None, image)
-    ]
-    return extra_args
-
-
def dali_cmake_args():
    return [
        cmake_backend_enable("dali", "TRITON_DALI_SKIP_DOWNLOAD", False),
@@ -1233,10 +1214,10 @@ def create_dockerfile_linux(
        argmap["BASE_IMAGE"],
    )

-    # PyTorch and TensorFlow backends need extra CUDA and other
+    # PyTorch backends need extra CUDA and other
    # dependencies during runtime that are missing in the CPU-only base container.
    # These dependencies must be copied from the Triton Min image.
-    if not FLAGS.enable_gpu and (("pytorch" in backends) or ("tensorflow" in backends)):
+    if not FLAGS.enable_gpu and ("pytorch" in backends):
        df += """
############################################################################
## Triton Min image
@@ -1602,10 +1583,10 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
        cuda_arch=cuda_arch, libs_arch=libs_arch
    )

-    if ("pytorch" in backends) or ("tensorflow" in backends):
-        # Add NCCL dependency for tensorflow/pytorch backend.
+    if "pytorch" in backends:
+        # Add NCCL dependency for pytorch backend.
        # Note: Even though the build is CPU-only, the version of
-        # tensorflow/pytorch we are using depends upon the NCCL library.
+        # pytorch we are using depends upon the NCCL library.
        # Since this dependency is not present in the ubuntu base image,
        # we must copy it from the Triton min container ourselves.
        df += """
@@ -1720,11 +1701,10 @@ def create_build_dockerfiles(
    }

    # For CPU-only image we need to copy some cuda libraries and dependencies
-    # since we are using PyTorch and TensorFlow containers that
-    # are not CPU-only.
+    # since we are using PyTorch containers that are not CPU-only.
    if (
        not FLAGS.enable_gpu
-        and (("pytorch" in backends) or ("tensorflow" in backends))
+        and ("pytorch" in backends)
        and (target_platform() != "windows")
    ):
        if "gpu-base" in images:
@@ -2351,7 +2331,6 @@ def enable_all():
        "identity",
        "square",
        "repeat",
-        "tensorflow",
        "onnxruntime",
        "python",
        "dali",
@@ -2586,7 +2565,7 @@ def enable_all():
        "--image",
        action="append",
        required=False,
-        help='Use specified Docker image in build as <image-name>,<full-image-name>. <image-name> can be "base", "gpu-base", "tensorflow", or "pytorch".',
+        help='Use specified Docker image in build as <image-name>,<full-image-name>. <image-name> can be "base", "gpu-base", or "pytorch".',
    )

    parser.add_argument(
@@ -2887,12 +2866,6 @@ def enable_all():
        parts = be.split(":")
        if len(parts) == 1:
            parts.append(default_repo_tag)
-        if parts[0] == "tensorflow1":
-            fail(
-                "Starting from Triton version 23.04, support for TensorFlow 1 has been discontinued. Please switch to Tensorflow 2."
-            )
-        if parts[0] == "tensorflow2":
-            parts[0] = "tensorflow"
        log('backend "{}" at tag/branch "{}"'.format(parts[0], parts[1]))
        backends[parts[0]] = parts[1]

@@ -2939,13 +2912,10 @@ def enable_all():
            len(parts) != 2, "--image must specify <image-name>,<full-image-registry>"
        )
        fail_if(
-            parts[0]
-            not in ["base", "gpu-base", "pytorch", "tensorflow", "tensorflow2"],
+            parts[0] not in ["base", "gpu-base", "pytorch"],
            "unsupported value for --image",
        )
        log('image "{}": "{}"'.format(parts[0], parts[1]))
-        if parts[0] == "tensorflow2":
-            parts[0] = "tensorflow"
        images[parts[0]] = parts[1]

    # Initialize map of library paths for each backend.
@@ -2954,8 +2924,6 @@ def enable_all():
        parts = lpath.split(":")
        if len(parts) == 2:
            log('backend "{}" library path "{}"'.format(parts[0], parts[1]))
-            if parts[0] == "tensorflow2":
-                parts[0] = "tensorflow"
            library_paths[parts[0]] = parts[1]

    # Parse any explicitly specified cmake arguments
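
With the TensorFlow aliases gone, the backend-spec handling in the last hunks reduces to a plain split. The sketch below is standalone and only illustrates that behaviour; the function name and the "main" default tag are placeholders, not build.py code.

```
# Minimal sketch of the backend-spec parsing left after this commit: no tensorflow1
# error and no tensorflow2 -> tensorflow remapping, just "<name>[:<tag>]" splitting.
def parse_backend_spec(spec, default_repo_tag="main"):
    parts = spec.split(":")
    if len(parts) == 1:
        parts.append(default_repo_tag)
    return parts[0], parts[1]

assert parse_backend_spec("onnxruntime") == ("onnxruntime", "main")
assert parse_backend_spec("python:r25.03") == ("python", "r25.03")
# "tensorflow2" is no longer recognized or remapped; it would now be treated
# as a literal (unknown) backend name.
```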

compose.py

+7 -15

@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -71,14 +71,10 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends):
        argmap["TRITON_VERSION"], argmap["TRITON_CONTAINER_VERSION"], images["full"]
    )

-    # PyTorch, TensorFlow backends need extra CUDA and other
+    # PyTorch backends need extra CUDA and other
    # dependencies during runtime that are missing in the CPU-only base container.
    # These dependencies must be copied from the Triton Min image.
-    if not FLAGS.enable_gpu and (
-        ("pytorch" in backends)
-        or ("tensorflow" in backends)
-        or ("tensorflow2" in backends)
-    ):
+    if not FLAGS.enable_gpu and "pytorch" in backends:
        df += """
FROM {} AS min_container

@@ -302,7 +298,7 @@ def create_argmap(images, skip_pull):
    dcgm_ver = re.search("DCGM_VERSION=([\S]{4,}) ", vars)
    dcgm_version = ""
    if dcgm_ver is None:
-        dcgm_version = "2.2.3"
+        dcgm_version = "3.3.6"
        log(
            "WARNING: DCGM version not found from image, installing the earlierst version {}".format(
                dcgm_version
@@ -406,7 +402,7 @@ def create_argmap(images, skip_pull):
        '<image-name>,<full-image-name>. <image-name> can be "min", "gpu-min" '
        'or "full". Both "min" and "full" need to be specified at the same time.'
        'This will override "--container-version". "gpu-min" is needed for '
-        "CPU-only container to copy TensorFlow and PyTorch deps.",
+        "CPU-only container to copy PyTorch deps.",
    )
    parser.add_argument(
        "--enable-gpu",
@@ -504,13 +500,9 @@ def create_argmap(images, skip_pull):
    fail_if(len(images) < 2, "Need to specify both 'full' and 'min' images if at all")

    # For CPU-only image we need to copy some cuda libraries and dependencies
-    # since we are using PyTorch, TensorFlow 1, TensorFlow 2 containers that
+    # since we are using PyTorch containers that
    # are not CPU-only.
-    if (
-        ("pytorch" in FLAGS.backend)
-        or ("tensorflow" in FLAGS.backend)
-        or ("tensorflow2" in FLAGS.backend)
-    ) and ("gpu-min" not in images):
+    if ("pytorch" in FLAGS.backend) and ("gpu-min" not in images):
        images["gpu-min"] = "nvcr.io/nvidia/tritonserver:{}-py3-min".format(
            FLAGS.container_version
        )
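
The DCGM hunk only changes the fallback used when the version cannot be parsed from the full image. Below is a standalone sketch of that lookup; `env_text` is a made-up stand-in for the image metadata that compose.py actually inspects.

```
# Standalone sketch of the DCGM-version fallback after this commit.
import re

env_text = "... DCGM_VERSION=3.3.6 ..."  # stand-in for the inspected image variables
match = re.search(r"DCGM_VERSION=([\S]{4,}) ", env_text)
dcgm_version = match.group(1) if match else "3.3.6"  # fallback bumped from 2.2.3 to 3.3.6
print(dcgm_version)  # -> 3.3.6
```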

deploy/alibaba-cloud/README.md

+5 -5

@@ -1,5 +1,5 @@
<!--
-# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -39,7 +39,7 @@ This repository contains information about how to deploy NVIDIA Triton Inference
- EAS provides a simple way for deep learning developers to deploy their models in Alibaba Cloud.
- Using **Triton Processor** is the recommended way on EAS to deploy Triton Inference Server. Users can simply deploy a Triton Server by preparing models and creating a EAS service by setting processor type to `triton`.
- Models should be uploaded to Alibaba Cloud's OSS(Object Storage Service). User's model repository in OSS will be mounted onto local path visible to Triton Server.
-- This documentation uses Triton's own example models for demo. The tensorflow inception model can be downloaded by the `fetch_models.sh` script.
+- This documentation uses Triton's own example models for demo. The ONNX inception v3 model can be obtained by the `fetch_models.sh` script.

# Prerequisites
- You should register an Alibaba Cloud Account, and being able to use EAS by [eascmd](https://help.aliyun.com/document_detail/111031.html?spm=a2c4g.11186623.6.752.42356f46FN5fU1), which is a command line tool to create stop or scale services on EAS.
@@ -48,10 +48,10 @@ This repository contains information about how to deploy NVIDIA Triton Inference

# Demo Instruction
## Prepare a model repo directory in OSS
-Download the tensorflow inception model via [fetch_model.sh](https://github.com/triton-inference-server/server/blob/main/docs/examples/fetch_models.sh). Then using [ossutil](https://help.aliyun.com/document_detail/50452.html?spm=a2c4g.11186623.6.833.26d66d51dPEytI) , which is a command line tool to use OSS, to upload the model to a certain OSS dir as you want.
+Download the ONNX inception v3 model via [fetch_model.sh](https://github.com/triton-inference-server/server/blob/main/docs/examples/fetch_models.sh). Then using [ossutil](https://help.aliyun.com/document_detail/50452.html?spm=a2c4g.11186623.6.833.26d66d51dPEytI) , which is a command line tool to use OSS, to upload the model to a certain OSS dir as you want.

```
-./ossutil cp inception_graphdef/ oss://triton-model-repo/models
+./ossutil cp inception_v3_onnx/ oss://triton-model-repo/models
```
## Create Triton Service with json config by eascmd
The following is the json we use when creating a Triton Server on EAS.
@@ -125,7 +125,7 @@ triton_client = httpclient.InferenceServerClient(url=URL, verbose=False)
start = time.time()
for i in range(10):
    results = triton_client.infer(
-        "inception_graphdef", inputs=[input_img], outputs=[output], headers=HEADERS
+        "inception_v3_onnx", inputs=[input_img], outputs=[output], headers=HEADERS
    )
    res_body = results.get_response()
    elapsed_ms = (time.time() - start) * 1000
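
With the example model renamed from `inception_graphdef` to `inception_v3_onnx`, a complete client call looks roughly like the sketch below. The URL, tensor names, and 1x299x299x3 FP32 shape are assumptions carried over from the tf2onnx conversion in Dockerfile.QA, not values confirmed by this commit; check the deployed model's configuration before relying on them.

```
# Hedged sketch of the updated client call; tensor names and shape are assumed.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

img = np.random.rand(1, 299, 299, 3).astype(np.float32)  # stand-in for a preprocessed image
inp = httpclient.InferInput("input:0", list(img.shape), "FP32")
inp.set_data_from_numpy(img)
out = httpclient.InferRequestedOutput("InceptionV3/Predictions/Softmax:0")

result = client.infer("inception_v3_onnx", inputs=[inp], outputs=[out])
print(result.as_numpy("InceptionV3/Predictions/Softmax:0").shape)
```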

deploy/aws/README.md

+2 -2

@@ -1,5 +1,5 @@
<!--
-# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -224,7 +224,7 @@ using image classification models being served by the inference
server. For example,

```
-$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+$ image_client -u 34.83.9.133:8000 -m inception_v3_onnx -s INCEPTION -c3 mug.jpg
Request 0, batch size 1
Image 'images/mug.jpg':
    504 (COFFEE MUG) = 0.723992

deploy/aws/values.yaml

+1 -1

@@ -27,7 +27,7 @@
replicaCount: 1

image:
-  imageName: nvcr.io/nvidia/tritonserver:25.02-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.03-py3
  pullPolicy: IfNotPresent
  modelRepositoryPath: s3://triton-inference-server-repository/model_repository
  numGpus: 1

deploy/fleetcommand/Chart.yaml

+1 -1

@@ -26,7 +26,7 @@

apiVersion: v1
# appVersion is the Triton version; update when changing release
-appVersion: "2.55.0"
+appVersion: "2.56.0"
description: Triton Inference Server (Fleet Command)
name: triton-inference-server
# version is the Chart version; update when changing anything in the chart

deploy/fleetcommand/values.yaml

+3 -3

@@ -27,7 +27,7 @@
replicaCount: 1

image:
-  imageName: nvcr.io/nvidia/tritonserver:25.02-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.03-py3
  pullPolicy: IfNotPresent
  numGpus: 1
  serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
  #
  # To set model control mode, uncomment and configure below
  # TODO: Fix the following url, it is invalid
-  # See https://github.com/triton-inference-server/server/blob/r25.02/docs/model_management.md
+  # See https://github.com/triton-inference-server/server/blob/r25.03/docs/model_management.md
  # for more details
  #- --model-control-mode=explicit|poll|none
  #
  # Additional server args
  #
-  # see https://github.com/triton-inference-server/server/blob/r25.02/README.md
+  # see https://github.com/triton-inference-server/server/blob/r25.03/README.md
  # for more details

service:
