diff --git a/docker/dockerfiles/Dockerfile.onnx.gpu b/docker/dockerfiles/Dockerfile.onnx.gpu
index ceb0c495a1..668c6edcb1 100644
--- a/docker/dockerfiles/Dockerfile.onnx.gpu
+++ b/docker/dockerfiles/Dockerfile.onnx.gpu
@@ -1,5 +1,4 @@
-FROM nvcr.io/nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 as builder
-#has python 3.10
+FROM nvcr.io/nvidia/cuda:13.0.1-cudnn-devel-ubuntu24.04 as builder
 
 WORKDIR /app
 
@@ -14,8 +13,13 @@ RUN rm -rf /var/lib/apt/lists/* && apt-get clean && apt-get update -y && DEBIAN_
     wget \
     rustc \
     cargo \
+    curl \
     && rm -rf /var/lib/apt/lists/*
 
+# Install uv with the standalone installer (it installs to /root/.local/bin)
+RUN curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh && \
+    ln -s /root/.local/bin/uv /usr/local/bin/uv
+
 COPY requirements/requirements.sam.txt \
     requirements/requirements.clip.txt \
     requirements/requirements.http.txt \
@@ -30,13 +34,12 @@ COPY requirements/requirements.sam.txt \
     requirements/requirements.easyocr.txt \
     ./
 
-RUN python3 -m pip install -U pip
-RUN python3 -m pip install \
+# Use uv for much faster package installation (without onnxruntime-gpu; we build it from source below)
+RUN uv pip install --system --break-system-packages \
     -r _requirements.txt \
     -r requirements.sam.txt \
     -r requirements.clip.txt \
     -r requirements.http.txt \
-    -r requirements.gpu.txt \
     -r requirements.gaze.txt \
     -r requirements.groundingdino.txt \
     -r requirements.doctr.txt \
@@ -45,22 +48,83 @@ RUN python3 -m pip install \
     -r requirements.easyocr.txt \
     jupyterlab \
     "setuptools<=75.5.0" \
-    --upgrade \
-    && rm -rf ~/.cache/pip
+    packaging \
+    numpy \
+    && rm -rf ~/.cache/uv
+
+# Install build tools for ONNX Runtime
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    cmake \
+    ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+
+# Work around missing CCCL headers in CUDA 13.0: clone CCCL v3.0.0 and copy the
+# cuda/std, cub, and thrust headers into the toolkit's sbsa-linux include tree
+RUN git clone --depth=1 --branch v3.0.0 https://github.com/NVIDIA/cccl.git /tmp/cccl && \
+    mkdir -p /usr/local/cuda-13.0/targets/sbsa-linux/include/cccl && \
+    mkdir -p /usr/local/cuda-13.0/targets/sbsa-linux/include/cuda && \
+    cp -r /tmp/cccl/libcudacxx/include/* /usr/local/cuda-13.0/targets/sbsa-linux/include/ && \
+    cp -r /tmp/cccl/cub/cub /usr/local/cuda-13.0/targets/sbsa-linux/include/cccl/ && \
+    cp -r /tmp/cccl/thrust/thrust /usr/local/cuda-13.0/targets/sbsa-linux/include/cccl/ && \
+    ln -sf /usr/local/cuda-13.0/targets/sbsa-linux /usr/local/cuda/targets/sbsa-linux && \
+    rm -rf /tmp/cccl
+
+# Build ONNX Runtime from source for CUDA 13.0 (the main branch carries the latest CUDA 13 fixes)
+WORKDIR /tmp
+RUN git clone --recursive --branch main https://github.com/microsoft/onnxruntime.git /tmp/onnxruntime
+WORKDIR /tmp/onnxruntime
+
+# Build ONNX Runtime with CUDA 13, using a configuration reported to work in an upstream GitHub PR
+RUN ./build.sh \
+    --config Release \
+    --build_dir build/cuda13 \
+    --parallel 16 \
+    --use_cuda \
+    --cuda_version 13.0 \
+    --cuda_home /usr/local/cuda \
+    --cudnn_home /usr/local/cuda \
+    --build_wheel \
+    --build_shared_lib \
+    --skip_tests \
+    --cmake_generator Ninja \
+    --enable_cuda_nhwc_ops \
+    --use_binskim_compliant_compile_flags \
+    --allow_running_as_root \
+    --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="120-real;121-real;121-virtual" \
+    --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
+
+# Install the built ONNX Runtime wheel
+RUN uv pip install --system --break-system-packages /tmp/onnxruntime/build/cuda13/Release/dist/onnxruntime_gpu-*.whl
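+
+# Optional sanity check (a suggested addition, not part of the original patch): the list of
+# available providers reflects what was compiled in, so this assert works during
+# `docker build` even though no GPU is visible at image-build time.
+RUN python3 -c "import onnxruntime as ort; assert 'CUDAExecutionProvider' in ort.get_available_providers(), ort.get_available_providers()"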
+
+# Install GPU-enabled PyTorch 2.9.0 with CUDA 13.0 support
+RUN uv pip uninstall --system --break-system-packages torch torchvision torchaudio || true && \
+    uv pip install --system --break-system-packages torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
+
+# Remove any existing xformers install and clone a fresh copy
+RUN uv pip uninstall --system --break-system-packages xformers || true && \
+    rm -rf /tmp/xformers && \
+    git clone --recursive https://github.com/facebookresearch/xformers.git /tmp/xformers
 
-# Install setup.py requirements for flash_attn
-RUN python3 -m pip install packaging==24.1 && rm -rf ~/.cache/pip
+WORKDIR /tmp/xformers
+RUN MAX_JOBS=8 CMAKE_BUILD_PARALLEL_LEVEL=8 uv pip install --system --break-system-packages . --no-build-isolation -v
 
-# Install flash_attn required for Paligemma and Florence2
-RUN python3 -m pip install -r requirements.pali.flash_attn.txt --no-dependencies --no-build-isolation && rm -rf ~/.cache/pip
+ENV CMAKE_BUILD_PARALLEL_LEVEL=4
+ENV MAX_JOBS=4
+ENV SETUPTOOLS_BUILD_PARALLEL=1
+
+# Clone and build FlashAttention from source
+WORKDIR /tmp
+RUN git clone https://github.com/Dao-AILab/flash-attention.git
+WORKDIR /tmp/flash-attention
+RUN MAX_JOBS=4 CMAKE_BUILD_PARALLEL_LEVEL=4 uv pip install --system --break-system-packages . --no-build-isolation -v
 
-# Start runtime stage
-FROM nvcr.io/nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 as runtime
+################################################################################
+# RUNTIME STAGE
+################################################################################
+FROM nvcr.io/nvidia/cuda:13.0.1-cudnn-runtime-ubuntu24.04 as runtime
 
 WORKDIR /app
 
 # Copy Python and installed packages from builder
-COPY --from=builder /usr/local/lib/python3.10 /usr/local/lib/python3.10
+COPY --from=builder /usr/local/lib/python3.12 /usr/local/lib/python3.12
 COPY --from=builder /usr/local/bin /usr/local/bin
 
 # Install runtime dependencies
@@ -75,15 +139,52 @@ RUN rm -rf /var/lib/apt/lists/* && apt-get clean && apt-get update -y && DEBIAN_
     wget \
     rustc \
     cargo \
+    curl \
     && rm -rf /var/lib/apt/lists/*
 
+# uv was already copied over from the builder stage, so there is no need to reinstall it
+
 WORKDIR /build
 COPY . .
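+
+# Optional sanity check (a suggested addition, not part of the original patch): confirm the
+# site-packages copied from the builder resolve in the runtime image before building any
+# wheels. Importing torch and onnxruntime needs no GPU, only the bundled libraries.
+RUN python3 -c "import torch, onnxruntime; print(torch.__version__, torch.version.cuda, onnxruntime.__version__)"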
 RUN ln -s /usr/bin/python3 /usr/bin/python
-RUN /bin/make create_wheels_for_gpu_notebook
-RUN pip3 install --no-cache-dir dist/inference_cli*.whl dist/inference_core*.whl dist/inference_gpu*.whl dist/inference_sdk*.whl "setuptools<=75.5.0"
+
+# Build the wheels directly without upgrading pip (works around an issue with the Debian-installed pip)
+RUN python -m pip install --break-system-packages wheel twine requests && \
+    rm -f dist/* && \
+    python .release/pypi/inference.core.setup.py bdist_wheel && \
+    python .release/pypi/inference.gpu.setup.py bdist_wheel && \
+    python .release/pypi/inference.cli.setup.py bdist_wheel && \
+    python .release/pypi/inference.sdk.setup.py bdist_wheel
+
+# Install the GPU wheel first with --no-deps so it cannot pull in a PyPI onnxruntime-gpu over our source build
+RUN python -m pip install --break-system-packages --no-deps dist/inference_gpu*.whl
+
+# Then install the remaining wheels with dependency resolution enabled
+RUN python -m pip install --break-system-packages \
+    dist/inference_core*.whl \
+    dist/inference_cli*.whl \
+    dist/inference_sdk*.whl \
+    "setuptools<=75.5.0"
+
 WORKDIR /notebooks
+
+# Install Node.js 20.x and build the JupyterLab assets in the runtime container
+RUN apt-get update && \
+    apt-get install -y ca-certificates curl gnupg && \
+    mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
+    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \
+    apt-get update && \
+    apt-get install -y nodejs && \
+    node --version && \
+    jupyter lab build --minimize=False --dev-build=False && \
+    jupyter lab clean && \
+    npm cache clean --force && \
+    apt-get remove -y nodejs gnupg && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
 COPY examples/notebooks .
 
 WORKDIR /app/
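
# Suggested smoke test (not part of the patch): the ONNX Runtime wheel above is compiled
# only for SM 12.0/12.1 (Blackwell), so run the image on a matching GPU with a
# CUDA 13-capable driver, e.g.:
#   docker build -f docker/dockerfiles/Dockerfile.onnx.gpu -t inference-gpu-cu13 .
#   docker run --rm --gpus all inference-gpu-cu13 \
#     python -c "import onnxruntime as ort; print(ort.get_available_providers())"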