aws-samples · mvinci12 · Nov 19, 2025 · Nov 7, 2025 · Nov 7, 2025 · Nov 9, 2025
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "3.test_cases/post-training/rlvr/verl"] # same upstream URL that the rlvr Dockerfile clones at image build time
+	path = 3.test_cases/post-training/rlvr/verl
+	url = https://github.com/volcengine/verl.git
diff --git a/3.test_cases/post-training/rlvr/.gitignore b/3.test_cases/post-training/rlvr/.gitignore
@@ -0,0 +1,121 @@
+# Project-specific directories
+official-verl/
+debug-pod/
+nccl-tests/
+
+# Python bytecode, native extensions and packaging/build output
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Testing
+.pytest_cache/
+.coverage
+.coverage.*
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.tox/
+htmlcov/
+
+# Virtual environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# IDEs and editors
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+Thumbs.db
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# PyCharm (repeats the .idea/ entry already listed under "IDEs and editors"; harmless duplicate)
+.idea/
+
+# Logs
+*.log
+logs/
+*.out
+*.err
+
+# Ray (NOTE: the leading "/" anchors /tmp/ray/ to this .gitignore's directory, not the filesystem root)
+/tmp/ray/
+ray_results/
+
+# Model checkpoints and data
+checkpoints/
+*.ckpt
+*.pth
+*.pt
+*.bin
+*.safetensors
+models/
+data/
+*.parquet
+*.jsonl
+*.json.gz
+
+# Docker (NOTE(review): this keeps any .dockerignore in this tree out of version control — confirm that is intended)
+.dockerignore
+
+# Temporary files
+tmp/
+temp/
+*.tmp
+*.temp
+
+# OS generated files (.DS_Store and Thumbs.db are also listed under "IDEs and editors")
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Kubernetes
+*.yaml.bak
+*.yml.bak
+
+# Environment variables (if they contain secrets)
+.env.local
+.env.production
+.env.staging
+
+# Profiling
+outputs/profile/
+*.prof
+*.trace
+
+# CUDA compilation cache
+.nv_cache/
diff --git a/3.test_cases/post-training/rlvr/.gitmodules b/3.test_cases/post-training/rlvr/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "verl"] # NOTE(review): duplicates the repo-root .gitmodules entry; git reads .gitmodules from the top of the working tree, so this nested copy is presumably unused — confirm
+	path = verl
+	url = https://github.com/volcengine/verl.git
diff --git a/3.test_cases/post-training/rlvr/Dockerfile b/3.test_cases/post-training/rlvr/Dockerfile
@@ -0,0 +1,124 @@
+# Dockerfile for VERL with EFA support
+# Using hiyouga/verl base image and adding EFA capabilities
+FROM hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0
+
+# EFA configuration: EFA_VERSION selects the installer tarball downloaded below; OPEN_MPI_PATH is declared but not referenced later in this file
+ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
+ENV EFA_VERSION=1.43.3
+
+# System packages: Python 3.11, build tooling, SSH, and the libraries/utilities the EFA stack needs (kept alphabetical)
+RUN apt-get update && apt-get install -y \
+    autoconf \
+    build-essential \
+    curl \
+    dmidecode \
+    environment-modules \
+    ethtool \
+    git \
+    iproute2 \
+    libevent-dev \
+    libhwloc-dev \
+    libnl-3-dev \
+    libnl-route-3-dev \
+    ninja-build \
+    openssh-client \
+    openssh-server \
+    pciutils \
+    python3-pip \
+    python3.11 \
+    python3.11-dev \
+    tcl \
+    tcl-dev \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+# SSH: create sshd's runtime dir, set StrictHostKeyChecking to "no", send known hosts to /dev/null, and set sshd StrictModes to "no"
+RUN mkdir -p /var/run/sshd
+RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
+    echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+# Install systemd and udev (plain apt install; NOTE(review): nothing container-specific is configured here — confirm none is needed)
+RUN apt-get update && apt-get install -y \
+    systemd \
+    udev \
+    && rm -rf /var/lib/apt/lists/*
+
+
+# Upgrade pip, setuptools and wheel for the image's default python3 before any Python packages are installed
+RUN python3 -m pip install --upgrade pip setuptools wheel
+
+#################################################
+## Remove HPC-X and /usr/local/mpi, delete the hpcx ld.so.conf entry and refresh the linker cache, to avoid conflicts with EFA
+RUN rm -rf /opt/hpcx \
+    && rm -rf /usr/local/mpi \
+    && rm -f /etc/ld.so.conf.d/hpcx.conf \
+    && ldconfig
+
+#################################################
+## EFA SETUP - download (fail on HTTP errors), install without kernel module / limits.conf changes, then delete the installer in the same layer
+RUN cd $HOME \
+    && curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_VERSION}.tar.gz \
+    && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
+    && (cd aws-efa-installer && ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify) \
+    && rm -rf aws-efa-installer aws-efa-installer-${EFA_VERSION}.tar.gz
+
+# Prepend the EFA-provided OpenMPI, EFA and OFI-NCCL locations (plus CUDA) to PATH / LD_LIBRARY_PATH (order matters!)
+ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:$PATH"
+ENV LD_LIBRARY_PATH="/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
+# NOTE(review): /opt/nccl/build/lib above is not created anywhere in this file — presumably provided by the base image; confirm
+# OpenMPI configuration: exclude the UCX PML, restrict BTLs to tcp and self, skip loopback/docker/veth interfaces, and point OPAL at the EFA OpenMPI prefix
+ENV OMPI_MCA_pml=^ucx
+ENV OMPI_MCA_btl=tcp,self
+ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent
+ENV OPAL_PREFIX=/opt/amazon/openmpi
+
+# libfabric (FI_*) and NCCL settings: select the efa provider and load the OFI-NCCL tuner plugin from /opt/amazon/ofi-nccl
+ENV FI_PROVIDER=efa
+ENV FI_EFA_USE_DEVICE_RDMA=1
+ENV FI_EFA_FORK_SAFE=1
+ENV FI_EFA_ENABLE_SHM_TRANSFER=1
+ENV NCCL_PROTO=simple
+ENV NCCL_NET_GDR_LEVEL=LOC
+ENV NCCL_SOCKET_IFNAME=^docker,lo,veth
+ENV NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so
+ENV PMIX_MCA_gds=hash
+
+#################################################
+## Optional: build nccl-tests (MPI=1, against the EFA OpenMPI) in /opt/nccl-tests for verification; NOTE(review): NCCL_HOME=/opt/nccl/build is not created in this file — confirm the base image provides it
+RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
+    && cd /opt/nccl-tests \
+    && make -j $(nproc) MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/opt/nccl/build
+
+
+# Install core ML libraries (version specifiers are quoted: unquoted, the shell treats ">" as a redirect to a file named "=4.45.0" and drops the constraint)
+RUN pip install \
+    "transformers>=4.45.0" \
+    datasets \
+    accelerate \
+    tokenizers \
+    numpy \
+    scipy \
+    scikit-learn \
+    "vllm>=0.7.0" \
+    hydra-core \
+    omegaconf \
+    wandb \
+    tensorboard \
+    boto3 \
+    botocore \
+    tenacity \
+    s3torchconnector
+
+# Clone VERL into /workspace/verl from the upstream default branch (no tag or commit is pinned here)
+WORKDIR /workspace
+RUN git clone https://github.com/volcengine/verl.git
+WORKDIR /workspace/verl
+
+# Install VERL in development (editable) mode from the clone above
+RUN pip install -e .
+
+# Leave /workspace as the default working directory of the image
+WORKDIR /workspace
+
+# Expose Ray ports (presumably dashboard 8265, client server 10001, GCS 6379 — confirm against the RayCluster spec)
+EXPOSE 8265 10001 6379
diff --git a/3.test_cases/post-training/rlvr/README.md b/3.test_cases/post-training/rlvr/README.md
@@ -0,0 +1,73 @@
+# rlvr-recipe
+
+This repository provides a complete setup for running reinforcement learning from verifiable rewards (RLVR) on EKS clusters using Ray and verl. RLVR trains language models using verifiable rewards from math and coding tasks, where correctness can be automatically verified. The project uses verl, an efficient RL training framework from ByteDance, to run algorithms like GRPO (Group Relative Policy Optimization) and DAPO (Decoupled Clip and Dynamic sAmpling Policy Optimization) on distributed GPU clusters.
+
+## What is verl?
+
+[verl (Volcano Engine Reinforcement Learning)](https://github.com/volcengine/verl) is a flexible, production-ready RL training library for large language models. It provides seamless integration with popular frameworks like FSDP, Megatron-LM, vLLM, and Ray, enabling efficient distributed training with state-of-the-art throughput. This repo includes the full verl codebase with custom run scripts optimized for HyperPod.
+
+## What is RLVR?
+
+[Reinforcement Learning from Verifiable Rewards (RLVR)](https://arxiv.org/abs/2506.14245) is a training approach where models learn from tasks with objectively verifiable outcomes, such as math problems or code execution. Unlike human preference-based RL, RLVR uses ground-truth correctness as the reward signal, making it particularly effective for reasoning tasks.
+
+## Getting started
+
+From here on out, we will assume you have an EKS cluster with GPU nodes (e.g., p5en.48xlarge).
+
+### Clone this repo
+```bash
+git clone https://github.com/aws-samples/awsome-distributed-training.git
+cd awsome-distributed-training/3.test_cases/post-training/rlvr
+```
+
+### Create RayCluster
+
+Install KubeRay operator to manage Ray clusters on Kubernetes:
+```bash
+./setup/install-kuberay.sh
+```
+
+Configure your cluster settings (AWS region, cluster name, GPU counts, model paths):
+```bash
+vim setup/env_vars
+```
+
+Load the environment variables into your shell session:
+```bash
+source setup/env_vars
+```
+
+Build a Docker image with verl, EFA networking support, and push to ECR:
+```bash
+./setup/build-push.sh
+```
+
+Deploy the Ray cluster with head and worker pods configured for distributed training:
+```bash
+envsubst < setup/raycluster.yaml | kubectl apply -f -
+```
+
+Download the GSM8K math dataset and prepare it for GRPO training:
+```bash
+./setup/load_data_grpo.sh
+```
+
+Forward the Ray dashboard to localhost for monitoring training progress:
+```bash
+./ray-expose.sh
+```
+
+Submit a GRPO training job to the Ray cluster. This trains a language model on math reasoning using group relative policy optimization:
+```bash
+./recipe/run_grpo_configurable.sh
+```
+
+The `verl/` directory contains the official verl framework, and `recipe/` includes custom run scripts (`run_grpo_configurable.sh`, `run_dapo_configurable.sh`) that integrate with your environment variables for easy configuration.
+
+### Observability
+
+For EKS:
+Please see this documentation to set up Prometheus and Grafana dashboards for Ray clusters: [Using Prometheus & Grafana](https://docs.ray.io/en/latest/cluster/kubernetes/k8s-ecosystem/prometheus-grafana.html)
+
+For HyperPod EKS:
+Check out the `observability/` directory to integrate Ray's native metrics dashboards with HyperPod's Amazon Managed Prometheus and Grafana.
diff --git a/3.test_cases/post-training/rlvr/fsx/pvc.yaml b/3.test_cases/post-training/rlvr/fsx/pvc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: fsx-claim
+  namespace: default
+spec:
+  accessModes:
+    - ReadWriteMany
+  storageClassName: fsx-sc  # same name as metadata.name of the StorageClass in fsx/storageclass.yaml
+  resources:
+    requests:
+      storage: 1200Gi
diff --git a/3.test_cases/post-training/rlvr/fsx/storageclass.yaml b/3.test_cases/post-training/rlvr/fsx/storageclass.yaml
@@ -0,0 +1,16 @@
+kind: StorageClass
+apiVersion: storage.k8s.io/v1
+metadata:
+  name: fsx-sc  # referenced by storageClassName in fsx/pvc.yaml
+provisioner: fsx.csi.aws.com
+parameters:
+  subnetId: subnet-0d808f7937de12b4f  # NOTE(review): hard-coded, environment-specific ID — replace with a subnet from your own cluster's VPC
+  securityGroupIds: sg-0aa152ec598835c93  # NOTE(review): hard-coded, environment-specific ID — replace with your own security group
+  deploymentType: PERSISTENT_2
+  automaticBackupRetentionDays: "0"
+  copyTagsToBackups: "true"
+  perUnitStorageThroughput: "250"
+  dataCompressionType: "LZ4"
+  fileSystemTypeVersion: "2.15"
+mountOptions:
+  - flock
diff --git a/3.test_cases/post-training/rlvr/img/ray-dashboard.png b/3.test_cases/post-training/rlvr/img/ray-dashboard.png
diff --git a/3.test_cases/post-training/rlvr/job-stop.sh b/3.test_cases/post-training/rlvr/job-stop.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+./ray-expose.sh
+
+# Check if the submission_id is passed as an argument
+if [ -z "$1" ]; then
+    echo "Error: No submission_id provided."
+    echo "Usage: ./job-status.sh <submission_id>"
+    echo "List of jobs to choose from:"
+    echo ""
+    ray job list --address http://localhost:8266 | sed -n "s/.*submission_id='\([^']*\)'.*entrypoint='\([^']*\)'.*/submission_id: \1, entrypoint: \2/p"
+    echo -e "\n"
+    exit 1
+fi
+
+# Assign the user's input to a variable
+submission_id=$(ray job list --address http://localhost:8266 | sed -n "s/.*submission_id='\([^']*\)'.*entrypoint='\([^']*\)'.*/submission_id: \1, entrypoint: \2/p" | grep $1 | head -n 1 | cut -d ' ' -f 2 | cut -d ',' -f 1)
+
+# submission_id=$1
+
+CMD="ray job stop --address http://localhost:8266 $submission_id"
+
+if [ ! "$VERBOSE" == "false" ]; then echo -e "\n${CMD}\n"; fi
+eval "$CMD"
+
+echo -e "\n"