Skip to content

Commit a523939

Browse files
Update the disagg multi host sh file to setup the disagg inference in… (#922)
1 parent cb170bc commit a523939

File tree

1 file changed

+92
-10
lines changed

1 file changed

+92
-10
lines changed

examples/disagg/run_disagg_multi_host.sh

Lines changed: 92 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,26 @@
44

55
set -e

echo "--- DEBUG: The HOME variable is set to: $HOME ---"

# Tear down any node containers left over from a previous run before we
# start fresh ones. Word-splitting of $CONTAINERS below is intentional:
# `docker ps -q` emits one container ID per line and both `docker stop`
# and `docker rm` accept them as separate arguments.
CONTAINERS=$(docker ps -a --filter "name=node*" -q)
if [[ -n "$CONTAINERS" ]]; then
  docker stop $CONTAINERS
  docker rm -f $CONTAINERS
fi

# NOTE: There is an odd Ray race condition between `docker build` and
# `ray start` on a single machine. If you hit an error, comment out the
# `_wait_until_pg_ready(current_placement_group)` call in
# tpu-inference/tpu_inference/executors/ray_distributed_executor.py
# and rerun this script.
919

1020
docker image prune -f
1121
docker build -f docker/Dockerfile -t ullm:test .
1222
DOCKER_IMAGE="ullm:test"
1323

1424
HOST_HF_HOME="/mnt/disks/data/hf-docker"
1525
NUM_HOSTS_PER_INSTANCE=4
16-
26+
COMMON_SIDE_PORT=8900
1727
MODEL="Qwen/Qwen3-0.6B"
1828

1929
######## Prefill hosts setup ########
@@ -27,7 +37,7 @@ for port in "${PREFILL_TPU_PORTS[@]}"; do
2737
done
2838
PREFILL_TPU_ADDRS=$(IFS=, ; echo "${PREFILL_TPU_ADDRS[*]}")
2939

30-
PREFILL_RAY_PORT=9100
40+
PREFILL_RAY_PORT=8100
3141

3242
for ((i=0; i<NUM_HOSTS_PER_INSTANCE; i++)); do
3343
tpu_port=${PREFILL_TPU_PORTS[$i]}
@@ -38,8 +48,8 @@ for ((i=0; i<NUM_HOSTS_PER_INSTANCE; i++)); do
3848
DOCKER_CMD="ray start --block --address=127.0.0.1:${PREFILL_RAY_PORT}"
3949
fi
4050

41-
KV_PORT=$((9200 + i))
42-
SIDE_PORT=$((9300 + i))
51+
KV_PORT=$((8200 + i))
52+
SIDE_PORT=$((COMMON_SIDE_PORT + i))
4353

4454
set -x
4555
docker run -d \
@@ -67,25 +77,97 @@ for ((i=0; i<NUM_HOSTS_PER_INSTANCE; i++)); do
6777
-v $HOME/test:/root/test \
6878
-v $HOME/logs:/root/logs \
6979
-v $HOME/vllm:/workspace/vllm \
80+
-v $HOME/tpu-inference:/workspace/tpu_inference \
7081
--entrypoint /bin/bash \
7182
"${DOCKER_IMAGE}" -c "${DOCKER_CMD}"
83+
sleep 2
7284
set +x
7385
done
7486

75-
7687
# Start vllm on host-0
7788

78-
PREFILL_VLLM_PORT="7400"
89+
PREFILL_VLLM_PORT="8400"
7990

8091
set -x
8192
docker exec node-0 /bin/bash -c \
8293
"vllm serve $MODEL \
8394
--port ${PREFILL_VLLM_PORT} \
84-
--gpu-memory-utilization 0.2 \
95+
--gpu-memory-utilization 0.3 \
8596
--tensor-parallel-size 4 \
8697
--kv-transfer-config '{\"kv_connector\":\"TPUConnector\",\"kv_connector_module_path\":\"tpu_inference.distributed.tpu_connector\",\"kv_role\":\"kv_producer\"}' \
8798
> /root/logs/prefill.txt 2>&1 &"
8899
set +x
89100

90101

91-
######## Decode hosts setup ########

# Bring up a second Ray cluster spanning 4 decode hosts (containers
# node-20..node-23), mirroring the prefill setup above but on separate
# TPU chips and a separate Ray port.

DECODE_TPU_PORTS=(9476 9477 9478 9479)
DECODE_TPU_ADDRS=()
for port in "${DECODE_TPU_PORTS[@]}"; do
  DECODE_TPU_ADDRS+=("127.0.0.1:$port")
done
DECODE_TPU_ADDRS=$(IFS=, ; echo "${DECODE_TPU_ADDRS[*]}")

DECODE_RAY_PORT=9100

for ((i=0; i<NUM_HOSTS_PER_INSTANCE; i++)); do
  tpu_port=${DECODE_TPU_PORTS[$i]}
  # Decode hosts use the next block of TPU chips, after the prefill hosts.
  tpu_index=$((i + NUM_HOSTS_PER_INSTANCE))

  # FIX: was `[[ i -eq 0 ]]` — the bare name only worked via implicit
  # arithmetic expansion; expand $i explicitly for clarity and consistency.
  if [[ "$i" -eq 0 ]]; then
    DOCKER_CMD="ray start --block --head --port=${DECODE_RAY_PORT}"
  else
    DOCKER_CMD="ray start --block --address=127.0.0.1:${DECODE_RAY_PORT}"
  fi

  KV_PORT=$((9200 + i))
  # NOTE(review): this yields the same side-channel ports (8900+i) as the
  # prefill hosts on the shared host network — presumably intentional so the
  # consumer can reach the producer's side channel; confirm no bind conflict.
  SIDE_PORT=$((COMMON_SIDE_PORT + i))

  set -x
  # FIX: quote the $HOME bind mounts so paths with spaces do not word-split.
  docker run -d \
    --privileged \
    --network host \
    --shm-size 16G \
    --name "node-2${i}" \
    \
    -e TPU_MULTIHOST_BACKEND="ray" \
    -e TPU_NODE_ID="${i}" \
    -e TPU_KV_TRANSFER_PORT="${KV_PORT}" \
    -e TPU_SIDE_CHANNEL_PORT="${SIDE_PORT}" \
    -e RAY_DEDUP_LOGS="0" \
    -e SKIP_JAX_PRECOMPILE="1" \
    \
    -e TPU_CHIPS_PER_PROCESS_BOUNDS="1,1,1" \
    -e TPU_PROCESS_BOUNDS="2,2,1" \
    -e TPU_VISIBLE_CHIPS="${tpu_index}" \
    -e CLOUD_TPU_TASK_ID="${i}" \
    -e TPU_PROCESS_ADDRESSES="${DECODE_TPU_ADDRS}" \
    -e TPU_PROCESS_PORT="${tpu_port}" \
    \
    -e HF_HOME="/root/hf" \
    -v "${HOST_HF_HOME}:/root/hf" \
    -v "$HOME/test:/root/test" \
    -v "$HOME/logs:/root/logs" \
    -v "$HOME/vllm:/workspace/vllm" \
    -v "$HOME/tpu-inference:/workspace/tpu_inference" \
    --entrypoint /bin/bash \
    "${DOCKER_IMAGE}" -c "${DOCKER_CMD}"
  sleep 2
  set +x
done

# Start vllm on host-20
# Launch the decode vLLM server inside the decode head container (node-20),
# backgrounded inside the container with output captured to /root/logs/decode.txt.

DECODE_VLLM_PORT="9400"

set -x
docker exec node-20 /bin/bash -c \
  "vllm serve $MODEL \
  --port ${DECODE_VLLM_PORT} \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 4 \
  --kv-transfer-config '{\"kv_connector\":\"TPUConnector\",\"kv_connector_module_path\":\"tpu_inference.distributed.tpu_connector\",\"kv_role\":\"kv_consumer\"}' \
  > /root/logs/decode.txt 2>&1 &"
set +x

0 commit comments

Comments
 (0)