set -e

echo "--- DEBUG: The HOME variable is set to: $HOME ---"

# Stop and remove any leftover node-* containers from a previous run.
# The guard avoids invoking `docker stop` with no arguments when no
# matching containers exist (which would make the script exit under set -e).
CONTAINERS=$(docker ps -a --filter "name=node*" -q)
if [ -n "$CONTAINERS" ]; then
  # $CONTAINERS is intentionally unquoted: it is a whitespace-separated
  # list of container IDs that must word-split into separate arguments.
  docker stop $CONTAINERS
  docker rm -f $CONTAINERS
fi

# NOTE: There appears to be a race condition between `docker build` and
# `ray start` when everything runs on a single machine. If you hit an error
# in _wait_until_pg_ready(current_placement_group)
# (tpu-inference/tpu_inference/executors/ray_distributed_executor.py),
# comment out the lines above and rerun this .sh file.
# Rebuild the test image from the local checkout.
docker image prune -f
docker build -f docker/Dockerfile -t ullm:test .
DOCKER_IMAGE="ullm:test"

HOST_HF_HOME="/mnt/disks/data/hf-docker"  # host-side HuggingFace cache, mounted into every container
NUM_HOSTS_PER_INSTANCE=4                  # containers per (prefill | decode) instance
# Base port for the KV-transfer side channel; each container gets base + i.
COMMON_SIDE_PORT=8900
MODEL="Qwen/Qwen3-0.6B"
1929# ####### Prefill hosts setup ########
@@ -27,7 +37,7 @@ for port in "${PREFILL_TPU_PORTS[@]}"; do
2737done
2838PREFILL_TPU_ADDRS=$( IFS=, ; echo " ${PREFILL_TPU_ADDRS[*]} " )
2939
30- PREFILL_RAY_PORT=9100
40+ PREFILL_RAY_PORT=8100
3141
3242for (( i= 0 ; i< NUM_HOSTS_PER_INSTANCE; i++ )) ; do
3343 tpu_port=${PREFILL_TPU_PORTS[$i]}
@@ -38,8 +48,8 @@ for ((i=0; i<NUM_HOSTS_PER_INSTANCE; i++)); do
3848 DOCKER_CMD=" ray start --block --address=127.0.0.1:${PREFILL_RAY_PORT} "
3949 fi
4050
41- KV_PORT=$(( 9200 + i))
42- SIDE_PORT=$(( 9300 + i))
51+ KV_PORT=$(( 8200 + i))
52+ SIDE_PORT=$(( COMMON_SIDE_PORT + i))
4353
4454 set -x
4555 docker run -d \
@@ -67,25 +77,97 @@ for ((i=0; i<NUM_HOSTS_PER_INSTANCE; i++)); do
6777 -v $HOME /test:/root/test \
6878 -v $HOME /logs:/root/logs \
6979 -v $HOME /vllm:/workspace/vllm \
80+ -v $HOME /tpu-inference:/workspace/tpu_inference \
7081 --entrypoint /bin/bash \
7182 " ${DOCKER_IMAGE} " -c " ${DOCKER_CMD} "
83+ sleep 2
7284 set +x
7385done
7486
# Start vllm (prefill / kv_producer) on the prefill head container, node-0.
# The server is backgrounded inside the container; its output goes to
# /root/logs/prefill.txt on the mounted logs volume.

PREFILL_VLLM_PORT="8400"

set -x
docker exec node-0 /bin/bash -c \
  "vllm serve $MODEL \
  --port ${PREFILL_VLLM_PORT} \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 4 \
  --kv-transfer-config '{\"kv_connector\":\"TPUConnector\",\"kv_connector_module_path\":\"tpu_inference.distributed.tpu_connector\",\"kv_role\":\"kv_producer\"}' \
  > /root/logs/prefill.txt 2>&1 &"
set +x
######## Decode hosts setup ########

# Start a Ray cluster across NUM_HOSTS_PER_INSTANCE decode containers.
# Mirrors the prefill setup but with its own Ray/KV port ranges and the
# second half of the TPU chips.

DECODE_TPU_PORTS=(9476 9477 9478 9479)
DECODE_TPU_ADDRS=()
for port in "${DECODE_TPU_PORTS[@]}"; do
  DECODE_TPU_ADDRS+=("127.0.0.1:$port")
done
DECODE_TPU_ADDRS=$(IFS=,; echo "${DECODE_TPU_ADDRS[*]}")

DECODE_RAY_PORT=9100

for ((i = 0; i < NUM_HOSTS_PER_INSTANCE; i++)); do
  tpu_port=${DECODE_TPU_PORTS[$i]}
  # Decode containers use the second half of the chips (indices 4..7).
  tpu_index=$((i + NUM_HOSTS_PER_INSTANCE))

  # First container is the Ray head; the rest join it.
  if ((i == 0)); then
    DOCKER_CMD="ray start --block --head --port=${DECODE_RAY_PORT}"
  else
    DOCKER_CMD="ray start --block --address=127.0.0.1:${DECODE_RAY_PORT}"
  fi

  KV_PORT=$((9200 + i))
  # NOTE(review): this yields the same side-channel ports (8900 + i) as the
  # prefill containers, and all containers share --network host — presumably
  # intentional so producer and consumer agree on the side channel, but
  # verify there is no bind conflict between the two instances.
  SIDE_PORT=$((COMMON_SIDE_PORT + i))

  set -x
  docker run -d \
    --privileged \
    --network host \
    --shm-size 16G \
    --name "node-2${i}" \
    \
    -e TPU_MULTIHOST_BACKEND="ray" \
    -e TPU_NODE_ID="${i}" \
    -e TPU_KV_TRANSFER_PORT="${KV_PORT}" \
    -e TPU_SIDE_CHANNEL_PORT="${SIDE_PORT}" \
    -e RAY_DEDUP_LOGS="0" \
    -e SKIP_JAX_PRECOMPILE="1" \
    \
    -e TPU_CHIPS_PER_PROCESS_BOUNDS="1,1,1" \
    -e TPU_PROCESS_BOUNDS="2,2,1" \
    -e TPU_VISIBLE_CHIPS="${tpu_index}" \
    -e CLOUD_TPU_TASK_ID="${i}" \
    -e TPU_PROCESS_ADDRESSES="${DECODE_TPU_ADDRS}" \
    -e TPU_PROCESS_PORT="${tpu_port}" \
    \
    -e HF_HOME="/root/hf" \
    -v "${HOST_HF_HOME}:/root/hf" \
    -v "$HOME/test:/root/test" \
    -v "$HOME/logs:/root/logs" \
    -v "$HOME/vllm:/workspace/vllm" \
    -v "$HOME/tpu-inference:/workspace/tpu_inference" \
    --entrypoint /bin/bash \
    "${DOCKER_IMAGE}" -c "${DOCKER_CMD}"
  sleep 2
  set +x
done

# Start vllm (decode / kv_consumer) on the decode head container, node-20.
# Output goes to /root/logs/decode.txt on the mounted logs volume.

DECODE_VLLM_PORT="9400"

set -x
docker exec node-20 /bin/bash -c \
  "vllm serve $MODEL \
  --port ${DECODE_VLLM_PORT} \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 4 \
  --kv-transfer-config '{\"kv_connector\":\"TPUConnector\",\"kv_connector_module_path\":\"tpu_inference.distributed.tpu_connector\",\"kv_role\":\"kv_consumer\"}' \
  > /root/logs/decode.txt 2>&1 &"
set +x
0 commit comments