diff --git a/dockerfiles/llmbench/vllm-workspace/bench.sh b/dockerfiles/llmbench/vllm-workspace/bench.sh
index 66323fe..5be5141 100644
--- a/dockerfiles/llmbench/vllm-workspace/bench.sh
+++ b/dockerfiles/llmbench/vllm-workspace/bench.sh
@@ -9,8 +9,8 @@ declare -a isl_list=(3000 1000 500)
 declare -a osl_list=(150 1000 1000)
 
 for i in "${!isl_list[@]}"; do
-    ISL=${isl_list[$i]}
-    OSL=${osl_list[$i]}
+    isl=${isl_list[$i]}
+    osl=${osl_list[$i]}
 
     echo "======================================================"
-    echo " Running benchmark with 'ISL:OSL=${ISL}:${OSL}' "
+    echo " Running benchmark with 'ISL:OSL=${isl}:${osl}' "
     echo " OpenAI Server 'http://${HOST}:${PORT}' "
@@ -18,7 +18,13 @@ for i in "${!isl_list[@]}"; do
     echo " Tokenizer '${TOKENIZER}' "
     echo "======================================================"
 
-    ISL=${ISL} OSL=${OSL} bash benchmark_serving_concurrency.sh
+    bash benchmark_serving_concurrency_list.sh \
+        --host ${HOST} \
+        --port ${PORT} \
+        --model ${MODEL} \
+        --tokenizer ${TOKENIZER} \
+        -isl ${isl} \
+        -osl ${osl}
 
     echo ""
     echo ""
diff --git a/dockerfiles/llmbench/vllm-workspace/benchmark_cron.sh b/dockerfiles/llmbench/vllm-workspace/benchmark_cron.sh
new file mode 100644
index 0000000..9620eff
--- /dev/null
+++ b/dockerfiles/llmbench/vllm-workspace/benchmark_cron.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+
+# Default values
+PEAK_HOURS="9-10,13-15"
+MAX_CONCURRENCY="256"
+MIN_CONCURRENCY="16"
+HOST="localhost"
+PORT="8000"
+MODEL="deepseek-ai/DeepSeek-R1"
+TOKENIZER="/workspace/tokenizer/${MODEL}"
+ISL="1000"
+OSL="1000"
+TIMEZONE="$(date +%Z)"  # Default to system timezone
+OUTPUT_RESULT_FILE="benchmark_result_${ISL}_${OSL}.md"
+
+# Signal handling variables
+should_exit=0
+
+# Function to handle SIGINT (Ctrl+C)
+handle_sigint() {
+    echo -e "\nReceived interrupt signal (Ctrl+C), exiting gracefully..."
+    should_exit=1
+}
+
+# Trap SIGINT signal
+trap handle_sigint SIGINT
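+
+# Note: the trap only sets a flag instead of exiting immediately, so an
+# in-flight benchmark run is allowed to finish; the main loop at the bottom
+# of this script checks `should_exit` before starting the next run.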
+
+# Function to print help message
+print_help() {
+    echo "Usage: $0 [OPTIONS]"
+    echo
+    echo "Examples: $0 \\"
+    echo "    --host localhost \\"
+    echo "    --port 8000 \\"
+    echo "    --model Qwen/Qwen3-32B \\"
+    echo "    --tokenizer /workspace/tokenizer/Qwen/Qwen3-32B \\"
+    echo "    --peak-hours 9-10,13-15 \\"
+    echo "    --timezone Asia/Shanghai"
+    echo
+    echo "Options:"
+    echo "  --peak-hours               Comma-separated peak hour ranges, e.g. 9-10,13-15 (default: ${PEAK_HOURS})"
+    echo "  -max, --max-concurrency    Base concurrency during peak hours (default: ${MAX_CONCURRENCY})"
+    echo "  -min, --min-concurrency    Base concurrency during off-peak hours (default: ${MIN_CONCURRENCY})"
+    echo "  --timezone                 Timezone for hour calculation (default: ${TIMEZONE})"
+    echo "  --host                     Target host for inference requests (default: ${HOST})"
+    echo "  --port                     Target port for inference requests (default: ${PORT})"
+    echo "  --model                    Model ID to benchmark (default: ${MODEL})"
+    echo "  --tokenizer                Tokenizer path (default: ${TOKENIZER})"
+    echo "  -isl, --input-seq-len      Input sequence length (default: ${ISL})"
+    echo "  -osl, --output-seq-len     Output sequence length (default: ${OSL})"
+    echo "  -h, --help                 Show this help message and exit"
+    exit 0
+}
+
+# Function to get the current hour (0-23) in the configured timezone (TZ)
+get_current_hour() {
+    date +%-H
+}
+
+# Function to check if the given hour is within any peak range
+is_peak_hour() {
+    local current_hour=$1
+    for range in "${PEAK_RANGES[@]}"; do
+        IFS='-' read -r start end <<< "${range}"
+        if [[ ${current_hour} -ge ${start} && ${current_hour} -le ${end} ]]; then
+            return 0
+        fi
+    done
+    return 1
+}
+
+# Function to calculate the circular (wrap-around midnight) distance, in hours,
+# to the nearest peak-range boundary
+calculate_distance_to_peak() {
+    local current_hour=$1
+    local min_distance=24  # Initialize with maximum possible distance
+
+    for range in "${PEAK_RANGES[@]}"; do
+        IFS='-' read -r peak_start peak_end <<< "${range}"
+
+        # Distance to the peak period start, in both directions around the clock
+        x=${peak_start}
+        y=${current_hour}
+        distance_to_start_1=$(( ($y - $x + 24) % 24 ))
+        distance_to_start_2=$(( ($x - $y + 24) % 24 ))
+        if [[ ${distance_to_start_1} -lt ${distance_to_start_2} ]]; then
+            distance_to_start=${distance_to_start_1}
+        else
+            distance_to_start=${distance_to_start_2}
+        fi
+
+        # Distance to the peak period end, in both directions around the clock
+        x=${peak_end}
+        y=${current_hour}
+        distance_to_end_1=$(( ($y - $x + 24) % 24 ))
+        distance_to_end_2=$(( ($x - $y + 24) % 24 ))
+        if [[ ${distance_to_end_1} -lt ${distance_to_end_2} ]]; then
+            distance_to_end=${distance_to_end_1}
+        else
+            distance_to_end=${distance_to_end_2}
+        fi
+
+        # The actual distance is the minimum of these two values; this covers
+        # cases where the current time is before, during, or after the peak period
+        if [[ ${distance_to_start} -lt ${distance_to_end} ]]; then
+            current_distance=${distance_to_start}
+        else
+            current_distance=${distance_to_end}
+        fi
+
+        # Keep track of the smallest distance across all peak ranges
+        if [[ ${current_distance} -lt ${min_distance} ]]; then
+            min_distance=${current_distance}
+        fi
+    done
+
+    echo ${min_distance}
+}
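+
+# Worked example: with a peak range of 9-10 and current hour 22, the distance
+# to the start hour 9 is min((22-9+24)%24, (9-22+24)%24) = min(13, 11) = 11 and
+# the distance to the end hour 10 is min(12, 12) = 12, so this range reports 11.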
+
+# Function to run the benchmark
+run_benchmark() {
+    local concurrency=$1
+    local temp_dir=$(mktemp -d)
+    local result_file="${temp_dir}/bench_${ISL}_${OSL}_c${concurrency}.json"
+
+    # Initialize the results file once (header + column layout), so rows from
+    # successive runs accumulate instead of overwriting each other
+    if [[ ! -f ${OUTPUT_RESULT_FILE} ]]; then
+        echo "# Benchmark Result (ISL:OSL=${ISL}:${OSL})" > ${OUTPUT_RESULT_FILE}
+        echo "| Concurrency | Request Throughput | Output Throughput | Mean E2EL (ms) | P99 E2EL (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |" >> ${OUTPUT_RESULT_FILE}
+        echo "|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|" >> ${OUTPUT_RESULT_FILE}
+    fi
+
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Running benchmark with concurrency: ${concurrency}"
+    uv run python3 vllm-benchmarks/benchmark_serving.py \
+        --backend openai-chat \
+        --model ${TOKENIZER} \
+        --served-model-name ${MODEL} \
+        --host ${HOST} --port ${PORT} \
+        --endpoint /v1/chat/completions \
+        --dataset-name random \
+        --random_input_len ${ISL} \
+        --random_output_len ${OSL} \
+        --max-concurrency ${concurrency} \
+        --num-prompts $((${concurrency}*10)) \
+        --save-result --result-filename ${result_file} \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --ignore-eos > /dev/null 2>&1
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Finished benchmark with concurrency: ${concurrency}"
+
+    # Parse results into shell variables (rounded to 2 decimals) and append a table row
+    eval $(cat ${result_file} | jq -r '
+        . | {request_throughput, output_throughput, mean_e2el_ms, p99_e2el_ms, mean_ttft_ms, p99_ttft_ms, mean_tpot_ms, p99_tpot_ms}
+        | to_entries[]
+        | "\(.key)=\(.value | if type == "number" then (. * 100 | round) / 100 else . end)"
+    ')
+    echo "| ${concurrency} | ${request_throughput} | ${output_throughput} | ${mean_e2el_ms} | ${p99_e2el_ms} | ${mean_ttft_ms} | ${p99_ttft_ms} | ${mean_tpot_ms} | ${p99_tpot_ms} |" >> ${OUTPUT_RESULT_FILE}
+}
+
+# Function to precompute base concurrency for all hours
+precompute_concurrency() {
+    for hour in {0..23}; do
+        # Check if hour is within any peak range
+        if is_peak_hour ${hour}; then
+            HOURLY_CONCURRENCY[$hour]=${MAX_CONCURRENCY}
+        else
+            distance=$(calculate_distance_to_peak ${hour})
+            # Base concurrency decays linearly with distance from the nearest peak
+            HOURLY_CONCURRENCY[$hour]=$(( ${MIN_CONCURRENCY} + ( (${MAX_CONCURRENCY} - ${MIN_CONCURRENCY}) / 12 * (10 - ${distance}) ) ))
+            # Clamp to the floor so hours far from any peak never drop below MIN_CONCURRENCY
+            if [[ ${HOURLY_CONCURRENCY[$hour]} -lt ${MIN_CONCURRENCY} ]]; then
+                HOURLY_CONCURRENCY[$hour]=${MIN_CONCURRENCY}
+            fi
+        fi
+    done
+
+    # Print hourly concurrency
+    echo "Hourly Base Concurrency Preview:"
+    for hour in {0..23}; do
+        printf "Hour %2d: %3d\n" ${hour} ${HOURLY_CONCURRENCY[${hour}]}
+    done
+}
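+
+# Worked example of the mapping above with the defaults (MIN=16, MAX=256):
+# the per-hour step is (256-16)/12 = 20, so an hour at distance 9 from the
+# nearest peak gets 16 + 20*(10-9) = 36 and an hour at distance 3 gets
+# 16 + 20*7 = 156.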
+
+# Global variables
+declare -a HOURLY_CONCURRENCY
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --timezone)
+            TIMEZONE="$2"
+            shift 2
+            ;;
+        --peak-hours)
+            PEAK_HOURS="$2"
+            shift 2
+            ;;
+        -max|--max-concurrency)
+            MAX_CONCURRENCY="$2"
+            shift 2
+            ;;
+        -min|--min-concurrency)
+            MIN_CONCURRENCY="$2"
+            shift 2
+            ;;
+        --host)
+            HOST="$2"
+            shift 2
+            ;;
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        --model)
+            MODEL="$2"
+            shift 2
+            ;;
+        --tokenizer)
+            TOKENIZER="$2"
+            shift 2
+            ;;
+        -isl|--input-seq-len)
+            ISL="$2"
+            shift 2
+            ;;
+        -osl|--output-seq-len)
+            OSL="$2"
+            shift 2
+            ;;
+        -h|--help)
+            print_help
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Recompute the result file name in case ISL/OSL were overridden above
+OUTPUT_RESULT_FILE="benchmark_result_${ISL}_${OSL}.md"
+
+export TZ=${TIMEZONE}
+
+# Split PEAK_HOURS into an array of ranges
+IFS=',' read -ra PEAK_RANGES <<< "${PEAK_HOURS}"
+
+# Precompute and show hourly concurrency
+precompute_concurrency
+
+# Simulate business tides with dynamic concurrency
+while [[ ${should_exit} -eq 0 ]]; do
+    current_hour=$(get_current_hour)
+    base_concurrency=${HOURLY_CONCURRENCY[${current_hour}]}
+
+    # Add random spikes so the load is not perfectly smooth
+    spike=$((RANDOM % 16))
+    concurrency=$((base_concurrency + spike))
+
+    # Run the benchmark
+    run_benchmark ${concurrency}
+done
diff --git a/dockerfiles/llmbench/vllm-workspace/benchmark_scaling.sh b/dockerfiles/llmbench/vllm-workspace/benchmark_scaling.sh
new file mode 100644
index 0000000..78de56b
--- /dev/null
+++ b/dockerfiles/llmbench/vllm-workspace/benchmark_scaling.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+# Default values
+INITIAL_CONCURRENCY=16
+MAX_CONCURRENCY=256
+INTERVAL_SECONDS=60
+HOST="localhost"
+PORT="8000"
+MODEL="deepseek-ai/DeepSeek-R1"
+TOKENIZER="/workspace/tokenizer/${MODEL}"
+ISL="1000"
+OSL="1000"
+MAX_CONCURRENCY_LOOP=3
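+
+# Example invocation (values are illustrative):
+#   bash benchmark_scaling.sh --initial-concurrency 16 --max-concurrency 256 \
+#     --max-concurrency-loop 3 --interval 60 --host localhost --port 8000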
+
+# Function to print help message
+print_help() {
+    echo "Usage: $0 [OPTIONS]"
+    echo
+    echo "Options:"
+    echo "  --initial-concurrency     Initial concurrency level (default: ${INITIAL_CONCURRENCY})"
+    echo "  --max-concurrency         Maximum concurrency level (default: ${MAX_CONCURRENCY})"
+    echo "  --max-concurrency-loop    Number of times to repeat max concurrency (default: ${MAX_CONCURRENCY_LOOP})"
+    echo "  --interval                Interval in seconds between concurrency increases (default: ${INTERVAL_SECONDS})"
+    echo "  --host                    Target host for inference requests (default: ${HOST})"
+    echo "  --port                    Target port for inference requests (default: ${PORT})"
+    echo "  --model                   Model ID to benchmark (default: ${MODEL})"
+    echo "  --tokenizer               Tokenizer path (default: ${TOKENIZER})"
+    echo "  -isl, --input-seq-len     Input sequence length (default: ${ISL})"
+    echo "  -osl, --output-seq-len    Output sequence length (default: ${OSL})"
+    echo "  -h, --help                Show this help message and exit"
+    exit 0
+}
+
+# Function to parse command line arguments
+get_options() {
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --initial-concurrency)
+                INITIAL_CONCURRENCY="$2"
+                shift 2
+                ;;
+            --max-concurrency)
+                MAX_CONCURRENCY="$2"
+                shift 2
+                ;;
+            --max-concurrency-loop)
+                MAX_CONCURRENCY_LOOP="$2"
+                shift 2
+                ;;
+            --interval)
+                INTERVAL_SECONDS="$2"
+                shift 2
+                ;;
+            --host)
+                HOST="$2"
+                shift 2
+                ;;
+            --port)
+                PORT="$2"
+                shift 2
+                ;;
+            --model)
+                MODEL="$2"
+                shift 2
+                ;;
+            --tokenizer)
+                TOKENIZER="$2"
+                shift 2
+                ;;
+            -isl|--input-seq-len)
+                ISL="$2"
+                shift 2
+                ;;
+            -osl|--output-seq-len)
+                OSL="$2"
+                shift 2
+                ;;
+            -h|--help)
+                print_help
+                ;;
+            *)
+                echo "Unknown option: $1"
+                exit 1
+                ;;
+        esac
+    done
+}
+
+# Function to log messages
+log() {
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
+}
+
+# Function to run the benchmark
+run_benchmark() {
+    local concurrency=$1
+    local temp_dir=$(mktemp -d)
+    local result_file="${temp_dir}/bench_${ISL}_${OSL}_c${concurrency}.json"
+    local output_result_file="benchmark_result_${ISL}_${OSL}_c${concurrency}.md"
+
+    log "Running benchmark with concurrency: ${concurrency}"
+    uv run python3 vllm-benchmarks/benchmark_serving.py \
+        --backend openai-chat \
+        --model ${TOKENIZER} \
+        --served-model-name ${MODEL} \
+        --host ${HOST} --port ${PORT} \
+        --endpoint /v1/chat/completions \
+        --dataset-name random \
+        --random_input_len ${ISL} \
+        --random_output_len ${OSL} \
+        --max-concurrency ${concurrency} \
+        --num-prompts $((${concurrency} * 10)) \
+        --save-result --result-filename ${result_file} \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --ignore-eos
+
+    # Generate markdown table for benchmark result
+    eval $(cat ${result_file} | jq -r '
+        . | {request_throughput, output_throughput, mean_e2el_ms, p99_e2el_ms, mean_ttft_ms, p99_ttft_ms, mean_tpot_ms, p99_tpot_ms}
+        | to_entries[]
+        | "\(.key)=\(.value | if type == "number" then (. * 100 | round) / 100 else . end)"
+    ')
+    log "Benchmark completed. Results saved to '${output_result_file}'"
+    echo "# Benchmark Result (ISL:OSL=${ISL}:${OSL})
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms) | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:---------------------------:|:-------------------------------:|:---------------------:|:-------------------:|:--------------:|:-------------:|:--------------:|:-------------:|
+| ${concurrency} | ${request_throughput} | ${output_throughput} | ${mean_e2el_ms} | ${p99_e2el_ms} | ${mean_ttft_ms} | ${p99_ttft_ms} | ${mean_tpot_ms} | ${p99_tpot_ms} |
+" > ${output_result_file}
+}
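+
+# The main flow below produces a trapezoidal load pattern: concurrency doubles
+# from INITIAL_CONCURRENCY up to MAX_CONCURRENCY, holds at the maximum for
+# MAX_CONCURRENCY_LOOP runs, then halves back down to the initial level.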
end)" -') -echo "[tke-llmbench] save result to '${RESULT_FILENAME}'" -echo "# Benchmark Result +# Default values +HOST="localhost" +PORT="8000" +MODEL="deepseek-ai/DeepSeek-R1" +TOKENIZER="/workspace/tokenizer/${MODEL}" +CONCURRENCY="64" +ISL="1000" +OSL="1000" +# Function to print help message +print_help() { + echo "Usage: $0 [OPTIONS]" + echo + echo "Options:" + echo " --host Target host for inference requests (default: ${HOST})" + echo " --port Target port for inference requests (default: ${PORT})" + echo " --model Model ID to benchmark (default: ${MODEL})" + echo " --tokenizer Tokenizer path (default: ${TOKENIZER})" + echo " -isl, --input-sequence-length Input sequence length (default: ${ISL})" + echo " -osl, --output-sequence-length Output sequence length (default: ${OSL})" + echo " --concurrency Concurrency level (default: ${CONCURRENCY})" + echo " -h, --help Show this help message and exit" + echo + exit 0 +} + +# Function to parse command line arguments +get_options() { + while [[ $# -gt 0 ]]; do + case $1 in + --host) + HOST="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + --model) + MODEL="$2" + shift 2 + ;; + --tokenizer) + TOKENIZER="$2" + shift 2 + ;; + -isl|--input-sequence-length) + ISL="$2" + shift 2 + ;; + -osl|--output-sequence-length) + OSL="$2" + shift 2 + ;; + --concurrency) + CONCURRENCY="$2" + shift 2 + ;; + -h|--help) + print_help + exit 1 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac + done +} + +# Function to log messages +log() { + echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" +} + +run_benchmark() { + local temp_dir=$(mktemp -d) + local result_file="${temp_dir}/bench_${ISL}_${OSL}_c${CONCURRENCY}.json" + local output_result_file="benchmark_result_${ISL}_${OSL}_c${CONCURRENCY}.md" + + + log "Running benchmark with concurrency: ${CONCURRENCY}" + uv run python3 vllm-benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model ${TOKENIZER} \ + --served-model-name ${MODEL} \ + --host ${HOST} --port ${PORT} \ + --endpoint /v1/chat/completions \ + --dataset-name random \ + --random_input_len ${ISL} \ + --random_output_len ${OSL} \ + --max-concurrency ${CONCURRENCY} \ + --num-prompts $(($CONCURRENCY*10)) \ + --save-result --result-filename ${result_file} \ + --percentile-metrics ttft,tpot,itl,e2el \ + --ignore-eos + + # generate markdown table for benchmark result + eval $(cat ${result_file} | jq -r ' + . | {request_throughput, output_throughput, mean_e2el_ms, p99_e2el_ms, mean_ttft_ms, p99_ttft_ms, mean_tpot_ms, p99_tpot_ms} + | to_entries[] + | "\(.key)=\(.value | if type == "number" then (. * 100 | round) / 100 else . end)" + ') + log "Benchmark completed. 
+
+# Function to print help message
+print_help() {
+    echo "Usage: $0 [OPTIONS]"
+    echo
+    echo "Options:"
+    echo "  --host                            Target host for inference requests (default: ${HOST})"
+    echo "  --port                            Target port for inference requests (default: ${PORT})"
+    echo "  --model                           Model ID to benchmark (default: ${MODEL})"
+    echo "  --tokenizer                       Tokenizer path (default: ${TOKENIZER})"
+    echo "  -isl, --input-sequence-length     Input sequence length (default: ${ISL})"
+    echo "  -osl, --output-sequence-length    Output sequence length (default: ${OSL})"
+    echo "  --concurrency                     Concurrency level (default: ${CONCURRENCY})"
+    echo "  -h, --help                        Show this help message and exit"
+    echo
+    exit 0
+}
+
+# Function to parse command line arguments
+get_options() {
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --host)
+                HOST="$2"
+                shift 2
+                ;;
+            --port)
+                PORT="$2"
+                shift 2
+                ;;
+            --model)
+                MODEL="$2"
+                shift 2
+                ;;
+            --tokenizer)
+                TOKENIZER="$2"
+                shift 2
+                ;;
+            -isl|--input-sequence-length)
+                ISL="$2"
+                shift 2
+                ;;
+            -osl|--output-sequence-length)
+                OSL="$2"
+                shift 2
+                ;;
+            --concurrency)
+                CONCURRENCY="$2"
+                shift 2
+                ;;
+            -h|--help)
+                print_help
+                ;;
+            *)
+                echo "Unknown option: $1"
+                exit 1
+                ;;
+        esac
+    done
+}
+
+# Function to log messages
+log() {
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
+}
+
+run_benchmark() {
+    local temp_dir=$(mktemp -d)
+    local result_file="${temp_dir}/bench_${ISL}_${OSL}_c${CONCURRENCY}.json"
+    local output_result_file="benchmark_result_${ISL}_${OSL}_c${CONCURRENCY}.md"
+
+    log "Running benchmark with concurrency: ${CONCURRENCY}"
+    uv run python3 vllm-benchmarks/benchmark_serving.py \
+        --backend openai-chat \
+        --model ${TOKENIZER} \
+        --served-model-name ${MODEL} \
+        --host ${HOST} --port ${PORT} \
+        --endpoint /v1/chat/completions \
+        --dataset-name random \
+        --random_input_len ${ISL} \
+        --random_output_len ${OSL} \
+        --max-concurrency ${CONCURRENCY} \
+        --num-prompts $(($CONCURRENCY*10)) \
+        --save-result --result-filename ${result_file} \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --ignore-eos
+
+    # Generate markdown table for benchmark result
+    eval $(cat ${result_file} | jq -r '
+        . | {request_throughput, output_throughput, mean_e2el_ms, p99_e2el_ms, mean_ttft_ms, p99_ttft_ms, mean_tpot_ms, p99_tpot_ms}
+        | to_entries[]
+        | "\(.key)=\(.value | if type == "number" then (. * 100 | round) / 100 else . end)"
+    ')
+    log "Benchmark completed. Results saved to '${output_result_file}'"
+    echo "# Benchmark Result (ISL:OSL=${ISL}:${OSL})
-| Concurrency | Reqeuest Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Lantency (ms) | P99 E2E Lantency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
-|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms) | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:---------------------------:|:-------------------------------:|:---------------------:|:-------------------:|:--------------:|:-------------:|:--------------:|:-------------:|
 | ${CONCURRENCY} | ${request_throughput} | ${output_throughput} | ${mean_e2el_ms} | ${p99_e2el_ms} | ${mean_ttft_ms} | ${p99_ttft_ms} | ${mean_tpot_ms} | ${p99_tpot_ms} |
-" > ${RESULT_FILENAME}
+" > ${output_result_file}
+}
+
+# Main script execution
+main() {
+    # Parse command line arguments
+    get_options "$@"
+
+    # Run the benchmark with the given concurrency
+    run_benchmark
+}
+
+# Execute the main function
+main "$@"
\ No newline at end of file
diff --git a/dockerfiles/llmbench/vllm-workspace/benchmark_serving_concurrency.sh b/dockerfiles/llmbench/vllm-workspace/benchmark_serving_concurrency.sh
deleted file mode 100644
index a12328d..0000000
--- a/dockerfiles/llmbench/vllm-workspace/benchmark_serving_concurrency.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-HOST=${HOST:-"localhost"}
-PORT=${PORT:-"8000"}
-MODEL=${MODEL:-"deepseek-ai/DeepSeek-R1"}
-TOKENIZER=${TOKENIZER:-"/workspace/tokenizer/${MODEL}"}
-ISL=${ISL:-"1000"}
-OSL=${OSL:-"1000"}
-OUTPUT_RESULT_FILE=${OUTPUT_RESULT_FILE:-"benchmark_result_${ISL}_${OSL}.md"}
-
-BENCH_LOOP=${BENCH_LOOP:-"1"}
-
-mkdir -p /tmp/llmbench
-echo "[tke-llmbench] the result will save to '${OUTPUT_RESULT_FILE}'"
-echo "# Benchmark Result (ISL:OSL=${ISL}:${OSL})
-
-| Concurrency | Reqeuest Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Lantency (ms) | P99 E2E Lantency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
-|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|" > ${OUTPUT_RESULT_FILE}
-
-for i in {1..${BENCH_LOOP}}; do
-    for concurrency in 1 2 4 8 16 32 64 128; do
-        set -x
-        uv run python3 vllm-benchmarks/benchmark_serving.py \
-            --backend openai-chat \
-            --model ${TOKENIZER} \
-            --served-model-name ${MODEL} \
-            --host ${HOST} --port ${PORT} \
-            --endpoint /v1/chat/completions \
-            --dataset-name random \
-            --random_input_len ${ISL} \
-            --random_output_len ${OSL} \
-            --max-concurrency ${concurrency} \
-            --num-prompts $((${concurrency}*10)) \
-            --save-result --result-filename /tmp/llmbench/bench_${ISL}_${OSL}_c${concurrency}.json \
-            --percentile-metrics ttft,tpot,itl,e2el \
-            --ignore-eos
-        set +x
-
-        eval $(cat /tmp/llmbench/bench_${ISL}_${OSL}_c${concurrency}.json | jq -r '
-        . | {request_throughput, output_throughput, mean_e2el_ms, p99_e2el_ms, mean_ttft_ms, p99_ttft_ms, mean_tpot_ms, p99_tpot_ms}
-        | to_entries[]
-        | "\(.key)=\(.value | if type == "number" then (. * 100 | round) / 100 else . end)"
-        ')
-        echo "| ${concurrency} | ${request_throughput} | ${output_throughput} | ${mean_e2el_ms} | ${p99_e2el_ms} | ${mean_ttft_ms} | ${p99_ttft_ms} | ${mean_tpot_ms} | ${p99_tpot_ms} |" >> ${OUTPUT_RESULT_FILE}
-    done
-done
-
-cat ${OUTPUT_RESULT_FILE}
\ No newline at end of file
end)" - ') - echo "| ${concurrency} | ${request_throughput} | ${output_throughput} | ${mean_e2el_ms} | ${p99_e2el_ms} | ${mean_ttft_ms} | ${p99_ttft_ms} | ${mean_tpot_ms} | ${p99_tpot_ms} |" >> ${OUTPUT_RESULT_FILE} - done -done - -cat ${OUTPUT_RESULT_FILE} \ No newline at end of file diff --git a/dockerfiles/llmbench/vllm-workspace/benchmark_serving_concurrency_list.sh b/dockerfiles/llmbench/vllm-workspace/benchmark_serving_concurrency_list.sh new file mode 100644 index 0000000..ff31d5f --- /dev/null +++ b/dockerfiles/llmbench/vllm-workspace/benchmark_serving_concurrency_list.sh @@ -0,0 +1,158 @@ +#!/bin/bash +set -euo pipefail + +# Default values +HOST="localhost" +PORT="8000" +MODEL="deepseek-ai/DeepSeek-R1" +TOKENIZER="/workspace/tokenizer/${MODEL}" +ISL="1000" +OSL="1000" +BENCH_LOOP="1" +OUTPUT_RESULT_FILE="benchmark_result_${ISL}_${OSL}.md" +CONCURRENCY_LIST="1,2,4,8,16,32,64,128" + +# Function to print help message +print_help() { + echo "Usage: $0 [OPTIONS]" + echo + echo "Options:" + echo " --host Target host for inference requests (default: ${HOST})" + echo " --port Target port for inference requests (default: ${PORT})" + echo " --model Model ID to benchmark (default: ${MODEL})" + echo " --tokenizer Tokenizer path (default: ${TOKENIZER})" + echo " -isl, --input-sequence-length Input sequence length (default: ${ISL})" + echo " -osl, --output-sequence-length Output sequence length (default: ${OSL})" + echo " --bench-loop Number of benchmark loops (default: ${BENCH_LOOP})" + echo " --concurrency Comma-separated concurrency levels (default: ${CONCURRENCY_LIST})" + echo " -h, --help Show this help message and exit" + echo + exit 0 +} + +# Function to parse command line arguments +get_options() { + while [[ $# -gt 0 ]]; do + case $1 in + --host) + HOST="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + --model) + MODEL="$2" + shift 2 + ;; + --tokenizer) + TOKENIZER="$2" + shift 2 + ;; + -isl|--input-sequence-length) + ISL="$2" + shift 2 + ;; + -osl|--output-sequence-length) + OSL="$2" + shift 2 + ;; + --bench-loop) + BENCH_LOOP="$2" + shift 2 + ;; + --concurrency) + CONCURRENCY_LIST="$2" + shift 2 + ;; + -h|--help) + print_help + exit 1 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac + done + + OUTPUT_RESULT_FILE="benchmark_result_${ISL}_${OSL}.md" +} + + +# Function to log messages +log() { + echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" +} + +# Function to validate concurrency values +validate_concurrency() { + for val in "${concurrency_array[@]}"; do + if ! [[ "$val" =~ ^[0-9]+$ ]] || [ "$val" -le 0 ]; then + log "Error: Invalid concurrency value '$val'. Must be a positive integer." + exit 1 + fi + done +} + +# Function to run the benchmark +run_benchmark() { + local concurrency=$1 + local temp_dir=$(mktemp -d) + local result_file="${temp_dir}/bench_${ISL}_${OSL}_c${concurrency}.json" + + log "Running benchmark with concurrency: ${concurrency}" + set -x + uv run python3 vllm-benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model ${TOKENIZER} \ + --served-model-name ${MODEL} \ + --host ${HOST} --port ${PORT} \ + --endpoint /v1/chat/completions \ + --dataset-name random \ + --random_input_len ${ISL} \ + --random_output_len ${OSL} \ + --max-concurrency ${concurrency} \ + --num-prompts $((${concurrency}*10)) \ + --save-result --result-filename ${result_file} \ + --percentile-metrics ttft,tpot,itl,e2el \ + --ignore-eos + { set +x; } 2>/dev/null + + # Parse and log results + eval $(cat ${result_file} | jq -r ' + . 
+
+# Function to print help message
+print_help() {
+    echo "Usage: $0 [OPTIONS]"
+    echo
+    echo "Options:"
+    echo "  --host                            Target host for inference requests (default: ${HOST})"
+    echo "  --port                            Target port for inference requests (default: ${PORT})"
+    echo "  --model                           Model ID to benchmark (default: ${MODEL})"
+    echo "  --tokenizer                       Tokenizer path (default: ${TOKENIZER})"
+    echo "  -isl, --input-sequence-length     Input sequence length (default: ${ISL})"
+    echo "  -osl, --output-sequence-length    Output sequence length (default: ${OSL})"
+    echo "  --bench-loop                      Number of benchmark loops (default: ${BENCH_LOOP})"
+    echo "  --concurrency                     Comma-separated concurrency levels (default: ${CONCURRENCY_LIST})"
+    echo "  -h, --help                        Show this help message and exit"
+    echo
+    exit 0
+}
+
+# Function to parse command line arguments
+get_options() {
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --host)
+                HOST="$2"
+                shift 2
+                ;;
+            --port)
+                PORT="$2"
+                shift 2
+                ;;
+            --model)
+                MODEL="$2"
+                shift 2
+                ;;
+            --tokenizer)
+                TOKENIZER="$2"
+                shift 2
+                ;;
+            -isl|--input-sequence-length)
+                ISL="$2"
+                shift 2
+                ;;
+            -osl|--output-sequence-length)
+                OSL="$2"
+                shift 2
+                ;;
+            --bench-loop)
+                BENCH_LOOP="$2"
+                shift 2
+                ;;
+            --concurrency)
+                CONCURRENCY_LIST="$2"
+                shift 2
+                ;;
+            -h|--help)
+                print_help
+                ;;
+            *)
+                echo "Unknown option: $1"
+                exit 1
+                ;;
+        esac
+    done
+
+    # Recompute the result file name in case ISL/OSL were overridden above
+    OUTPUT_RESULT_FILE="benchmark_result_${ISL}_${OSL}.md"
+}
+
+# Function to log messages
+log() {
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
+}
+
+# Function to validate concurrency values
+validate_concurrency() {
+    for val in "${concurrency_array[@]}"; do
+        if ! [[ "$val" =~ ^[0-9]+$ ]] || [ "$val" -le 0 ]; then
+            log "Error: Invalid concurrency value '$val'. Must be a positive integer."
+            exit 1
+        fi
+    done
+}
+
+# Function to run the benchmark
+run_benchmark() {
+    local concurrency=$1
+    local temp_dir=$(mktemp -d)
+    local result_file="${temp_dir}/bench_${ISL}_${OSL}_c${concurrency}.json"
+
+    log "Running benchmark with concurrency: ${concurrency}"
+    set -x
+    uv run python3 vllm-benchmarks/benchmark_serving.py \
+        --backend openai-chat \
+        --model ${TOKENIZER} \
+        --served-model-name ${MODEL} \
+        --host ${HOST} --port ${PORT} \
+        --endpoint /v1/chat/completions \
+        --dataset-name random \
+        --random_input_len ${ISL} \
+        --random_output_len ${OSL} \
+        --max-concurrency ${concurrency} \
+        --num-prompts $((${concurrency}*10)) \
+        --save-result --result-filename ${result_file} \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --ignore-eos
+    { set +x; } 2>/dev/null
+
+    # Parse results into shell variables (rounded to 2 decimals) and append a table row
+    eval $(cat ${result_file} | jq -r '
+        . | {request_throughput, output_throughput, mean_e2el_ms, p99_e2el_ms, mean_ttft_ms, p99_ttft_ms, mean_tpot_ms, p99_tpot_ms}
+        | to_entries[]
+        | "\(.key)=\(.value | if type == "number" then (. * 100 | round) / 100 else . end)"
+    ')
+    echo "| ${concurrency} | ${request_throughput} | ${output_throughput} | ${mean_e2el_ms} | ${p99_e2el_ms} | ${mean_ttft_ms} | ${p99_ttft_ms} | ${mean_tpot_ms} | ${p99_tpot_ms} |" >> ${OUTPUT_RESULT_FILE}
+}
+
+# Main script execution
+main() {
+    # Parse command line arguments
+    get_options "$@"
+
+    IFS=',' read -r -a concurrency_array <<< "${CONCURRENCY_LIST}"
+    validate_concurrency
+
+    log "Starting benchmark. Results will be saved to '${OUTPUT_RESULT_FILE}'"
+    echo "# Benchmark Result (ISL:OSL=${ISL}:${OSL})
+
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms) | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:---------------------:|:-------------------:|:--------------:|:-------------:|:--------------:|:-------------:|" > ${OUTPUT_RESULT_FILE}
+
+    for i in $(seq 1 ${BENCH_LOOP}); do
+        log "Benchmark loop ${i}/${BENCH_LOOP}"
+        for concurrency in "${concurrency_array[@]}"; do
+            run_benchmark ${concurrency}
+        done
+    done
+
+    log "Benchmark completed. Results saved to '${OUTPUT_RESULT_FILE}'"
+    cat ${OUTPUT_RESULT_FILE}
+}
+
+# Execute the main function
+main "$@"
\ No newline at end of file
diff --git a/dockerfiles/llmbench/vllm-workspace/sla_benchmark.sh b/dockerfiles/llmbench/vllm-workspace/sla_benchmark.sh
new file mode 100644
index 0000000..2cc6604
--- /dev/null
+++ b/dockerfiles/llmbench/vllm-workspace/sla_benchmark.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+HOST=${HOST:-"localhost"}
+PORT=${PORT:-"8000"}
+MODEL=${MODEL:-"deepseek-ai/DeepSeek-R1"}
+TOKENIZER=${TOKENIZER:-"/workspace/tokenizer/${MODEL}"}
+
+ISL=${ISL:-"2500"}
+OSL=${OSL:-"500"}
+TTFT=${TTFT:-"5000"}
+TPOT=${TPOT:-"100"}
+OUTPUT_RESULT_FILE=${OUTPUT_RESULT_FILE:-"sla_ttft${TTFT}_tpot${TPOT}_benchmark_result_${ISL}_${OSL}.md"}
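+
+# Search strategy: concurrency starts at 1 and doubles until it reaches 64,
+# then grows in steps of 8. The loop stops at the first level whose mean TTFT
+# or mean TPOT exceeds the SLA targets above, so the last row within SLA is
+# the highest sustainable concurrency.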
+
+TEMP_DIR=$(mktemp -d)
+echo "[tke-llmbench] the result will be saved to '${OUTPUT_RESULT_FILE}'"
+echo "# Benchmark Result (ISL:OSL=${ISL}:${OSL})
+
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms) | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:---------------------:|:-------------------:|:--------------:|:-------------:|:--------------:|:-------------:|" > ${OUTPUT_RESULT_FILE}
+
+concurrency=1
+while true; do
+    RESULT_FILE="${TEMP_DIR}/bench_${ISL}_${OSL}_c${concurrency}.json"
+
+    # Use the same benchmark_serving.py flags as the other scripts in this
+    # directory; the TTFT/TPOT SLA thresholds are enforced by the checks
+    # below rather than passed to the benchmark itself.
+    set -x
+    uv run python3 vllm-benchmarks/benchmark_serving.py \
+        --backend openai-chat \
+        --model ${TOKENIZER} \
+        --served-model-name ${MODEL} \
+        --host ${HOST} --port ${PORT} \
+        --endpoint /v1/chat/completions \
+        --dataset-name random \
+        --random_input_len ${ISL} \
+        --random_output_len ${OSL} \
+        --max-concurrency ${concurrency} \
+        --num-prompts $((${concurrency}*10)) \
+        --save-result --result-filename ${RESULT_FILE} \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --ignore-eos
+    set +x
+
+    eval $(cat ${RESULT_FILE} | jq -r '
+    . | {request_throughput, output_throughput, mean_e2el_ms, p99_e2el_ms, mean_ttft_ms, p99_ttft_ms, mean_tpot_ms, p99_tpot_ms}
+    | to_entries[]
+    | "\(.key)=\(.value | if type == "number" then (. * 100 | round) / 100 else . end)"
+    ')
+    echo "| ${concurrency} | ${request_throughput} | ${output_throughput} | ${mean_e2el_ms} | ${p99_e2el_ms} | ${mean_ttft_ms} | ${p99_ttft_ms} | ${mean_tpot_ms} | ${p99_tpot_ms} |" >> ${OUTPUT_RESULT_FILE}
+
+    # mean_ttft_ms / mean_tpot_ms are floats, so compare with awk instead of
+    # the integer-only `-gt` test
+    if awk -v v="${mean_ttft_ms}" -v sla="${TTFT}" 'BEGIN { exit !(v > sla) }'; then
+        echo "[concurrency:${concurrency}] The mean_ttft_ms is ${mean_ttft_ms}, which is greater than SLA ${TTFT} ms."
+        break
+    fi
+    if awk -v v="${mean_tpot_ms}" -v sla="${TPOT}" 'BEGIN { exit !(v > sla) }'; then
+        echo "[concurrency:${concurrency}] The mean_tpot_ms is ${mean_tpot_ms}, which is greater than SLA ${TPOT} ms."
+        break
+    fi
+    echo "[concurrency:${concurrency}] The mean_ttft_ms is ${mean_ttft_ms}, which is within SLA ${TTFT} ms."
+    echo "[concurrency:${concurrency}] The mean_tpot_ms is ${mean_tpot_ms}, which is within SLA ${TPOT} ms."
+
+    if [ ${concurrency} -ge 64 ]; then
+        concurrency=$((${concurrency} + 8))
+    else
+        concurrency=$((${concurrency} * 2))
+    fi
+    echo "Increasing concurrency to ${concurrency}"
+done