@@ -30,24 +30,23 @@ accelerate==0.21.0
 ## Download data file
 ```
 cd /
-export DATA_DISK_DIR=/loadgen_run_data
+export DATA_DISK_DIR=/home/$USER/loadgen_run_data
 mkdir -p ${DATA_DISK_DIR}
 cd ${DATA_DISK_DIR}
 gsutil cp gs://cloud-tpu-inference-public/mlcommons/inference/language/llama2-70b/data/processed-openorca/open_orca_gpt4_tokenized_llama.calibration_1000.pkl .
 mv open_orca_gpt4_tokenized_llama.calibration_1000.pkl processed-calibration-data.pkl

 gsutil cp gs://cloud-tpu-inference-public/mlcommons/inference/language/llama2-70b/data/processed-openorca/open_orca_gpt4_tokenized_llama.sampled_24576.pkl .
 mv open_orca_gpt4_tokenized_llama.sampled_24576.pkl processed-data.pkl
-cd /inference_mlperf4.1
 ```

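The filenames indicate 1,000 calibration rows and 24,576 benchmark rows. A quick sanity check on the downloads (a hedged sketch, not part of the README; it assumes the pickles deserialize as pandas DataFrames, which is how the MLPerf OpenOrca preprocessing stores them):

```
# Hedged sanity check: confirm both pickles load and report their row counts.
cd ${DATA_DISK_DIR}
python3 - <<'EOF'
import pandas as pd  # assumes pandas is installed in this environment
for name in ("processed-calibration-data.pkl", "processed-data.pkl"):
    df = pd.read_pickle(name)
    print(f"{name}: {len(df)} rows")
EOF
```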
-## Install Maxtext
+## Install Maxtext and Jetstream
 ```
 cd /
-git clone git@github.com:google/maxtext.git
-cd maxtext
-git checkout offline_inf
-cd maxtext/MaxText
+git clone git@github.com:AI-Hypercomputer/maxtext.git
+cd /
+git clone git@github.com:AI-Hypercomputer/JetStream.git
+cd /
 ```

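Cloning only fetches the sources; each repo's dependencies still need to be installed before the later steps will run. A minimal sketch, assuming maxtext's bundled setup.sh and a standard editable install for JetStream (verify against each repo's README):

```
# Assumed dependency setup, not shown in the diff.
cd /maxtext && bash setup.sh         # maxtext's bundled dependency installer
cd /JetStream && pip install -e .    # editable install so the mlperf scripts resolve it
```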
 ## Checkpoint generation
@@ -87,9 +86,9 @@ huggingface-cli login

 ## Loadgen settings
 ```
-cd Google/code/llama2-70b/tpu_v5e_8_jetstream_maxtext/scripts/
+cd /home/$USER/Jetstream/benchmarks/mlperf/scripts
 export API_URL=0.0.0.0:9000
-export DATA_DISK_DIR=/loadgen_run_data
+export DATA_DISK_DIR=/home/$USER/loadgen_run_data
 export DATASET_TYPE=full # for calibration run, DATASET_TYPE=calibration

 export MODEL_NAME=llama70b
@@ -99,46 +98,40 @@ export BATCH_SIZE_EXP=8
 export USER_CONFIG=user.conf
 ```

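USER_CONFIG points LoadGen at a user.conf of `<model>.<scenario>.<key> = <value>` overrides. An illustrative file follows; the values are placeholders, not taken from any submission:

```
# Hypothetical user.conf contents; real target_qps is tuned per system.
cat > user.conf <<'EOF'
llama2-70b.Server.target_qps = 10.0
llama2-70b.Server.min_duration = 600000
EOF
```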
-## Offline Setup
-```
-cd /
-git clone git@github.com:google/maxtext.git
-cd maxtext
-git checkout offline_inf
-cd maxtext/MaxText
-
-# For v5e use
-export BATCH_AND_PREFILL_LEN=“256,80|512,40|1024,20”
-
-# For v6e use
-export BATCH_AND_PREFILL_LEN=“256,216|512,108|1024,54”
-export TOKENIZER_PATH=maxtext/assets/tokenizer.llama2
+## Start Jetstream server

-export MAXENGINE_ARGS="model_name=llama2-70b tokenizer_path=${TOKENIZER_PATH} quantization=int8 quantize_kvcache=True load_parameters_path=${SAVE_QUANT_PARAMS_PATH} checkpoint_is_quantized=True compute_axis_order=0,1,2,3 ar_cache_axis_order=0,1,2,3"
+Start the Jetstream server in a terminal.
 ```
-
-## Run offline performance
-
-```
-cd /maxtext/MaxText
-bash ./llama_offline_performance_run.sh
+cd ~/maxtext
+python MaxText/maxengine_server.py MaxText/configs/base.yml tokenizer_path=assets/tokenizer.llama2 load_parameters_path="gs://msingh-bkt/checkpoints/quant_llama2-70b-chat/mlperf_070924/int8_" max_prefill_predict_length=1024 max_target_length=2048 model_name=llama2-70b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 checkpoint_is_quantized=True quantization=int8 quantize_kvcache=True compute_axis_order=0,2,1,3 ar_cache_axis_order=0,2,1,3 enable_jax_profiler=True per_device_batch_size=60 optimize_mesh_for_tpu_v6e=True
 ```

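The command loads the pre-quantized int8 checkpoint (`checkpoint_is_quantized=True quantization=int8`) and also quantizes the KV cache. To script the launch instead of holding a terminal open, one hedged option (`SERVER_ARGS` is a stand-in for the full argument list above; port 9000 matches `API_URL` in the Loadgen settings):

```
# Assumed wrapper, not in the README: background the server and block until
# its gRPC port accepts connections before sending any load.
cd ~/maxtext
nohup python MaxText/maxengine_server.py MaxText/configs/base.yml ${SERVER_ARGS} \
  > ~/jetstream_server.log 2>&1 &
until nc -z 0.0.0.0 9000; do sleep 5; done
echo "JetStream server is up"
```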
-## Run offline accuracy
+Wait until you see the following server logs, which indicate the server is ready to process requests:
 ```
-cd /maxtext/MaxText
-bash ./llama_offline_accuracy_run.sh
-```
-
-## Run offline audit
-```
-cd /maxtext/MaxText
-bash ./llama_offline_audit_run.sh
+Memstats: After load_params:
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_0(process=0,(0,0,0,0))
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_1(process=0,(1,0,0,0))
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_2(process=0,(0,1,0,0))
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_3(process=0,(1,1,0,0))
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_4(process=0,(0,2,0,0))
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_5(process=0,(1,2,0,0))
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_6(process=0,(0,3,0,0))
+Using (GB) 8.1 / 31.25 (25.920000%) on TPU_7(process=0,(1,3,0,0))
+WARNING:root:Initialising driver with 1 prefill engines and 1 generate engines.
+2025-02-10 22:10:34,122 - root - WARNING - Initialising driver with 1 prefill engines and 1 generate engines.
+WARNING:absl:T5 library uses PAD_ID=0, which is different from the sentencepiece vocabulary, which defines pad_id=-1
+2025-02-10 22:10:34,152 - absl - WARNING - T5 library uses PAD_ID=0, which is different from the sentencepiece vocabulary, which defines pad_id=-1
+WARNING:absl:T5 library uses PAD_ID=0, which is different from the sentencepiece vocabulary, which defines pad_id=-1
+2025-02-10 22:10:34,260 - absl - WARNING - T5 library uses PAD_ID=0, which is different from the sentencepiece vocabulary, which defines pad_id=-1
+WARNING:absl:T5 library uses PAD_ID=0, which is different from the sentencepiece vocabulary, which defines pad_id=-1
+2025-02-10 22:10:34,326 - absl - WARNING - T5 library uses PAD_ID=0, which is different from the sentencepiece vocabulary, which defines pad_id=-1
+GC tweaked (allocs, gen1, gen2): 60000 20 30
+2025-02-10 22:10:36.360296: I external/xla/xla/tsl/profiler/rpc/profiler_server.cc:46] Profiler server listening on [::]:9999 selected port:9999
 ```

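Before launching the full benchmark, a single end-to-end request is a cheap smoke test. A sketch using JetStream's bundled requester tool (an assumption; check the tool's flags in your checkout):

```
# Optional smoke test against the running server (assumed tool and flags).
cd /JetStream
python jetstream/tools/requester.py --tokenizer ~/maxtext/assets/tokenizer.llama2
```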
 ## Run server performance
 ```
-cd Google/code/llama2-70b/tpu_v5e_8_jetstream_maxtext/scripts/
+cd /home/$USER/Jetstream/benchmarks/mlperf/scripts
 bash ./generate_server_performance_run.sh
 ```

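When the run completes, LoadGen writes its standard log files; the verdict and throughput live in mlperf_log_summary.txt (the output directory depends on how the script configures logging):

```
# Hedged post-run check using LoadGen's standard summary file names.
grep "Result is" mlperf_log_summary.txt                     # expect: Result is : VALID
grep "Completed samples per second" mlperf_log_summary.txt  # Server scenario throughput
```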