File tree Expand file tree Collapse file tree 4 files changed +7
-9
lines changed
recml/inference/benchmarks Expand file tree Collapse file tree 4 files changed +7
-9
lines changed Original file line number Diff line number Diff line change @@ -6,14 +6,14 @@ export XLA_FLAGS=
66
77export TPU_NAME=<TPU_NAME>
88export LEARNING_RATE=0.0034
9- export BATCH_SIZE=135168
9+ export BATCH_SIZE=4224
1010export EMBEDDING_SIZE=128
1111export MODEL_DIR=/tmp/
1212export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
1313export NUM_STEPS=28000
1414export CHECKPOINT_INTERVAL=1500
1515export EVAL_INTERVAL=1500
16- export EVAL_FILE_PATTER =gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
16+ export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
1717export EVAL_STEPS=660
1818export MODE=eval
1919export EMBEDDING_THRESHOLD=21000
@@ -23,7 +23,6 @@ export RESTORE_CHECKPOINT=true
2323
2424
2525python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
26-
2726--learning_rate=${LEARNING_RATE} \
2827--batch_size=${BATCH_SIZE} \
2928--embedding_size=${EMBEDDING_SIZE} \
Original file line number Diff line number Diff line change @@ -6,22 +6,21 @@ export XLA_FLAGS=
66
77export TPU_NAME=<TPU_NAME>
88export LEARNING_RATE=0.0034
9- export BATCH_SIZE=135168
9+ export BATCH_SIZE=4224
1010export EMBEDDING_SIZE=128
1111export MODEL_DIR=/tmp/
1212export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
1313export NUM_STEPS=28000
1414export CHECKPOINT_INTERVAL=1500
1515export EVAL_INTERVAL=1500
16- export EVAL_FILE_PATTER =gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
16+ export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
1717export EVAL_STEPS=660
1818export MODE=train
1919export EMBEDDING_THRESHOLD=21000
2020export LOGGING_INTERVAL=1500
2121export RESTORE_CHECKPOINT=true
2222
2323python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
24-
2524--learning_rate=${LEARNING_RATE} \
2625--batch_size=${BATCH_SIZE} \
2726--embedding_size=${EMBEDDING_SIZE} \
Original file line number Diff line number Diff line change @@ -54,10 +54,10 @@ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${Z
5454gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="pip install -U tensorflow dm-tree flax google-metrax"
5555```
5656
57- #### Run workload
57+ #### Make script executable & Run workload
5858
5959Note: Please update the MODEL_NAME & TASK_NAME before running the below command
6060
6161```
62- gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="TPU_NAME=${TPU_NAME} ./inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
62+ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="cd RecML && chmod +x ./recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME> && TPU_NAME=${TPU_NAME} ./recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
6363```
Original file line number Diff line number Diff line change @@ -63,7 +63,7 @@ platformdirs==4.3.7
6363pluggy == 1.5.0
6464pre-commit == 4.2.0
6565promise == 2.3
66- protobuf == 5.29.4
66+ protobuf == 4.21.12
6767psutil == 7.0.0
6868pyarrow == 19.0.1
6969pygments == 2.19.1
You can’t perform that action at this time.
0 commit comments