# simple CreationMixin (#15567) #39
# NOTE(review): the two lines below were GitHub web-viewer boilerplate captured by scraping,
# not part of the workflow; kept as comments so the file parses as YAML.
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below.
# Learn more about bidirectional Unicode characters
| name: Benchmarks | |
| env: | |
| # TODO: this rescheduling makes gpt2, mixtral and llama unjitted slower | |
| # TODO: very slow for llama 70B and resnet training 6 GPU | |
| CAPTURE_PROCESS_REPLAY: "1" | |
| ASSERT_PROCESS_REPLAY: "0" | |
| PYTHONPATH: . | |
| GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| on: | |
| push: | |
| branches: | |
| - master | |
| - update_benchmark | |
| - update_benchmark_staging | |
| workflow_dispatch: | |
| jobs: | |
| # the goal of this test is to replicate a normal person on a laptop running the test | |
| # no process replay, no benchmarks, no CI, just a normal laptop person | |
| # the 3 minute timeout should not be raised | |
| testmacpytest: | |
| name: Mac pytest | |
| env: | |
| CI: "" | |
| CAPTURE_PROCESS_REPLAY: "0" | |
| runs-on: [self-hosted, macOS] | |
| timeout-minutes: 3 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| # brew install uv | |
| - name: setup python environment | |
| run: | | |
| rm -rf /tmp/tinygrad_pytest_ci | |
| uv venv /tmp/tinygrad_pytest_ci | |
| source /tmp/tinygrad_pytest_ci/bin/activate | |
| uv pip install .[testing] | |
| - name: setup staging db | |
| run: | | |
| echo "CACHEDB=/tmp/pytest-db-ci.db" >> $GITHUB_ENV | |
| rm -f /tmp/pytest-db-ci* | |
| - name: Run pytest -nauto | |
| run: | | |
| source /tmp/tinygrad_pytest_ci/bin/activate | |
| pytest -nauto --durations=20 | |
| - name: openpilot compile3 0.10.1 driving_vision | |
| run: FLOAT16=1 DEV=CL IMAGE=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx | |
| testmacbenchmark: | |
| name: Mac Benchmark | |
| env: | |
| # since sudo is required for usbgpu on macos, move the cache to a new location, as some of the files are owned by root | |
| PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache | |
| runs-on: [self-hosted, macOS] | |
| timeout-minutes: 60 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| mkdir -p extra/disassemblers | |
| ln -s ~/tinygrad/extra/disassemblers/applegpu extra/disassemblers/applegpu | |
| ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: python3.11 test/external/process_replay/reset.py | |
| - name: Print macOS version | |
| run: sw_vers | |
| - name: Run Stable Diffusion | |
| run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | |
| - name: Run Stable Diffusion without fp16 | |
| run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | |
| - name: Run Stable Diffusion v2 | |
| # TODO: very slow step time | |
| run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 ASSERT_MIN_STEP_TIME=4500 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | |
| # process replay can't capture this, the graph is too large | |
| - name: Run SDXL | |
| run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=5000 CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | |
| - name: Run model inference benchmark | |
| run: DEV=METAL NOCLANG=1 python3.11 test/external/external_model_benchmark.py | |
| - name: Test speed vs torch | |
| run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py | |
| - name: Test tensor cores | |
| run: DEV=METAL python3.11 test/opt/test_tensor_cores.py | |
| - name: Test AMX tensor cores | |
| run: | | |
| DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_tensor_cores.py | |
| DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_tensor_cores.py | |
| DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx | |
| DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx | |
| - name: Run Tensor Core GEMM (float) | |
| run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py | |
| - name: Run Tensor Core GEMM (half) | |
| run: DEBUG=2 SHOULD_USE_TC=1 HALF=1 python3.11 extra/gemm/simple_matmul.py | |
| - name: Run Tensor Core GEMM (bfloat16) | |
| run: DEBUG=2 SHOULD_USE_TC=1 BFLOAT16=1 python3.11 extra/gemm/simple_matmul.py | |
| - name: Fuzz Padded Tensor Core GEMM | |
| run: DEV=METAL M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py | |
| - name: Run LLaMA | |
| run: | | |
| BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run LLaMA with BEAM | |
| run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run quantized LLaMA | |
| run: | | |
| BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | |
| BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | |
| - name: Run quantized LLaMA3 | |
| run: | | |
| BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | |
| BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | |
| #- name: Run LLaMA 7B on 4 (virtual) GPUs | |
| # run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run GPT2 | |
| run: | | |
| BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | |
| BENCHMARK_LOG=gpt2 JIT=1 ASSERT_MIN_STEP_TIME=13 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run GPT2 w HALF | |
| run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | |
| - name: Run GPT2 w HALF/BEAM | |
| run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | |
| - name: Run OLMoE | |
| run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py | |
| - name: Train MNIST | |
| run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | |
| # NOTE: this is failing in CI. it is not failing on my machine and I don't really have a way to debug it | |
| # the error is "RuntimeError: Internal Error (0000000e:Internal Error)" | |
| #- name: Run 10 CIFAR training steps | |
| # run: BENCHMARK_LOG=cifar_10steps JIT=1 ASSERT_MIN_STEP_TIME=3000 STEPS=10 python3.11 examples/hlb_cifar10.py | |
| #- name: Run 10 CIFAR training steps w HALF | |
| # run: BENCHMARK_LOG=cifar_10steps_half JIT=2 ASSERT_MIN_STEP_TIME=3000 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | |
| #- name: Run 10 CIFAR training steps w BF16 | |
| # run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | |
| # TODO: too slow | |
| # - name: Run 10 CIFAR training steps w winograd | |
| # run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 ASSERT_MIN_STEP_TIME=150 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | |
| - uses: actions/upload-artifact@v7 | |
| with: | |
| name: Speed (Mac) | |
| path: | | |
| onnx_inference_speed.csv | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3.11 process_replay.py | |
| testusbgpu: | |
| name: UsbGPU Benchmark | |
| env: | |
| PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache | |
| runs-on: [self-hosted, macOS] | |
| timeout-minutes: 10 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: Kill stale pids | |
| run: | | |
| PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids | |
| PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids | |
| - name: UsbGPU boot time | |
| run: sudo -E PYTHONPATH=. GMMU=0 DEBUG=2 AM_RESET=1 DEV=AMD AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus | |
| - name: UsbGPU tiny tests | |
| run: sudo -E PYTHONPATH=. GMMU=0 DEV=AMD AMD_IFACE=USB python3.11 test/test_tiny.py | |
| - name: UsbGPU copy speeds | |
| run: sudo -E PYTHONPATH=. GMMU=0 DEV=AMD AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds | |
| #- name: UsbGPU openpilot test | |
| # run: sudo -E PYTHONPATH=. GMMU=0 DEV=AMD AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx | |
| - name: UsbGPU (USB4/TB) boot time | |
| run: PYTHONPATH=. DEBUG=3 DEV=NV:NAK NV_IFACE=PCI time python3.11 test/test_tiny.py TestTiny.test_plus | |
| - name: UsbGPU (USB4/TB) tiny tests | |
| run: PYTHONPATH=. DEV=NV:NAK NV_IFACE=PCI python3.11 test/test_tiny.py | |
| testnvidiabenchmark: | |
| name: tinybox green Benchmark | |
| runs-on: [self-hosted, Linux, tinyboxgreen] | |
| timeout-minutes: 60 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| - name: Print nvidia-smi | |
| run: nvidia-smi | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| ln -s /raid/weights/LLaMA-3 weights/LLaMA-3 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| - name: Run model inference benchmark | |
| run: DEV=NV CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py | |
| - name: Test speed vs torch | |
| run: DEV=NV CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | |
| - name: Test speed vs theoretical | |
| run: DEV=NV IGNORE_BEAM_CACHE=1 CCACHE=0 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 | |
| - name: Test benchmark allreduce | |
| run: DEV=NV python test/external/external_benchmark_multitensor_allreduce.py | |
| - name: Test tensor cores | |
| run: | | |
| DEV=NV ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py | |
| DEV=NV:PTX ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py | |
| - name: Run Tensor Core GEMM (CUDA) | |
| run: | | |
| DEV=CUDA SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| DEV=CUDA SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| DEV=CUDA SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | |
| DEV=CUDA SHOULD_USE_TC=1 FP8E4M3=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| - name: Run Tensor Core GEMM (PTX) | |
| run: DEV=NV:PTX SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| - name: Run Tensor Core GEMM (NV) | |
| run: DEV=NV SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| - name: Test DEV=NV | |
| run: DEBUG=2 DEV=NV python -m pytest -rA test/test_tiny.py | |
| - name: Test DEV=CUDA | |
| run: DEBUG=2 DEV=CUDA python -m pytest -rA test/test_tiny.py | |
| - name: Run Stable Diffusion | |
| run: BENCHMARK_LOG=stable_diffusion DEV=NV python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | |
| # TODO: too slow | |
| # - name: Run SDXL | |
| # run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=2000 CAPTURE_PROCESS_REPLAY=0 DEV=NV CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | |
| - name: Run LLaMA | |
| run: | | |
| BENCHMARK_LOG=llama_nojit DEV=NV JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| BENCHMARK_LOG=llama DEV=NV JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run LLaMA with BEAM | |
| run: BENCHMARK_LOG=llama_beam DEV=NV JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| # - name: Run LLaMA 7B on 4 GPUs | |
| # run: DEV=NV CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | |
| # - name: Run LLaMA 7B on 6 GPUs | |
| # run: DEV=NV CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run LLaMA-3 8B BEAM | |
| run: BENCHMARK_LOG=llama3_beam DEV=NV JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | |
| - name: Run LLaMA-3 8B on 4 GPUs with BEAM | |
| run: BENCHMARK_LOG=llama3_beam_4gpu DEV=NV JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | |
| - name: Run quantized LLaMA3 | |
| run: BENCHMARK_LOG=llama3_fp8 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --temperature 0 --benchmark --quantize fp8 | |
| # - name: Run LLaMA-3 8B on 6 GPUs | |
| # run: DEV=NV CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | |
| # - name: Run LLaMA-2 70B | |
| # run: DEV=NV CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run Mixtral 8x7B | |
| run: time BENCHMARK_LOG=mixtral DEV=NV CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | |
| - name: Run GPT2 | |
| run: | | |
| BENCHMARK_LOG=gpt2_nojit DEV=NV JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | |
| BENCHMARK_LOG=gpt2 DEV=NV JIT=1 ASSERT_MIN_STEP_TIME=4 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run GPT2 w HALF | |
| run: BENCHMARK_LOG=gpt2_half DEV=NV HALF=1 ASSERT_MIN_STEP_TIME=6 python3 examples/gpt2.py --count 10 --temperature 0 --timing | |
| - name: Run GPT2 w HALF/BEAM | |
| run: BENCHMARK_LOG=gpt2_half_beam DEV=NV HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | |
| - uses: actions/upload-artifact@v7 | |
| with: | |
| name: Speed (NVIDIA) | |
| path: | | |
| onnx_inference_speed.csv | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| testmorenvidiabenchmark: | |
| name: tinybox green Training Benchmark | |
| runs-on: [self-hosted, Linux, tinyboxgreen] | |
| timeout-minutes: 60 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| # TODO: too slow | |
| # - name: Fuzz Padded Tensor Core GEMM (NV) | |
| # run: DEV=NV M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py | |
| # TODO: too slow | |
| # - name: Fuzz Padded Tensor Core GEMM (PTX) | |
| # run: DEV=NV:PTX M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py | |
| - name: HEVC Decode Benchmark | |
| run: VALIDATE=1 MAX_FRAMES=100 ASSERT_FPS=1400 JITBEAM=1 DEV=NV PYTHONPATH=. python3 extra/hevc/decode.py | |
| - name: Train MNIST | |
| run: time PYTHONPATH=. DEV=NV TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | |
| - name: Run 10 CIFAR training steps | |
| run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=120 DEV=NV STEPS=10 python3 examples/hlb_cifar10.py | |
| - name: Run 10 CIFAR training steps w HALF | |
| run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=120 DEV=NV STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | |
| - name: Run 10 CIFAR training steps w BF16 | |
| run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=120 DEV=NV STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | |
| # - name: Run 10 CIFAR training steps w winograd | |
| # run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=350 DEV=NV WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | |
| - name: Run full CIFAR training w 1 GPU | |
| run: time BENCHMARK_LOG=cifar DEV=NV DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | |
| - name: Run full CIFAR training steps w 6 GPUS | |
| run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 DEV=NV DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | |
| - name: Run MLPerf resnet eval on training data | |
| run: time BENCHMARK_LOG=resnet_eval DEV=NV MODEL=resnet python3 examples/mlperf/model_eval.py | |
| - name: Run 10 MLPerf ResNet50 training steps (1 gpu) | |
| run: BENCHMARK_LOG=resnet_10steps DEV=NV DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | |
| - name: Run 10 MLPerf ResNet50 training steps (6 gpu) | |
| run: BENCHMARK_LOG=resnet_10steps_6gpu DEV=NV CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | |
| - name: Run 10 MLPerf Bert training steps (6 gpu) | |
| # TODO: remove BERT_LAYERS once scheduler is fast | |
| run: BENCHMARK_LOG=bert_10steps_6gpu DEV=NV CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| testamdbenchmark: | |
| name: tinybox red Benchmark | |
| runs-on: [self-hosted, Linux, tinybox] | |
| timeout-minutes: 60 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| - name: Setcap to python | |
| run: ./extra/amdpci/setup_python_cap.sh | |
| - name: Remove amd modules | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod | |
| - name: Kill stale pids | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids | |
| #- name: Insert amdgpu | |
| # run: sudo modprobe amdgpu | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| ln -s /raid/weights/LLaMA-3 weights/LLaMA-3 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| #- name: setup perflevel | |
| # run: | | |
| # examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh | |
| # rocm-smi | |
| #- name: Show off tinybox | |
| # run: /opt/rocm/bin/rocm-bandwidth-test | |
| # TODO: unstable on AMD | |
| #- name: Run model inference benchmark | |
| # run: LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 NOCLANG=1 python3 test/external/external_model_benchmark.py | |
| # TODO: unstable on AMD | |
| #- name: Test speed vs torch | |
| # run: | | |
| # python3 -c "import torch; print(torch.__version__)" | |
| # LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | |
| - name: Test speed vs theoretical | |
| run: DEV=AMD IGNORE_BEAM_CACHE=1 CCACHE=0 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 | |
| - name: Test tensor cores (no LLVM) | |
| run: DEV=AMD python3 test/opt/test_tensor_cores.py | |
| # TODO: this is flaky | |
| # - name: Test tensor cores AMD:LLVM | |
| # run: DEV=AMD:LLVM python3 test/opt/test_tensor_cores.py | |
| - name: Run Tensor Core GEMM (AMD) | |
| run: | | |
| DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| DEV=AMD SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | |
| - name: Test DEV=AMD | |
| run: DEBUG=2 DEV=AMD python -m pytest -rA test/test_tiny.py | |
| #- name: Test HIP=1 | |
| # run: DEBUG=2 HIP=1 python -m pytest -rA test/test_tiny.py | |
| # TODO: AMD compiler bug causes this to fail | |
| #- name: Fuzz Padded Tensor Core GEMM | |
| # run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py | |
| #- name: Remove amdgpu | |
| # run: sleep 10 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid. | |
| - name: Test AM cold start time | |
| run: time DEV=AMD AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus | |
| - name: Test AM warm start time | |
| run: time DEV=AMD python3 test/test_tiny.py TestTiny.test_plus | |
| - name: Run Stable Diffusion | |
| run: BENCHMARK_LOG=stable_diffusion ASSERT_MIN_STEP_TIME=550 DEV=AMD python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | |
| - name: Run SDXL | |
| run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3200 CAPTURE_PROCESS_REPLAY=0 DEV=AMD python3 examples/sdxl.py --seed 0 --noshow --timing | |
| - name: Run LLaMA 7B | |
| run: | | |
| BENCHMARK_LOG=llama_nojit DEV=AMD JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| BENCHMARK_LOG=llama DEV=AMD JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run LLaMA 7B with BEAM | |
| run: BENCHMARK_LOG=llama_beam DEV=AMD JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | |
| # - name: Run LLaMA 7B on 4 GPUs | |
| # run: DEV=AMD CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | |
| # - name: Run LLaMA 7B on 6 GPUs | |
| # run: DEV=AMD CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run LLaMA-3 8B BEAM | |
| run: BENCHMARK_LOG=llama3_beam DEV=AMD JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | |
| - name: Run LLaMA-3 8B on 4 GPUs with BEAM | |
| run: BENCHMARK_LOG=llama3_beam_4gpu DEV=AMD JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | |
| # - name: Run LLaMA-3 8B on 6 GPUs | |
| # run: DEV=AMD CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | |
| #- name: Restore amdgpu | |
| # run: sudo modprobe amdgpu | |
| # - name: Run LLaMA-2 70B | |
| # run: DEV=AMD CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run Mixtral 8x7B | |
| run: time BENCHMARK_LOG=mixtral DEV=AMD python3 examples/mixtral.py --temperature 0 --count 10 --timing | |
| - name: Run GPT2 | |
| run: | | |
| BENCHMARK_LOG=gpt2_nojit DEV=AMD JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | |
| BENCHMARK_LOG=gpt2 DEV=AMD JIT=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | |
| - name: Run GPT2 w HALF | |
| run: BENCHMARK_LOG=gpt2_half DEV=AMD HALF=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --count 10 --temperature 0 --timing | |
| - name: Run GPT2 w HALF/BEAM | |
| run: BENCHMARK_LOG=gpt2_half_beam DEV=AMD HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| testmoreamdbenchmark: | |
| name: tinybox red Training Benchmark | |
| runs-on: [self-hosted, Linux, tinybox] | |
| timeout-minutes: 60 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| - name: Setcap to python | |
| run: ./extra/amdpci/setup_python_cap.sh | |
| - name: Remove amd modules | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod | |
| - name: Kill stale pids | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| - name: Train MNIST | |
| run: time PYTHONPATH=. DEV=AMD TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | |
| - name: Run 10 CIFAR training steps | |
| run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=200 DEV=AMD STEPS=10 python3 examples/hlb_cifar10.py | |
| - name: Run 10 CIFAR training steps w HALF | |
| run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=230 DEV=AMD STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | |
| # - name: Run 10 CIFAR training steps w BF16 | |
| # run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 DEV=AMD STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | |
| # TODO: too slow | |
| # - name: Run 10 CIFAR training steps w winograd | |
| # run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 DEV=AMD WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | |
| - name: Run full CIFAR training w 1 GPU | |
| run: time BENCHMARK_LOG=cifar DEV=AMD DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | |
| - name: Run full CIFAR training steps w 6 GPUS | |
| run: time BENCHMARK_LOG=cifar_6gpu DEV=AMD DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | |
| # TODO: broken on some of the machines | |
| #- name: Test full tinyfs load | |
| # run: TINYFS_ENDPOINT=10.0.52.11:6767 PYTHONPATH=. python extra/tinyfs/fetch_file.py --hash d734f5e3be9f1e9d863bfaa4fc6c1ef2 --len 175866113 --dest mapping.json --check | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| # MLPerf-style benchmarks on the tinybox red (AMD, up to 6 GPU) runner: resnet eval, short resnet/bert training runs. | |
| testmlperfamdbenchmark: | |
| name: tinybox red MLPerf Benchmark | |
| runs-on: [self-hosted, Linux, tinybox] | |
| timeout-minutes: 60 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| # benchmarks only run on the upstream repo, not forks | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| # grants capabilities to the python binary via setup_python_cap.sh (needed by the userspace AMD driver — see extra/amdpci) | |
| - name: Setcap to python | |
| run: ./extra/amdpci/setup_python_cap.sh | |
| - name: Remove amd modules | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod | |
| - name: Kill stale pids | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids | |
| # reuse weights/datasets already present on the runner instead of re-downloading them | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| # create extra/datasets before linking into it — under bash -e a failed ln would abort the step | |
| mkdir -p extra/datasets | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| # on the staging branch, point CACHEDB at a throwaway db so staging runs don't pollute the shared cache | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| - name: Run MLPerf resnet eval | |
| run: time BENCHMARK_LOG=resnet_eval DEV=AMD MODEL=resnet python3 examples/mlperf/model_eval.py | |
| - name: Run 10 MLPerf ResNet50 training steps (1 gpu) | |
| run: BENCHMARK_LOG=resnet_10steps DEV=AMD DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | |
| - name: Run 10 MLPerf ResNet50 training steps (6 gpu) | |
| run: BENCHMARK_LOG=resnet_10steps_6gpu DEV=AMD CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | |
| - name: Run 10 MLPerf Bert training steps (6 gpu) | |
| # TODO: remove BERT_LAYERS once scheduler is fast | |
| run: BENCHMARK_LOG=bert_10steps_6gpu DEV=AMD CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | |
| # replay kernels captured above against origin/master to catch codegen regressions | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| # openpilot model compile/inference benchmarks on a comma (Qualcomm QCOM) device, plus a quantized MobileNetV2 run on the DSP. | |
| testqualcommbenchmark: | |
| name: comma Benchmark | |
| runs-on: [self-hosted, Linux, comma] | |
| timeout-minutes: 20 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| # benchmarks only run on the upstream repo, not forks | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| # on the staging branch, point CACHEDB at a throwaway db so staging runs don't pollute the shared cache | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| # taskset -c 4-7 pins the run to CPUs 4-7; ASSERT_MIN_STEP_TIME appears to enforce a step-time bound — confirm exact semantics in compile3.py | |
| - name: openpilot compile3 0.11.0 driving_vision | |
| run: BENCHMARK_LOG=openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx | |
| - name: IR3 openpilot compile3 0.11.0 driving_vision | |
| run: BENCHMARK_LOG=ir3_openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM:IR3 FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx | |
| - name: openpilot compile3 0.11.0 driving_policy | |
| run: BENCHMARK_LOG=openpilot_0_11_0_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=3 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_policy.onnx | |
| - name: openpilot compile3 0.11.0 dmonitoring | |
| run: BENCHMARK_LOG=openpilot_0_11_0_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=11 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/dmonitoring_model.onnx | |
| # same vision model at DEBUG=2 and no BENCHMARK_LOG: kernel-level logging only, not recorded as a benchmark | |
| - name: DEBUG=2 openpilot compile3 0.10.1 driving_vision | |
| run: PYTHONPATH="." DEBUG=2 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx | |
| - name: openpilot compile3 0.10.1 driving_vision | |
| run: BENCHMARK_LOG=openpilot_0_10_1_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx | |
| - name: openpilot compile3 0.10.1 driving_policy | |
| run: BENCHMARK_LOG=openpilot_0_10_1_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=3 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_policy.onnx | |
| - name: openpilot compile3 0.10.1 dmonitoring | |
| run: BENCHMARK_LOG=openpilot_0_10_1_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=11 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/dmonitoring_model.onnx | |
| # two-stage: quantize MobileNetV2 on CPU, then benchmark the quantized model on the DSP | |
| - name: benchmark MobileNetV2 on DSP | |
| run: | | |
| # generate quantized weights | |
| ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet | |
| ln -s /data/home/tiny/tinygrad/testsig-*.so . | |
| PYTHONPATH=. CC=clang-19 DEV=CPU QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx | |
| # benchmark on DSP with NOOPT=1, the devectorizer has issues | |
| PYTHONPATH=. CC=clang-19 DEV=DSP NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx | |
| # replay kernels captured above against origin/master to catch codegen regressions | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| # openpilot benchmarks on an AMD GPU attached over USB (AMD_IFACE=USB) driven from a comma4 runner. | |
| # NOTE(review): unlike the other benchmark jobs, this one has no process-replay steps — confirm that is intentional. | |
| testcommausbgpubenchmark: | |
| name: UsbGPU Benchmark (comma) | |
| runs-on: [self-hosted, Linux, comma4] | |
| timeout-minutes: 20 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| # benchmarks only run on the upstream repo, not forks | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| # on the staging branch, point CACHEDB at a throwaway db so staging runs don't pollute the shared cache | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| # ASSERT_MIN_STEP_TIME / ASSERT_MIN_LOAD_TIME appear to enforce timing bounds — confirm semantics in the scripts | |
| - name: openpilot compile3 0.10.1 driving_vision | |
| run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." GMMU=0 DEV=AMD:LLVM AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx | |
| - name: openpilot load_pickle 0.10.1 driving_vision | |
| run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision_load_pickle PYTHONPATH="." GMMU=0 DEV=AMD AMD_IFACE=USB ASSERT_MIN_LOAD_TIME=15 python3 examples/openpilot/load_pickle.py | |
| # AM (userspace AMD driver) benchmarks: driver start time, tensor-core GEMM, copy speeds, CIFAR/bert training, remote serving. | |
| testreddriverbenchmark: | |
| name: AM Benchmark | |
| runs-on: [self-hosted, Linux, tinyboxrandom] | |
| timeout-minutes: 20 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| # benchmarks only run on the upstream repo, not forks | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| # grants capabilities to the python binary via setup_python_cap.sh (needed by the userspace AMD driver — see extra/amdpci) | |
| - name: Setcap to python | |
| run: ./extra/amdpci/setup_python_cap.sh | |
| - name: Remove amd modules | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod | |
| - name: Kill stale pids | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids | |
| # reuse weights/datasets already present on the runner instead of re-downloading them | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| # create extra/datasets before linking into it — under bash -e a failed ln would abort the step | |
| mkdir -p extra/datasets | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| # on the staging branch, point CACHEDB at a throwaway db so staging runs don't pollute the shared cache | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| # AM_RESET=1 forces a full device reset so this measures the cold path | |
| - name: Test driver cold start time | |
| run: time DEBUG=3 DEV=AMD AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus | |
| - name: Test driver warm start time | |
| run: time DEBUG=3 DEV=AMD python3 test/test_tiny.py TestTiny.test_plus | |
| # Fails on 9070 | |
| # - name: Test tensor cores | |
| # run: | | |
| # DEV=AMD python3 test/test_linearizer.py test/opt/test_tensor_cores.py | |
| # DEV=AMD:LLVM python3 test/test_linearizer.py test/opt/test_tensor_cores.py | |
| # DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| - name: Run Tensor Core GEMM (AMD) | |
| run: DEV=AMD SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | |
| - name: Test DEV=AMD | |
| # python3 for consistency with every other invocation in this workflow | |
| run: DEBUG=2 DEV=AMD python3 -m pytest -rA test/test_tiny.py | |
| - name: Test DISK copy time | |
| run: DEV=AMD TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py | |
| - name: Test CPU copy time | |
| run: | | |
| DEV=AMD GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit | |
| DEV=AMD GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit | |
| - name: Run full CIFAR training w 1 GPU | |
| run: time BENCHMARK_LOG=cifar DEV=AMD DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | |
| # - name: Run 10 MLPerf ResNet50 training steps (1 gpu) | |
| # run: BENCHMARK_LOG=resnet_10steps DEV=AMD MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | |
| - name: Run 10 MLPerf Bert training steps (1 gpu) | |
| # TODO: remove BERT_LAYERS once scheduler is fast | |
| run: BENCHMARK_LOG=bert_10steps DEV=AMD CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | |
| # start a remote serve process, run tests against it over REMOTE, then make sure it is killed (pkill both before and after) | |
| - name: Remote | |
| run: | | |
| pkill -f 'extra/remote/serve.py' || true | |
| PYTHONPATH=. python3 extra/remote/serve.py 6482 & | |
| sleep 1 | |
| DEBUG=2 PYTHONPATH=. REMOTE=127.0.0.1:6482 AM_RESET=1 DEV=AMD AMD_IFACE=PCI python3 test/test_tiny.py | |
| DEBUG=2 PYTHONPATH=. REMOTE=127.0.0.1:6482 AM_RESET=1 DEV=AMD AMD_AQL=1 AMD_IFACE=PCI python3 test/test_tiny.py | |
| pkill -f 'extra/remote/serve.py' || true | |
| # replay kernels captured above against origin/master to catch codegen regressions | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| # NV (userspace NVIDIA driver) benchmarks: driver start time, tensor cores, copy speeds, LLaMA-3, CIFAR/resnet/bert training, remote serving. | |
| testgreendriverbenchmark: | |
| name: NV Benchmark | |
| runs-on: [self-hosted, Linux, tinyboxrandom] | |
| timeout-minutes: 20 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| # benchmarks only run on the upstream repo, not forks | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v6 | |
| # grants capabilities to the python binary via setup_python_cap.sh (script is shared with the AMD jobs) | |
| - name: Setcap to python | |
| run: ./extra/amdpci/setup_python_cap.sh | |
| - name: Remove nv modules | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv rmmod | |
| - name: Kill stale pids | |
| run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids | |
| # reuse weights/datasets already present on the runner instead of re-downloading them | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| # create extra/datasets before linking into it — under bash -e a failed ln would abort the step | |
| mkdir -p extra/datasets | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| # on the staging branch, point CACHEDB at a throwaway db so staging runs don't pollute the shared cache | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| - name: Test driver start time | |
| run: time DEBUG=3 DEV=NV python3 test/test_tiny.py TestTiny.test_plus | |
| - name: Test tensor cores | |
| run: DEV=NV ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py | |
| - name: Test DISK copy time | |
| run: DEV=NV TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py | |
| - name: Test CPU copy time | |
| run: | | |
| DEV=NV GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit | |
| DEV=NV GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit | |
| # IGNORE_BEAM_CACHE=1 forces a fresh BEAM search so search time is included in the benchmark | |
| - name: Test LLAMA-3 | |
| run: BENCHMARK_LOG=llama3_beam DEV=NV JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | |
| - name: Run full CIFAR training w 1 GPU | |
| run: time BENCHMARK_LOG=cifar DEV=NV DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | |
| - name: Run 10 MLPerf ResNet50 training steps (1 gpu) | |
| run: BENCHMARK_LOG=resnet_10steps DEV=NV MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | |
| - name: Run 10 MLPerf Bert training steps (1 gpu) | |
| # TODO: remove BERT_LAYERS once scheduler is fast | |
| run: BENCHMARK_LOG=bert_10steps DEV=NV CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | |
| # start a remote serve process, run tests against it over REMOTE, then make sure it is killed (pkill both before and after) | |
| - name: Remote | |
| run: | | |
| pkill -f 'extra/remote/serve.py' || true | |
| PYTHONPATH=. python3 extra/remote/serve.py 6483 & | |
| sleep 1 | |
| DEBUG=2 PYTHONPATH=. REMOTE=127.0.0.1:6483 DEV=NV python3 test/test_tiny.py | |
| pkill -f 'extra/remote/serve.py' || true | |
| # replay kernels captured above against origin/master to catch codegen regressions | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |