chore: install tf profiler in tf images (#234)

azhou-determined · web-flow · commit 622d5121f332 · 2023-11-13T11:51:14.000-08:00
diff --git a/Dockerfile-default-cpu b/Dockerfile-default-cpu
@@ -33,6 +33,9 @@ RUN if [ "$TORCHVISION_PIP" ]; then pip install $TORCHVISION_PIP; fi
 ARG TORCH_TB_PROFILER_PIP
 RUN if [ "$TORCH_TB_PROFILER_PIP" ]; then pip install $TORCH_TB_PROFILER_PIP; fi
 
+ARG TF_PROFILER_PIP
+RUN if [ "$TF_PROFILER_PIP" ]; then python -m pip install $TF_PROFILER_PIP; fi
+
 ARG HOROVOD_WITH_TENSORFLOW
 RUN if [ "$HOROVOD_WITH_TENSORFLOW" ]; then export HOROVOD_WITH_TENSORFLOW=$HOROVOD_WITH_TENSORFLOW; fi
 
diff --git a/Dockerfile-default-gpu b/Dockerfile-default-gpu
@@ -41,6 +41,9 @@ RUN if [ "$TF_CUDA_SYM" ]; then ln -s /usr/local/cuda/lib64/libcusolver.so.11 /o
 ARG TORCH_TB_PROFILER_PIP
 RUN if [ "$TORCH_TB_PROFILER_PIP" ]; then python -m pip install $TORCH_TB_PROFILER_PIP; fi
 
+ARG TF_PROFILER_PIP
+RUN if [ "$TF_PROFILER_PIP" ]; then python -m pip install $TF_PROFILER_PIP; fi
+
 ARG TORCH_CUDA_ARCH_LIST
 ARG APEX_GIT
 RUN /tmp/det_dockerfile_scripts/install_apex.sh
diff --git a/Dockerfile-default-rocm b/Dockerfile-default-rocm
@@ -53,6 +53,14 @@ RUN pip install protobuf==3.20.1
 ARG TENSORFLOW_PIP
 RUN if [ "$TENSORFLOW_PIP" ]; then pip install $TENSORFLOW_PIP; fi
 
+# Install the package(s) named by TORCH_TB_PROFILER_PIP; no-op when the build arg is unset or empty.
+ARG TORCH_TB_PROFILER_PIP
+RUN if [ "$TORCH_TB_PROFILER_PIP" ]; then python -m pip install $TORCH_TB_PROFILER_PIP; fi
+
+# Install the package(s) named by TF_PROFILER_PIP; no-op when the build arg is unset or empty.
+ARG TF_PROFILER_PIP
+RUN if [ "$TF_PROFILER_PIP" ]; then python -m pip install $TF_PROFILER_PIP; fi
+
 # Reset these because we set GPU_OPERATIONS later.
 ENV HOROVOD_GPU_BROADCAST=
 ENV HOROVOD_GPU_ALLREDUCE=
diff --git a/Makefile b/Makefile
@@ -179,12 +179,16 @@ build-gpu-cuda-118-base:
 		.
 
 export ROCM50_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_50_PREFIX)pytorch-1.10-tf-2.7-rocm
+export TF_PROFILER_PIP := tensorboard-plugin-profile
+export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1
 
 .PHONY: build-pytorch10-tf27-rocm50
 build-pytorch10-tf27-rocm50:
 	docker build -f Dockerfile-default-rocm \
 		--build-arg BASE_IMAGE="amdih/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.10.0" \
+		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
 		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.7.1" \
+		--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
 		--build-arg HOROVOD_PIP="horovod==0.25.0" \
 		-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
 		-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
@@ -194,7 +198,6 @@ DEEPSPEED_VERSION := 0.8.3
 export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
 export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
 export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
-export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1
 
 # This builds deepspeed environment off of upstream microsoft/DeepSpeed.
 .PHONY: build-deepspeed-gpu
@@ -254,6 +257,7 @@ build-tf28-cpu: build-cpu-py-38-base
 		--platform "$(PLATFORMS)" \
 		--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(CPU_PY_38_BASE_NAME)-$(SHORT_GIT_HASH)" \
 		--build-arg TENSORFLOW_PIP="tensorflow-cpu==2.8.4" \
+		--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
 		--build-arg HOROVOD_PIP="horovod==0.24.2" \
 		--build-arg HOROVOD_WITH_PYTORCH=0 \
 		--build-arg HOROVOD_WITH_MPI="$(HOROVOD_WITH_MPI)" \
@@ -268,6 +272,7 @@ build-tf28-gpu: build-gpu-cuda-112-base
 	docker build -f Dockerfile-default-gpu \
 		--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_112_BASE_NAME)-$(SHORT_GIT_HASH)" \
 		--build-arg TENSORFLOW_PIP="tensorflow==2.8.3" \
+		--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
 		--build-arg HOROVOD_PIP="horovod==0.24.2" \
 		--build-arg HOROVOD_WITH_PYTORCH=0 \
 		-t $(DOCKERHUB_REGISTRY)/$(GPU_TF28_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
@@ -320,6 +325,7 @@ build-tf2-cpu: build-cpu-py-39-base
 	    --platform "$(PLATFORMS)" \
 		--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(CPU_PY_39_BASE_NAME)-$(SHORT_GIT_HASH)" \
 		--build-arg TENSORFLOW_PIP="$(TF2_PIP_CPU)" \
+		--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
 		--build-arg TORCH_PIP="$(TORCH_PIP_CPU)" \
 		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
 		--build-arg HOROVOD_PIP="$(HOROVOD_PIP_COMMAND)" \
@@ -351,6 +357,7 @@ build-tf2-gpu: build-gpu-cuda-113-base
 		--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_113_BASE_NAME)-$(SHORT_GIT_HASH)" \
 		--build-arg TENSORFLOW_PIP="$(TF2_PIP_GPU)" \
 		--build-arg TORCH_PIP="$(TORCH_PIP_GPU)" \
+		--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
 		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
 		--build-arg TORCH_CUDA_ARCH_LIST="3.7;6.0;6.1;6.2;7.0;7.5;8.0" \
 		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \